Module: HtmlToPlainText

Included in:
Premailer
Defined in:
lib/premailer/html_to_plain_text.rb

Overview

Support functions for Premailer

Instance Method Summary collapse

Instance Method Details

#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object

Returns the text in UTF-8 format with all HTML tags removed

TODO:

- add support for DL, OL


11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# File 'lib/premailer/html_to_plain_text.rb', line 11

# Converts an HTML fragment into wrapped plain text.
#
# Headings are underlined (H1 and H2 are also overlined), links are rewritten
# as "text ( url )", list items become "* " bullets, </p> and <br> become
# newlines, and every remaining tag is dropped. HTML entities are decoded
# only after the tags have been removed, so escaped markup such as
# "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
#
# TODO: add support for DL, OL
#
# @param html [String] markup to convert; the caller's string is not modified
# @param line_length [Integer] wrap width, also the maximum heading rule width
# @param from_charset [String] not referenced by this method; kept so that
#   existing callers passing it keep working
# @return [String] the plain-text rendering with surrounding whitespace stripped
def convert_to_text(html, line_length = 65, from_charset = 'UTF-8')
  r = Text::Reform.new(:trim => true,
                       :squeeze => false,
                       :break => Text::Reform.break_wrap)
  he = HTMLEntities.new

  # work on a copy: everything below edits the string in place
  txt = html.to_s.dup

  # handle headings (H1-H6)
  txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1") # move closing tags to new lines
  txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do |s|
    # read the captures first; the gsub! calls below overwrite the match data
    hlevel = Regexp.last_match(1).to_i
    htext = Regexp.last_match(2)

    htext.gsub!(/<br[\s]*\/?>/i, "\n") # handle <br>s
    htext.gsub!(/<\/?[^>]*>/i, '') # strip tags

    # Rule width = longest heading line, capped at line_length. Lengths are
    # measured on a decoded copy so "&amp;" counts as one character, while
    # htext itself stays encoded (the whole text is decoded once, later).
    hlength = htext.each_line.map { |l| he.decode(l).strip.length }.max.to_i
    hlength = line_length if hlength > line_length

    case hlevel
      when 1   # H1, asterisks above and below
        htext = ('*' * hlength) + "\n" + htext + "\n" + ('*' * hlength)
      when 2   # H2, dashes above and below
        htext = ('-' * hlength) + "\n" + htext + "\n" + ('-' * hlength)
      else     # H3-H6, dashes below
        htext = htext + "\n" + ('-' * hlength)
    end

    "\n\n" + htext + "\n\n"
  end

  # links -- lazy quantifiers keep each <a>...</a> pair separate, and \b
  # stops "<a" from matching tags such as <abbr> or <address>
  txt.gsub!(/<a\b[^>]*?href=\"([^\"]*)\"[^>]*>(.*?)<\/a>/im) do |s|
    Regexp.last_match(2).strip + ' ( ' + Regexp.last_match(1).strip + ' )'
  end

  # lists -- TODO: should handle ordered lists
  txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
  # list not followed by a newline
  txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

  # paragraphs and line breaks
  txt.gsub!(/<\/p>/i, "\n\n")
  txt.gsub!(/<br[\/ ]*>/i, "\n")

  # strip remaining tags
  txt.gsub!(/<\/?[^>]*>/, '')

  # decode HTML entities (after tag stripping, so decoded "<" / ">" are kept)
  txt = he.decode(txt)

  # wrap text
  txt = r.format(('[' * line_length), txt)

  # remove linefeeds (\r\n and \r -> \n)
  txt.gsub!(/\r\n?/, "\n")

  # strip extra spaces
  txt.gsub!(/\302\240+/, " ") # non-breaking spaces -> spaces
  txt.gsub!(/\n[ \t]+/, "\n") # space at start of lines
  txt.gsub!(/[ \t]+\n/, "\n") # space at end of lines

  # no more than two consecutive newlines
  txt.gsub!(/[\n]{3,}/, "\n\n")

  txt.strip
end