Module: HtmlToPlainText
- Included in:
- Premailer
- Defined in:
- lib/premailer/html_to_plain_text.rb
Overview
Support functions for Premailer
Instance Method Summary
-
#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object
Returns the text in UTF-8 format with all HTML tags removed.
Instance Method Details
#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object
Returns the text in UTF-8 format with all HTML tags removed
TODO:
- add support for DL, OL
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
# File 'lib/premailer/html_to_plain_text.rb', line 11

# Converts an HTML fragment into wrapped plain text with all tags removed.
#
# Headings are underlined (H1/H2 are also overlined), links are rewritten as
# "text ( url )", list items become "* " bullets, and paragraphs / <br>s
# become newlines. The result is wrapped via Text::Reform and stripped.
#
# @param html [String] the HTML to convert; it is not modified as long as
#   HTMLEntities#decode returns a new string
# @param line_length [Integer] width used for wrapping and the maximum
#   length of heading rules
# @param from_charset [String] not referenced by this method; retained so
#   existing callers keep working
# @return [String] the plain-text rendering, with surrounding whitespace
#   stripped
#
# TODO: add support for DL, OL
def convert_to_text(html, line_length = 65, from_charset = 'UTF-8')
  # decode HTML entities
  txt = HTMLEntities.new.decode(html)

  # handle headings (H1-H6)
  txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1") # move closing tags to new lines
  txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
    # Grab the captures before the nested gsub! calls overwrite $~.
    hlevel = $1.to_i
    htext = $2

    htext.gsub!(/<br[\s]*\/?>/i, "\n") # handle <br>s
    htext.gsub!(/<\/?[^>]*>/i, '')     # strip tags

    # Rule length is the longest heading line, capped at line_length.
    # String#each was removed in Ruby 1.9; each_line works on 1.8.7 and later.
    hlength = htext.each_line.map { |l| l.strip.length }.max || 0
    hlength = line_length if hlength > line_length

    htext = case hlevel
            when 1 # H1, asterisks above and below
              ('*' * hlength) + "\n" + htext + "\n" + ('*' * hlength)
            when 2 # H2, dashes above and below
              ('-' * hlength) + "\n" + htext + "\n" + ('-' * hlength)
            else   # H3-H6, dashes below
              htext + "\n" + ('-' * hlength)
            end

    "\n\n" + htext + "\n\n"
  end

  # links -- non-greedy and confined to a single <a ...> tag, so several
  # links on one line are each converted instead of being merged into one
  txt.gsub!(/<a\s[^>]*?href="([^"]*)"[^>]*>(.*?)<\/a>/i) do
    $2.strip + ' ( ' + $1.strip + ' )'
  end

  # lists -- TODO: should handle ordered lists
  txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
  # list not followed by a newline
  txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

  # paragraphs and line breaks
  txt.gsub!(/<\/p>/i, "\n\n")
  txt.gsub!(/<br[\/ ]*>/i, "\n")

  # strip remaining tags
  txt.gsub!(/<\/?[^>]*>/, '')

  # wrap text: the template is a run of '[' characters line_length wide
  # (presumably a left-justified Text::Reform block field -- confirm against
  # the Text::Reform docs)
  r = Text::Reform.new(:trim => true,
                       :squeeze => false,
                       :break => Text::Reform.break_wrap)
  txt = r.format(('[' * line_length), txt)

  # remove linefeeds (\r\n and \r -> \n)
  txt.gsub!(/\r\n?/, "\n")

  # strip extra spaces
  txt.gsub!(/(?:\302\240)+/, " ") # runs of UTF-8 non-breaking spaces -> one space
  txt.gsub!(/\n[ \t]+/, "\n")     # space at start of lines
  txt.gsub!(/[ \t]+\n/, "\n")     # space at end of lines

  # no more than two consecutive newlines
  txt.gsub!(/[\n]{3,}/, "\n\n")

  txt.strip
end