Module: HtmlToPlainText
- Included in:
- Premailer
- Defined in:
- lib/premailer/html_to_plain_text.rb
Overview
Support functions for Premailer
Instance Method Summary
-
#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object
Returns the text in UTF-8 format with all HTML tags removed.
-
#word_wrap(txt, line_length) ⇒ Object
Wraps each line of txt that is longer than line_length at whitespace. Taken from Rails' word_wrap helper (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap).
Instance Method Details
#convert_to_text(html, line_length = 65, from_charset = 'UTF-8') ⇒ Object
Returns the text in UTF-8 format with all HTML tags removed
TODO: add support for DL, OL
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
# File 'lib/premailer/html_to_plain_text.rb', line 10
#
# Converts an HTML fragment to wrapped plain text with all tags removed.
#
# Images are replaced by their alt text, links become "text ( url )",
# headings are underlined (H1/H2 also overlined), list items get a "* "
# bullet, and paragraphs / <br> become newlines. HTML entities are decoded
# only after the tags have been stripped, so escaped markup such as
# "&lt;b&gt;" survives as the literal text "<b>".
#
# TODO: add support for DL, OL
#
# @param html [String, nil] markup to convert; it is copied, never modified
# @param line_length [Integer] column at which lines are wrapped and the
#   maximum width of heading rules
# @param from_charset [String] not referenced by this method body; kept so
#   existing callers keep working
# @return [String] the plain-text rendering, stripped of leading/trailing
#   whitespace
def convert_to_text(html, line_length = 65, from_charset = 'UTF-8')
  # Every step below mutates +txt+ in place, so work on a private copy.
  # +to_s+ keeps nil input returning an empty string.
  txt = html.to_s.dup
  he = HTMLEntities.new

  # replace images by their alt attribute (self-closing or not)
  txt.gsub!(/<img.+?alt=\"([^\"]*)\"[^>]*>/i, '\1')
  txt.gsub!(/<img.+?alt='([^\']*)\'[^>]*>/i, '\1')

  # links: "text ( url )", dropping any mailto: prefix
  txt.gsub!(/<a.+?href=\"(mailto:)?([^\"]*)\"[^>]*>((.|\s)+?)<\/a>/i) do
    "#{Regexp.last_match(3).strip} ( #{Regexp.last_match(2).strip} )"
  end
  txt.gsub!(/<a.+?href='(mailto:)?([^\']*)\'[^>]*>((.|\s)+?)<\/a>/i) do
    "#{Regexp.last_match(3).strip} ( #{Regexp.last_match(2).strip} )"
  end

  # handle headings (H1-H6)
  txt.gsub!(/(<\/h[1-6]>)/i, "\n\\1") # move closing tags to new lines
  txt.gsub!(/[\s]*<h([1-6]+)[^>]*>[\s]*(.*)[\s]*<\/h[1-6]+>/i) do
    hlevel = Regexp.last_match(1).to_i
    htext = Regexp.last_match(2)
    htext.gsub!(/<br[\s]*\/?>/i, "\n") # handle <br>s
    htext.gsub!(/<\/?[^>]*>/i, '') # strip tags

    # Width of the rule = longest heading line, capped at line_length.
    # Entities are still encoded at this point, so measure a decoded copy;
    # otherwise "&amp;" would count as five characters.
    hlength = htext.each_line.map { |l| he.decode(l).strip.length }.max || 0
    hlength = line_length if hlength > line_length

    case hlevel
    when 1 # H1, asterisks above and below
      rule = '*' * hlength
      htext = "#{rule}\n#{htext}\n#{rule}"
    when 2 # H2, dashes above and below
      rule = '-' * hlength
      htext = "#{rule}\n#{htext}\n#{rule}"
    else # H3-H6, dashes below
      htext = "#{htext}\n#{'-' * hlength}"
    end

    "\n\n#{htext}\n\n"
  end

  # wrap spans
  txt.gsub!(/(<\/span>)[\s]+(<span)/mi, '\1 \2')

  # lists -- TODO: should handle ordered lists
  txt.gsub!(/[\s]*(<li[^>]*>)[\s]*/i, '* ')
  # list not followed by a newline
  txt.gsub!(/<\/li>[\s]*(?![\n])/i, "\n")

  # paragraphs and line breaks
  txt.gsub!(/<\/p>/i, "\n\n")
  txt.gsub!(/<br[\/ ]*>/i, "\n")

  # strip remaining tags
  txt.gsub!(/<\/?[^>]*>/, '')

  # decode HTML entities -- after tag stripping, so that escaped markup
  # ("&lt;b&gt;") is kept as text instead of being removed as a tag
  txt = he.decode(txt)

  # normalise linefeeds (\r\n and \r -> \n) before wrapping: word_wrap
  # splits on "\n" only, so CR-terminated lines must be converted first
  txt.gsub!(/\r\n?/, "\n")

  txt = word_wrap(txt, line_length)

  # strip extra spaces
  txt.gsub!(/\302\240+/, " ") # non-breaking spaces -> spaces
  txt.gsub!(/\n[ \t]+/, "\n") # space at start of lines
  txt.gsub!(/[ \t]+\n/, "\n") # space at end of lines

  # no more than two consecutive newlines
  txt.gsub!(/[\n]{3,}/, "\n\n")

  # no more than two consecutive spaces
  txt.gsub!(/ {2,}/, " ")

  # word wrapping can break "( url )" across lines; put it back on one line
  txt.gsub!(/\([ \n](http[^)]+)[\n ]\)/) do
    "( #{Regexp.last_match(1)} )"
  end

  txt.strip
end
#word_wrap(txt, line_length) ⇒ Object
Taken from Rails' word_wrap helper (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
97 98 99 100 101 |
# File 'lib/premailer/html_to_plain_text.rb', line 97
#
# Wraps every line of +txt+ that is longer than +line_length+ by breaking it
# at whitespace. Lines that already fit are returned untouched, and a single
# word longer than +line_length+ is left unbroken.
#
# Taken from Rails' word_wrap helper
# (http://api.rubyonrails.org/classes/ActionView/Helpers/TextHelper.html#method-i-word_wrap)
#
# @param txt [String, nil] text to wrap; nil is treated as an empty string
# @param line_length [Integer] maximum number of characters per line
# @return [String] the wrapped text, lines joined with "\n"
def word_wrap(txt, line_length)
  # Build the pattern once instead of re-interpolating it for every line.
  wrap_point = /(.{1,#{line_length}})(\s+|$)/

  txt.to_s.split("\n").map do |line|
    line.length > line_length ? line.gsub(wrap_point, "\\1\n").strip : line
  end.join("\n")
end