Class: Html2Text
- Inherits:
-
Object
- Object
- Html2Text
- Defined in:
- lib/isomorfeus/html2text.rb
Constant Summary collapse
- VERSION =
"0.3.1"
- DO_NOT_TOUCH_WHITESPACE =
"<do-not-touch-whitespace>"
Instance Attribute Summary collapse
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
Class Method Summary collapse
Instance Method Summary collapse
- #convert ⇒ Object
-
#initialize(doc) ⇒ Html2Text
constructor
A new instance of Html2Text.
- #remove_leading_and_trailing_whitespace(text) ⇒ Object
Constructor Details
#initialize(doc) ⇒ Html2Text
Returns a new instance of Html2Text.
6 7 8 |
# File 'lib/isomorfeus/html2text.rb', line 6
# Builds a converter around an already-parsed document.
#
# @param doc the document object that #convert later walks; it is stored
#   as-is and exposed through the read-only #doc attribute.
def initialize(doc)
  @doc = doc
end
Instance Attribute Details
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
4 5 6 |
# File 'lib/isomorfeus/html2text.rb', line 4
# Read-only accessor for the document handed to the constructor.
#
# @return the value of the @doc instance variable (nil if never assigned).
def doc
  @doc
end
Class Method Details
.convert(html) ⇒ Object
10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
# File 'lib/isomorfeus/html2text.rb', line 10
# Converts an HTML string to plain text.
#
# The input is coerced with #to_s, optionally massaged when it looks like an
# Office export, normalised (entities, newlines), parsed with Nokogiri and
# finally handed to a new Html2Text instance.
#
# @param html [#to_s] the HTML markup to convert
# @return the result of Html2Text#convert on the parsed document
def self.convert(html)
  html = html.to_s

  # NOTE(review): is_office_document? is defined outside this view;
  # presumably it detects Microsoft Office generated markup.
  if is_office_document?(html)
    # Emulate the CSS rendering of Office documents. Office marks an empty
    # paragraph as <o:p>&nbsp;</o:p>; accept the entity, a literal
    # non-breaking space, and a plain space so the placeholder is turned
    # into a line break regardless of how the markup was encoded.
    html = html.gsub("<p class=MsoNormal>", "<br>")
               .gsub(/<o:p>(?:&nbsp;|\u00a0| )<\/o:p>/, "<br>")
               .gsub("<o:p></o:p>", "")
  end

  # Stop Nokogiri from inserting in <p> tags
  html = "<div>#{html}</div>" unless html.include?("<html")

  html = fix_newlines(replace_entities(html))
  Html2Text.new(Nokogiri::HTML(html)).convert
end
.fix_newlines(text) ⇒ Object
31 32 33 |
# File 'lib/isomorfeus/html2text.rb', line 31
# Normalises line endings to "\n".
#
# @param text [String] text that may contain "\r\n" or bare "\r"
# @return [String] a new string in which every "\r\n" pair and every
#   remaining "\r" has been replaced by a single "\n"
def self.fix_newlines(text)
  # Collapse CRLF pairs first so they do not become two newlines.
  without_crlf = text.gsub("\r\n", "\n")
  without_crlf.gsub("\r", "\n")
end
.replace_entities(text) ⇒ Object
35 36 37 |
# File 'lib/isomorfeus/html2text.rb', line 35
# Replaces non-breaking spaces with ordinary spaces and removes zero-width
# non-joiners, in both their named-entity and literal character forms.
#
# The patterns are written as entity names / \u escapes so that they cannot
# be silently decoded into invisible characters by tooling.
#
# @param text [String] raw HTML text
# @return [String] a new string with "&nbsp;" and U+00A0 turned into " ",
#   and "&zwnj;" and U+200C removed
def self.replace_entities(text)
  text.gsub("&nbsp;", " ")
      .gsub("\u00a0", " ")
      .gsub("&zwnj;", "")
      .gsub("\u200c", "")
end
Instance Method Details
#convert ⇒ Object
39 40 41 42 43 44 |
# File 'lib/isomorfeus/html2text.rb', line 39
# Renders the wrapped document as text.
#
# Walks #doc with iterate_over, then passes the result through
# remove_leading_and_trailing_whitespace and remove_unnecessary_empty_lines
# before stripping the ends.
#
# @return the stripped text produced by the steps above
def convert
  text = iterate_over(doc)
  text = remove_leading_and_trailing_whitespace(text)
  remove_unnecessary_empty_lines(text).strip
end
#remove_leading_and_trailing_whitespace(text) ⇒ Object
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/isomorfeus/html2text.rb', line 48
# Trims spaces and tabs around newlines, and spaces around tabs, everywhere
# except inside regions delimited by DO_NOT_TOUCH_WHITESPACE.
#
# @param text [String] text in which protected regions are wrapped in
#   DO_NOT_TOUCH_WHITESPACE markers
# @return [String] the cleaned text with the markers removed
def remove_leading_and_trailing_whitespace(text)
  # Splitting on the marker yields alternating segments: even positions lie
  # outside a protected region, odd positions lie inside one (the <pre>
  # blocks we don't want to interact with).
  segments = text.split(DO_NOT_TOUCH_WHITESPACE)

  cleaned = segments.each_with_index.map do |segment, position|
    next segment if position.odd?

    segment.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
  end

  cleaned.join("")
end