Class: Html2Text
- Inherits:
-
Object
- Object
- Html2Text
- Defined in:
- lib/html2text.rb,
lib/html2text/version.rb
Constant Summary collapse
- DO_NOT_TOUCH_WHITESPACE =
'<do-not-touch-whitespace>'
- VERSION =
'0.4.0'
Instance Attribute Summary collapse
-
#doc ⇒ Object
readonly
Returns the value of attribute doc.
Class Method Summary collapse
- .convert(html) ⇒ Object
- .fix_newlines(text) ⇒ Object
- .replace_entities(text) ⇒ Object
Instance Method Summary collapse
- #convert ⇒ Object
-
#initialize(doc) ⇒ Html2Text
constructor
A new instance of Html2Text.
- #remove_leading_and_trailing_whitespace(text) ⇒ Object
Constructor Details
#initialize(doc) ⇒ Html2Text
Returns a new instance of Html2Text.
8 9 10 |
# File 'lib/html2text.rb', line 8
#
# Builds a converter around the given document.
#
# @param doc [Object] the document to convert; stored unchanged in +@doc+
def initialize(doc)
  @doc = doc
end
Instance Attribute Details
#doc ⇒ Object (readonly)
Returns the value of attribute doc.
6 7 8 |
# File 'lib/html2text.rb', line 6
#
# Read-only accessor for the document being converted.
#
# @return [Object] the current value of +@doc+
def doc
  @doc
end
Class Method Details
.convert(html) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/html2text.rb', line 12
#
# Converts HTML markup to plain text.
#
# The input is coerced to a String, Office-specific markup is rewritten,
# bare fragments are wrapped in a <div>, entities and newlines are
# normalised, and the parsed document is handed to a new instance.
#
# @param html [#to_s] the markup to convert
# @return [Object] whatever #convert returns for the parsed document
def self.convert(html)
  html = html.to_s

  if office_document?(html)
    # Emulate the CSS rendering of Office documents.
    # Entities are still in their escaped form here (replace_entities runs
    # further down), so the empty Office paragraph is matched as "&nbsp;".
    html = html.gsub('<p class=MsoNormal>', '<br>')
               .gsub('<o:p>&nbsp;</o:p>', '<br>')
               .gsub('<o:p></o:p>', '')
  end

  unless html.include?('<html')
    # Stop Nokogiri from inserting in <p> tags
    html = "<div>#{html}</div>"
  end

  html = fix_newlines(replace_entities(html))
  doc = Nokogiri::HTML(html)

  new(doc).convert
end
.fix_newlines(text) ⇒ Object
33 34 35 36 37 |
# File 'lib/html2text.rb', line 33
#
# Normalises line endings to "\n".
#
# @param text [String] text that may contain "\r\n" or bare "\r" line endings
# @return [String] a copy in which every "\r\n" pair and every remaining
#   "\r" has been replaced by a single "\n"
def self.fix_newlines(text)
  # CRLF pairs go first so that a pair does not turn into two newlines.
  # rubocop:disable Performance/StringReplacement
  text.gsub("\r\n", "\n").gsub("\r", "\n")
  # rubocop:enable Performance/StringReplacement
end
.replace_entities(text) ⇒ Object
39 40 41 42 43 |
# File 'lib/html2text.rb', line 39
#
# Replaces non-breaking spaces with ordinary spaces and drops zero-width
# non-joiner entities.
#
# The entity names are spelled out in escaped form ("&nbsp;", "&zwnj;") so
# that they match the raw markup rather than an already-decoded character.
#
# @param text [String] raw markup
# @return [String] a copy with "&nbsp;" and U+00A0 replaced by " " and
#   "&zwnj;" removed
def self.replace_entities(text)
  # rubocop:disable Performance/StringReplacement
  text.gsub('&nbsp;', ' ').gsub("\u00a0", ' ').gsub('&zwnj;', '')
  # rubocop:enable Performance/StringReplacement
end
Instance Method Details
#convert ⇒ Object
45 46 47 48 49 50 |
# File 'lib/html2text.rb', line 45
#
# Renders +doc+ as text.
#
# The output of +iterate_over+ is passed through
# +remove_leading_and_trailing_whitespace+ and then
# +remove_unnecessary_empty_lines+, and the result is stripped.
#
# @return [Object] the stripped result of the clean-up pipeline
def convert
  output = iterate_over(doc)
  output = remove_leading_and_trailing_whitespace(output)
  output = remove_unnecessary_empty_lines(output)

  output.strip
end
#remove_leading_and_trailing_whitespace(text) ⇒ Object
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# File 'lib/html2text.rb', line 54
#
# Trims spaces and tabs around newlines, and spaces around tabs, everywhere
# except inside regions delimited by DO_NOT_TOUCH_WHITESPACE.
#
# The delimiter itself does not appear in the result: it is consumed by the
# split and nothing is re-inserted when the pieces are joined.
#
# @param text [String] text in which protected regions are wrapped in
#   DO_NOT_TOUCH_WHITESPACE markers
# @return [String] the cleaned text with protected regions left as they were
def remove_leading_and_trailing_whitespace(text)
  # ignore any <pre> blocks, which we don't want to interact with
  segments = text.split(DO_NOT_TOUCH_WHITESPACE)

  cleaned = segments.each_with_index.map do |segment, index|
    # Splitting on the marker alternates regions: even indexes are ordinary
    # text, odd indexes sit between a pair of markers and stay untouched.
    next segment if index.odd?

    segment.gsub(/[ \t]*\n[ \t]*/, "\n").gsub(/ *\t */, "\t")
  end

  cleaned.join
end