Class: Html2Text

Inherits:
Object
  • Object
show all
Defined in:
lib/html2text.rb,
lib/html2text/version.rb

Constant Summary collapse

DO_NOT_TOUCH_WHITESPACE =
'<do-not-touch-whitespace>'
VERSION =
'0.4.0'

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc) ⇒ Html2Text

Returns a new instance of Html2Text.



8
9
10
# File 'lib/html2text.rb', line 8

# Builds a converter around an already-parsed document.
#
# @param doc the document to convert; `.convert` passes in the result of
#   `Nokogiri::HTML`. Stored as-is in `@doc`.
def initialize(doc)
  @doc = doc
end

Instance Attribute Details

#doc ⇒ Object (readonly)

Returns the value of attribute doc.



6
7
8
# File 'lib/html2text.rb', line 6

# Returns the document this converter was constructed with.
#
# @return the value stored in `@doc` by the constructor
def doc
  @doc
end

Class Method Details

.convert(html) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/html2text.rb', line 12

# Converts an HTML string into plain text.
#
# The input is coerced with `to_s`, optionally rewritten to mimic how Office
# documents render, wrapped in a <div> when it carries no <html tag,
# normalised (entities first, then newlines), parsed with Nokogiri and
# finally handed to a new instance for conversion.
#
# @param html the HTML to convert; anything responding to `to_s`
# @return the result of calling `#convert` on the new instance
def self.convert(html)
  markup = html.to_s

  if office_document?(markup)
    # Emulate the CSS rendering of Office documents
    office_rewrites = [
      ['<p class=MsoNormal>', '<br>'],
      ['<o:p>&nbsp;</o:p>', '<br>'],
      ['<o:p></o:p>', '']
    ]
    # Applied strictly in the listed order, one pass per pattern.
    office_rewrites.each { |tag, substitute| markup = markup.gsub(tag, substitute) }
  end

  # Stop Nokogiri from inserting in <p> tags
  markup = "<div>#{markup}</div>" unless markup.include?('<html')

  normalized = fix_newlines(replace_entities(markup))
  new(Nokogiri::HTML(normalized)).convert
end

.fix_newlines(text) ⇒ Object



33
34
35
36
37
# File 'lib/html2text.rb', line 33

# Normalises line endings to "\n".
#
# Windows-style "\r\n" pairs are collapsed first, then any remaining lone
# "\r" is converted, so a "\r\n" pair never turns into two newlines.
#
# @param text [String] text that may contain "\r\n" or "\r" line endings
# @return [String] a new string using only "\n" line endings
def self.fix_newlines(text)
  # Order matters: "\r\n" must be handled before the bare "\r".
  ["\r\n", "\r"].reduce(text) { |normalized, line_ending| normalized.gsub(line_ending, "\n") }
end

.replace_entities(text) ⇒ Object



39
40
41
42
43
# File 'lib/html2text.rb', line 39

# Replaces non-breaking spaces with ordinary spaces and drops zero-width
# non-joiner entities.
#
# Handles the `&nbsp;` entity, the literal U+00A0 character and the
# `&zwnj;` entity, applied in that order with one pass per pattern.
#
# @param text [String] raw HTML text
# @return [String] a new string with the substitutions applied
def self.replace_entities(text)
  substitutions = [
    ['&nbsp;', ' '],
    ["\u00a0", ' '],
    ['&zwnj;', '']
  ]

  substitutions.reduce(text) do |result, (needle, replacement)|
    result.gsub(needle, replacement)
  end
end

Instance Method Details

#convert ⇒ Object



45
46
47
48
49
50
# File 'lib/html2text.rb', line 45

# Renders the wrapped document as text.
#
# Runs `iterate_over` on `doc`, then trims whitespace around line breaks
# and tabs, then passes the result through
# `remove_unnecessary_empty_lines`, and finally strips the whole string.
#
# @return the stripped result of the pipeline above
def convert
  raw_text = iterate_over(doc)
  trimmed_text = remove_leading_and_trailing_whitespace(raw_text)
  compacted_text = remove_unnecessary_empty_lines(trimmed_text)

  compacted_text.strip
end

#remove_leading_and_trailing_whitespace(text) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/html2text.rb', line 54

# Strips spaces and tabs surrounding newlines, and spaces surrounding tabs,
# everywhere except inside protected blocks.
#
# The text is split on DO_NOT_TOUCH_WHITESPACE. Segments at even positions
# are cleaned; segments at odd positions sit between a pair of markers
# (the <pre> blocks) and are passed through untouched. The markers
# themselves are dropped when the segments are joined back together.
#
# @param text [String] text possibly containing DO_NOT_TOUCH_WHITESPACE markers
# @return [String] the rejoined text with the markers removed
def remove_leading_and_trailing_whitespace(text)
  segments = text.split(DO_NOT_TOUCH_WHITESPACE)

  cleaned = segments.each_with_index.map do |segment, position|
    # Odd positions are protected blocks: keep them verbatim.
    next segment if position.odd?

    segment.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
  end

  cleaned.join
end