Class: MList::Util::HtmlTextExtraction

Inherits:

Object

Object
MList::Util::HtmlTextExtraction

show all

Defined in: lib/mlist/util/email_helpers.rb

Constant Summary collapse

NBSP = We need a way to preserve non-breaking spaces. Hpricot will replace them with ??.chr. We could easily teach it to convert them to spaces instead, but then the Text node would no longer tell us that the space must be kept, and keeping it is what a reader would see when viewing the HTML.

'!!!NBSP!!!'

Instance Method Summary collapse

#execute ⇒ Object
#extract_text_from_children(elem) ⇒ Object
#extract_text_from_node(node) ⇒ Object
#extract_text_from_text_node(node) ⇒ Object
#initialize(html) ⇒ HtmlTextExtraction constructor

A new instance of HtmlTextExtraction.

Constructor Details

#initialize(html) ⇒ `HtmlTextExtraction`

Returns a new instance of HtmlTextExtraction.



12
13
14

# File 'lib/mlist/util/email_helpers.rb', line 12

# Builds the extractor for the given HTML string.
#
# Every '&nbsp;' entity is swapped for the NBSP placeholder before the markup
# is handed to Hpricot, so the placeholder travels through parsing as ordinary
# text. The parsed document is kept in @doc.
def initialize(html)
  protected_html = html.gsub('&nbsp;', NBSP)
  @doc = Hpricot(protected_html)
end

Instance Method Details

#execute ⇒ `Object`

# File 'lib/mlist/util/email_helpers.rb', line 16

# Converts the parsed document into plain text and returns it.
#
# Only top-level children that are elements are rendered; anything else at
# the top level is skipped. Hrefs collected in @anchors while rendering are
# appended as a numbered reference list after a "--" separator. NBSP
# placeholders are turned into ordinary spaces in the returned string only
# (@text itself keeps them).
def execute
  @text = ''
  @anchors = []

  @doc.each_child do |child|
    next unless Hpricot::Elem::Trav === child

    extract_text_from_node(child)
  end

  @text.strip!

  unless @anchors.empty?
    # Reference numbers are 1-based, matching the "[n]" markers in the body.
    footnotes = @anchors.each_with_index.map { |href, idx| "[#{idx + 1}] #{href}" }
    @text << "\n\n--\n#{footnotes.join("\n")}"
  end

  @text.gsub(NBSP, ' ')
end

#extract_text_from_children(elem) ⇒ `Object`

# File 'lib/mlist/util/email_helpers.rb', line 71

# Renders each child of +elem+ in document order.
#
# Text children go to extract_text_from_text_node, element children go to
# extract_text_from_node, and any other kind of child is ignored.
def extract_text_from_children(elem)
  elem.each_child do |child|
    if Hpricot::Text::Trav === child
      extract_text_from_text_node(child)
    elsif Hpricot::Elem::Trav === child
      extract_text_from_node(child)
    end
  end
end

#extract_text_from_node(node) ⇒ `Object`

# File 'lib/mlist/util/email_helpers.rb', line 32

# Appends the plain-text rendering of an element to @text, chosen by the
# element's tag name:
#
# * head           - contributes nothing
# * h1..h6         - inner text followed by a blank line
# * br             - a single newline
# * ol / ul        - one " N. item" / " * item" entry per direct <li> child,
#                    each followed by a blank line
# * strong / em    - inner text wrapped in *...* / _..._
# * dl             - each dt/dd found by traverse_element is rendered in turn
# * a              - link text followed by a "[n]" marker; the href is
#                    recorded in @anchors for the reference list
# * p / dt / dd    - children rendered, trailing whitespace trimmed, then a
#                    blank line
# * anything else  - children rendered with no extra decoration
#
# node - an element responding to #name (plus #inner_text, #[] etc. as the
#        individual branches require).
def extract_text_from_node(node)
  case node.name
  when 'head'
    # Intentionally empty: nothing under <head> is emitted as text.
  when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
    @text << node.inner_text
    @text << "\n\n"
  when 'br'
    @text << "\n"
  when 'ol'
    node.children_of_type('li').each_with_index do |li, i|
      # Strip the item text so markup indentation/newlines inside <li> do not
      # leak into the output; this matches the 'ul' branch below.
      @text << " #{i + 1}. #{li.inner_text.strip}"
      @text << "\n\n"
    end
  when 'ul'
    node.children_of_type('li').each do |li|
      @text << " * #{li.inner_text.strip}"
      @text << "\n\n"
    end
  when 'strong'
    @text << "*#{node.inner_text}*"
  when 'em'
    @text << "_#{node.inner_text}_"
  when 'dl'
    node.traverse_element('dt', 'dd') do |dt_dd|
      extract_text_from_node(dt_dd)
    end
  when 'a'
    href = node['href']
    # An <a> without a usable href (e.g. a named anchor) has no target to
    # list, so it is rendered as plain text: no nil entry in @anchors and no
    # dangling "[n]" marker pointing at an empty reference.
    linked = !href.to_s.strip.empty?
    @anchors << href if linked
    extract_text_from_text_node(node)
    @text << "[#{@anchors.size}]" if linked
  when 'p', 'dt', 'dd'
    extract_text_from_children(node)
    @text.rstrip!
    @text << "\n\n"
  else
    extract_text_from_children(node)
  end
end

#extract_text_from_text_node(node) ⇒ `Object`

# File 'lib/mlist/util/email_helpers.rb', line 82

# Appends the node's inner text to @text after normalising whitespace.
#
# * If @text currently ends with a newline, leading whitespace of the new
#   text is dropped first.
# * Every run of two or more whitespace characters collapses to one space.
# * After that, only the first remaining newline is removed (String#sub, not
#   gsub); any later single newlines are kept as they are.
def extract_text_from_text_node(node)
  raw = node.inner_text
  raw = raw.lstrip if @text.end_with?("\n")
  collapsed = raw.gsub(/\s{2,}/, ' ')
  @text << collapsed.sub(/\n/, '')
end

Class: MList::Util::HtmlTextExtraction

Constant Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ HtmlTextExtraction

Instance Method Details

#execute ⇒ Object

#extract_text_from_children(elem) ⇒ Object

#extract_text_from_node(node) ⇒ Object

#extract_text_from_text_node(node) ⇒ Object

#initialize(html) ⇒ `HtmlTextExtraction`

#execute ⇒ `Object`

#extract_text_from_children(elem) ⇒ `Object`

#extract_text_from_node(node) ⇒ `Object`

#extract_text_from_text_node(node) ⇒ `Object`