Class: InvertedIndex::Parse

Inherits:

Object

Object
InvertedIndex::Parse

show all

Defined in: lib/inverted_index/parse.rb

Instance Attribute Summary collapse

#body ⇒ Object

Returns the value of attribute body.
#doc ⇒ Object

Returns the value of attribute doc.
#html ⇒ Object

Returns the value of attribute html.
#text ⇒ Object

Returns the value of attribute text.
#tokens ⇒ Object

Returns the value of attribute tokens.

Instance Method Summary collapse

#clean(text) ⇒ Object
#initialize(html) ⇒ Parse constructor

A new instance of Parse.
#parse ⇒ Object

Constructor Details

#initialize(html) ⇒ `Parse`

Returns a new instance of Parse.



6
7
8

# File 'lib/inverted_index/parse.rb', line 6

# Builds a parser around a raw HTML string.
#
# Only stores the input in @html; no parsing happens here. Call #parse to
# populate the other attributes.
#
# @param html the HTML source that #parse later hands to Hpricot
def initialize(html)
  @html = html
end

Instance Attribute Details

#body ⇒ `Object`

Returns the value of attribute body.



5
6
7

# File 'lib/inverted_index/parse.rb', line 5

# Reader for @body.
#
# NOTE(review): neither #initialize nor #parse, as shown here, assigns @body,
# so this presumably returns nil unless it is set elsewhere -- confirm.
#
# @return the current value of @body
def body
  @body
end

#doc ⇒ `Object`

Returns the value of attribute doc.



5
6
7

# File 'lib/inverted_index/parse.rb', line 5

# Reader for @doc, the object returned by Hpricot(@html) in #parse.
#
# #parse removes the head, script, style, iframe and embed matches from it,
# so after parsing this is the pruned document. It is unset until #parse runs.
#
# @return the current value of @doc
def doc
  @doc
end

#html ⇒ `Object`

Returns the value of attribute html.



5
6
7

# File 'lib/inverted_index/parse.rb', line 5

# Reader for @html, the value passed to the constructor.
#
# @return the current value of @html
def html
  @html
end

#text ⇒ `Object`

Returns the value of attribute text.



5
6
7

# File 'lib/inverted_index/parse.rb', line 5

# Reader for @text, which #parse sets to the collected tokens joined by
# single spaces. It is unset until #parse runs.
#
# @return the current value of @text
def text
  @text
end

#tokens ⇒ `Object`

Returns the value of attribute tokens.



5
6
7

# File 'lib/inverted_index/parse.rb', line 5

# Reader for @tokens, the array of whitespace-separated words that #parse
# collects from the document's text nodes. It is unset until #parse runs.
#
# @return the current value of @tokens
def tokens
  @tokens
end

Instance Method Details

#clean(text) ⇒ `Object`

# File 'lib/inverted_index/parse.rb', line 37

# Flattens line breaks and tabs in a piece of text.
#
# Every newline ("\n") and tab ("\t") character is replaced by one space
# (runs are not collapsed), then leading and trailing whitespace is removed.
# Carriage returns inside the text are left untouched.
#
# @param text [String] the text to normalise
# @return [String] a new string; the argument is not modified
def clean(text)
  # tr does a plain character-for-character swap, so no regex is needed.
  text.tr("\n\t", ' ').strip
end

#parse ⇒ `Object`

# File 'lib/inverted_index/parse.rb', line 10

# Parses @html with Hpricot and extracts its indexable words.
#
# Side effects: assigns @doc (the parsed document with the non-indexable
# elements removed), @text_nodes (the nodes matched by "body//*/text()"),
# @tokens (the collected words) and @text.
#
# @return [String] the tokens joined by single spaces (also stored in @text)
def parse
  @doc = Hpricot(@html)

  # Drop the elements whose content should not end up in the index.
  %w[head script style iframe embed].each do |tag|
    @doc.search(tag).remove
  end

  # Collect the text nodes selected by the path expression.
  @text_nodes = (@doc / "body//*/text()")
  @tokens = []

  # Normalise each node's text, split it into words and keep the non-empty ones.
  @text_nodes.each do |node|
    pieces = clean(node.to_plain_text.strip).split(' ')
    @tokens.concat(pieces.reject { |piece| piece.empty? })
  end

  # The space-separated text is both stored and returned.
  @text = @tokens.join(' ')
end

Class: InvertedIndex::Parse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ Parse

Instance Attribute Details

#body ⇒ Object

#doc ⇒ Object

#html ⇒ Object

#text ⇒ Object

#tokens ⇒ Object