Class: InvertedIndex::Parse

Inherits:
Object
  • Object
show all
Defined in:
lib/inverted_index/parse.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(html) ⇒ Parse

Returns a new instance of Parse.



6
7
8
# File 'lib/inverted_index/parse.rb', line 6

# Create a parser around a piece of markup.
#
# The constructor only stores its argument; no parsing is done here.
#
# @param html [Object] the markup to keep in @html
#   (presumably an HTML String — confirm against callers)
def initialize(html)
  @html = html
end

Instance Attribute Details

#body ⇒ Object

Returns the value of attribute body.



5
6
7
# File 'lib/inverted_index/parse.rb', line 5

# Reader for the +body+ attribute.
#
# NOTE(review): none of the methods shown for this class assigns @body, so
# this presumably returns nil unless a writer defined elsewhere sets it —
# confirm.
#
# @return [Object, nil] the current value of @body
def body
  @body
end

#doc ⇒ Object

Returns the value of attribute doc.



5
6
7
# File 'lib/inverted_index/parse.rb', line 5

# Reader for the +doc+ attribute.
#
# #parse assigns @doc the result of Hpricot(@html) and then removes the
# head, script, style, iframe and embed elements from it.
#
# @return [Object, nil] the current value of @doc
#   (presumably nil until #parse has run — confirm no other writer exists)
def doc
  @doc
end

#html ⇒ Object

Returns the value of attribute html.



5
6
7
# File 'lib/inverted_index/parse.rb', line 5

# Reader for the +html+ attribute.
#
# @return [Object] the current value of @html, which the constructor sets
#   to the argument it was given
def html
  @html
end

#text ⇒ Object

Returns the value of attribute text.



5
6
7
# File 'lib/inverted_index/parse.rb', line 5

# Reader for the +text+ attribute.
#
# #parse assigns @text the collected tokens joined with single spaces.
#
# @return [Object, nil] the current value of @text
#   (presumably nil until #parse has run — confirm no other writer exists)
def text
  @text
end

#tokens ⇒ Object

Returns the value of attribute tokens.



5
6
7
# File 'lib/inverted_index/parse.rb', line 5

# Reader for the +tokens+ attribute.
#
# #parse resets @tokens to an empty Array and fills it with the
# whitespace-separated words of each text node it visits.
#
# @return [Object, nil] the current value of @tokens
#   (presumably nil until #parse has run — confirm no other writer exists)
def tokens
  @tokens
end

Instance Method Details

#clean(text) ⇒ Object



37
38
39
40
# File 'lib/inverted_index/parse.rb', line 37

# Turn line breaks and tabs in +text+ into spaces and trim both ends.
#
# A CRLF pair counts as one line break and a lone carriage return is treated
# like a newline, so input with Windows-style or CR-only line endings does
# not keep stray "\r" characters inside the result. Runs are not squeezed:
# apart from CRLF pairs, each matched character becomes exactly one space.
#
# @param text [String] raw text to normalise
# @return [String] a new, stripped string; +text+ itself is not modified
def clean(text)
  text.gsub(/\r\n|[\r\n\t]/, ' ').strip
end

#parse ⇒ Object



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/inverted_index/parse.rb', line 10

# Parse the stored markup and extract its indexable words.
#
# Builds @doc from @html, removes elements that should not be indexed,
# collects the text nodes below body into @text_nodes, and splits each one
# into whitespace-separated words that are accumulated in @tokens.
#
# @return [String] all tokens joined by single spaces (also stored in @text)
def parse
  @doc = Hpricot(@html)

  # Strip elements whose contents must not end up in the index
  %w[head script style iframe embed].each do |tag|
    @doc.search(tag).remove
  end

  # Every text node found under an element inside body
  @text_nodes = (@doc / "body//*/text()")
  @tokens = []

  # split(' ') never yields empty strings, so each word can be appended as-is
  @text_nodes.each do |node|
    plain = node.to_plain_text.strip
    @tokens.concat(clean(plain).split(' '))
  end

  # The space-joined token list is both stored and returned
  @text = @tokens.join(' ')
end