Class: Relevance::Tarantula::HtmlDocumentHandler

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Defined in:
lib/relevance/tarantula/html_document_handler.rb

Instance Method Summary collapse

Constructor Details

#initialize(crawler) ⇒ HtmlDocumentHandler

Returns a new instance of HtmlDocumentHandler.



7
8
9
# File 'lib/relevance/tarantula/html_document_handler.rb', line 7

# Builds a handler bound to the given crawler.
#
# The crawler is stored in @crawler and is not used further within this
# method. NOTE(review): the class extends Forwardable, so calls such as
# queue_link / queue_form are presumably delegated to this object — confirm
# against the delegator declarations in the class body.
def initialize(crawler)
  @crawler = crawler
end

Instance Method Details

#handle(result) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/relevance/tarantula/html_document_handler.rb', line 19

# Queues every link and form found in an HTML response.
#
# Responses for which +response.html?+ is falsy are ignored. Otherwise the
# response body is parsed with +html_doc_without_stderr_noise+, each +a+ and
# +link+ element is passed to +queue_link+, and each +form+ element is passed
# to +queue_form+, always together with the URL of the result. A form without
# an +action+ attribute has the result URL assigned as its action before it
# is queued.
#
# Always returns nil.
def handle(result)
  page = result.response
  source_url = result.url
  return unless page.html?

  document = html_doc_without_stderr_noise(page.body)

  # All anchors are queued first, then all <link> elements.
  ['a', 'link'].each do |selector|
    document.search(selector).each { |element| queue_link(element, source_url) }
  end

  document.search('form').each do |form|
    # Forms with no explicit action submit back to the page they came from.
    form['action'] ||= source_url
    queue_form(form, source_url)
  end

  nil
end

#html_doc_without_stderr_noise(html) ⇒ Object

HTML::Document shouts to stderr when it sees ugly HTML. We don’t want this – the InvalidHtmlHandler will deal with it.



12
13
14
15
16
17
18
# File 'lib/relevance/tarantula/html_document_handler.rb', line 12

# Parses +html+ with Hpricot inside a +Recording.stderr+ block and returns
# the parsed document.
#
# NOTE(review): Recording.stderr presumably captures anything written to
# stderr while the block runs, so parser complaints about malformed markup
# do not reach the console — confirm against Recording's implementation.
# The result is captured through a local rather than the block's return
# value, so it does not depend on what Recording.stderr itself returns.
def html_doc_without_stderr_noise(html)
  parsed = nil
  Recording.stderr { parsed = Hpricot(html) }
  parsed
end