Class: Crawler::Webcrawler

Inherits: Object
Includes: Observable
Defined in: lib/crawler/webcrawler.rb

Instance Attribute Summary

Instance Method Summary

Constructor Details

#initialize(options = {}) ⇒ Webcrawler

Accepts the following options:

  • timeout – Time limit for the crawl operation, after which a Timeout::Error exception is raised.

  • external – Boolean; whether the crawler will follow links outside the original URI’s host.

  • exclude – A URI will be excluded if its path contains any of the strings in this array.
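
For example, a crawler confined to its starting host might be constructed like this (a sketch; the option values are illustrative, not defaults):

crawler = Crawler::Webcrawler.new(
  :timeout  => 30,           # abort the crawl after 30 seconds
  :external => false,        # stay on the starting URI's host
  :exclude  => ["/private"]  # skip URIs whose path contains "/private"
)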



# File 'lib/crawler/webcrawler.rb', line 23

def initialize(options={})
  @crawled = Set.new
  @queue = []
  @options = {
    :timeout => 1.0/0, # Infinity: no time limit by default
    :external => false,
    :exclude => []
  }.merge(options)
end

Instance Attribute Details

#crawled ⇒ Object

Set of all URIs which have been crawled



# File 'lib/crawler/webcrawler.rb', line 13

def crawled
  @crawled
end
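
For instance, after a crawl finishes, the set can be inspected (a sketch; crawler is assumed to be a Webcrawler that has already run #crawl):

crawler.crawled.each { |uri| puts uri }  # print every URI visited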

#options ⇒ Object

Hash of crawler options; see #initialize for the recognized keys.



# File 'lib/crawler/webcrawler.rb', line 17

def options
  @options
end

#queue ⇒ Object

Queue of URIs to be crawled. An Array used as a FIFO queue, which gives the crawl its breadth-first order.



# File 'lib/crawler/webcrawler.rb', line 15

def queue
  @queue
end

Instance Method Details

#crawl(start_uri) ⇒ Object

Given a URI object, the crawler explores every linked page using a breadth-first search. Whenever it downloads a page, it notifies observers with the Net::HTTPResponse subclass object and the URI object that was downloaded.
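
Because Webcrawler includes Observable, any object that responds to #update can subscribe to these notifications. A minimal usage sketch (LoggingObserver is a hypothetical observer class):

require 'crawler/webcrawler'
require 'uri'

# Hypothetical observer; Observable invokes #update with the
# arguments passed to notify_observers: the response, then the URI.
class LoggingObserver
  def update(response, uri)
    puts "#{response.code} #{uri}"
  end
end

crawler = Crawler::Webcrawler.new(:timeout => 60)
crawler.add_observer(LoggingObserver.new)
crawler.crawl(URI.parse("http://example.com/"))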



# File 'lib/crawler/webcrawler.rb', line 36

def crawl(start_uri)
  start_uri = start_uri.normalize
  @queue << start_uri

  # Abort the whole crawl with Timeout::Error once :timeout elapses.
  Timeout.timeout(@options[:timeout]) {
    while(uri = @queue.shift)

      Net::HTTP.start(uri.host, uri.port) do |http|

        # Check the content type first. If the page retrieved is not an
        # HTML document, we'll choke on it anyway, so skip it.
        head = http.head(uri.path)
        next if head.content_type != "text/html"

        resp = http.get(uri.path)

        # Notify observers (see Observable) with the response and URI.
        changed
        notify_observers(resp, uri)

        # Collect the target of every anchor tag, resolving relative
        # links against the current URI. Links that fail to parse map
        # to nil and are compacted away below.
        html = Nokogiri.parse(resp.body)
        a_tags = html.search("a")
        @queue += a_tags.collect do |t|
          begin
            uri + t.attribute("href").to_s.strip
          rescue
            nil
          end
        end
        @queue = @queue.compact.uniq

        # Discard URIs that were already crawled, point back at the
        # current page, are not HTTP, leave the host while :external is
        # off, or match an :exclude string.
        @queue = @queue.reject {|u|
          @crawled.include?(u) or
          u == uri or
          !(u.kind_of?(URI::HTTP)) or
          (u.host != uri.host and !@options[:external]) or
          (@options[:exclude].any? { |excl| u.path.include?(excl)})
        }
      end
      @crawled << uri
    end
  }
end