Class: Crawler

Inherits: Object

Defined in: lib/analyzer_tools/crawl.rb

Overview

A fast web crawler that stays on the site it started from. Crawler randomly picks a URL from the retrieved page and follows it. If it can't find a URL for the next page, Crawler starts over from the start URL.

Crawler is multi-threaded and can run as many threads as you choose.
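
A minimal usage sketch (assuming lib/ is on the load path, so the file above is loaded with require 'analyzer_tools/crawl'; start_url should be a URI object, since relative links are resolved against it):

require 'uri'
require 'analyzer_tools/crawl'

# Crawl example.com with four worker threads.
crawler = Crawler.new URI.parse('http://example.com/'), 4
crawler.run # blocks until the crawler is stopped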


Constructor Details

#initialize(start_url, threads = 1) ⇒ Crawler

Creates a new Crawler that will start at start_url and run threads concurrent threads.

Raises:

  • (ArgumentError)


# File 'lib/analyzer_tools/crawl.rb', line 25

def initialize(start_url, threads = 1)
  raise ArgumentError, "Thread count must be more than 0" if threads < 1
  @start_url = start_url
  @thread_count = threads
  @threads = ThreadGroup.new
  @times = []
end
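
For example, a thread count below 1 is rejected up front:

Crawler.new URI.parse('http://example.com/'), 0
# => ArgumentError: Thread count must be more than 0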

Instance Attribute Details

#times ⇒ Object (readonly)

Array of response times in seconds.



# File 'lib/analyzer_tools/crawl.rb', line 19

def times
  @times
end
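
Because times is a plain Array, simple statistics are easy to compute once some requests have completed, e.g. (assuming crawler is a Crawler instance):

average = crawler.times.inject(0) { |sum, t| sum + t } / crawler.times.length
puts "average response time: %.3f seconds" % average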

Instance Method Details

#do_request(url) ⇒ Object

Performs a GET request for url and returns the response body.



# File 'lib/analyzer_tools/crawl.rb', line 64

def do_request(url)
  req = []
  req << "GET #{url.request_uri} HTTP/1.0"
  req << "Host: #{url.host}"
  req << "User-Agent: RubyCrawl"
  req << ""
  req << ""
  req = req.join "\r\n"
  puts req

  begin
    s = TCPSocket.new url.host, url.port
    s.write req
    s.flush
    response = s.read
  ensure
    s.close unless s.nil?
  end

  headers, body = response.split(/\r\n\r\n/, 2) # limit 2 keeps blank lines inside the body

  headers = headers.split(/\r\n/)
  status = headers.shift
  headers = Hash[*headers.map { |h| h.split ': ', 2 }.flatten]

  puts status

  case status
  when / 302 / then
    body = "href=\"#{headers['Location']}\""
  when / 500 / then
    body = "href=\"#{@start_url}\""
  end

  return body
end
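
For reference, for the URL http://example.com/index.html the raw request written to the socket is:

GET /index.html HTTP/1.0
Host: example.com
User-Agent: RubyCrawl

Note that a 302 response is rewritten into a synthetic href for its Location header, and a 500 response into one for the start URL, so extract_url_from always has a link to follow.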

#extract_url_from(body, original_url) ⇒ Object

Returns a random URL on the same site as original_url from body, using original_url to resolve relative paths. If no valid URL is found, the start URL is returned.



# File 'lib/analyzer_tools/crawl.rb', line 126

def extract_url_from(body, original_url)
  urls = body.scan(/href="(.+?)"/)
  until urls.empty? do
    begin
      rand_url = urls.delete_at(rand(urls.length)).first
      new_url = original_url + rand_url
      return new_url if new_url.host == original_url.host
    rescue URI::InvalidURIError
      # The bad candidate was already deleted, so move on to the next
      # iteration (retry would fail once urls is empty).
      next
    end
  end

  return @start_url
end
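
A hypothetical call, with page standing in for the URI the body was fetched from:

body = '<a href="/about">About</a> <a href="http://other.example/">Away</a>'
page = URI.parse 'http://example.com/index.html'
crawler.extract_url_from body, page
# => http://example.com/about (a URI object); the off-site link may be
#    picked but is never returned, and with no same-host candidates left
#    the start URL is returned instead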

#runObject

Begins crawling.



# File 'lib/analyzer_tools/crawl.rb', line 36

def run
  url = @start_url # this local is shared by every worker thread

  @thread_count.times do
    Thread.start do
      @threads.add Thread.current
      loop do
        puts ">>> #{url}"
        body = timed_request url
        url = extract_url_from body, url
      end
    end
    Thread.pass
  end

  @threads.list.first.join until @threads.list.empty?
end

#stopObject

Stops crawling.



# File 'lib/analyzer_tools/crawl.rb', line 57

def stop
  @threads.list.first.kill until @threads.list.empty?
end
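
Because run blocks until every worker thread is gone, stop is meant to be called from elsewhere, such as a signal handler (a sketch):

crawler = Crawler.new URI.parse('http://example.com/'), 2
trap('INT') { crawler.stop } # Ctrl-C kills the worker threads
crawler.run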

#timeObject

Returns the amount of time, in seconds, taken to execute the given block.



# File 'lib/analyzer_tools/crawl.rb', line 104

def time
  start_time = Time.now.to_f
  yield
  end_time = Time.now.to_f
  return end_time - start_time
end
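
For example:

crawler.time { sleep 0.25 } # => roughly 0.25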

#timed_request(url) ⇒ Object

Performs a request of url, recording the time taken into times. Returns the response body.



# File 'lib/analyzer_tools/crawl.rb', line 115

def timed_request(url)
  body = nil
  @times << time { body = do_request(url) }
  return body
end
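
Each call appends one entry to times, so the duration of the most recent request is available as times.last:

body = crawler.timed_request URI.parse('http://example.com/')
puts "request took %.3f seconds" % crawler.times.last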