Class: Relevance::Tarantula::Crawler

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Relevance::Tarantula
Defined in:
lib/relevance/tarantula/crawler.rb

Defined Under Namespace

Classes: CrawlTimeout

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Relevance::Tarantula

#log, #rails_root, #tarantula_home, #verbose

Constructor Details

#initialize ⇒ Crawler

Returns a new instance of Crawler.



18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# File 'lib/relevance/tarantula/crawler.rb', line 18

# Builds a crawler with its default configuration and empty bookkeeping.
#
# Defaults established here:
# * URLs longer than 1024 characters are skipped (see should_skip_url?).
# * Result is both the response-code handler and the only entry in @handlers.
# * javascript:, mailto: and http-prefixed URLs are skipped.
# * One URL transformation is registered: strip everything from '#' onward.
# * Results are reported through a single IOReporter writing to $stderr.
# * The site is crawled once, with FormSubmission as the only fuzzer.
#
# NOTE(review): 20.minutes relies on a Numeric#minutes extension that is not
# defined in this file (presumably ActiveSupport) - confirm it is loaded.
def initialize
  @max_url_length = 1024
  @successes = []
  @failures = []
  @response_code_handler = Result
  @handlers = [@response_code_handler]
  @links_queued = Set.new
  @form_signatures_queued = Set.new
  @crawl_queue = []
  @crawl_start_times = []
  @crawl_end_times = []
  @crawl_timeout = 20.minutes
  @referrers = {}
  @skip_uri_patterns = [/^javascript/, /^mailto/, /^http/]
  # Goes through the writer (not the ivar), exactly as before.
  self.transform_url_patterns = [[/#.*$/, '']]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]

  # Cached once so blip does not query the terminal on every request.
  @stdout_tty = $stdout.tty?
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(meth, *args) ⇒ Object



45
46
47
48
# File 'lib/relevance/tarantula/crawler.rb', line 45

# Forwards calls whose name matches Result::ALLOW_NNN_FOR to
# @response_code_handler; any other unknown method goes to +super+ and
# raises as usual.
#
# meth  - Symbol name of the missing method.
# args  - positional arguments, forwarded unchanged.
# block - optional block, forwarded as well (it used to be dropped silently).
#
# NOTE(review): Result::ALLOW_NNN_FOR is defined outside this file; it
# presumably matches names such as allow_404_for - confirm.
def method_missing(meth, *args, &block)
  return super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args, &block)
end

# Keeps respond_to? / method(...) truthful about the names that
# method_missing delegates.
def respond_to_missing?(meth, include_private = false)
  return true if Result::ALLOW_NNN_FOR =~ meth.to_s
  super
end

Instance Attribute Details

#crawl_end_times ⇒ Object (readonly)

Returns the value of attribute crawl_end_times.



16
17
18
# File 'lib/relevance/tarantula/crawler.rb', line 16

# Returns @crawl_end_times: starts as an empty Array in initialize, and
# do_crawl appends Time.now to it each time crawl_the_queue returns.
def crawl_end_times
  @crawl_end_times
end

#crawl_queue ⇒ Object

Returns the value of attribute crawl_queue.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @crawl_queue, the pending work list: queue_link and queue_form
# append to it, and crawl_the_queue pops entries off the end.
def crawl_queue
  @crawl_queue
end

#crawl_start_times ⇒ Object (readonly)

Returns the value of attribute crawl_start_times.



16
17
18
# File 'lib/relevance/tarantula/crawler.rb', line 16

# Returns @crawl_start_times: starts as an empty Array in initialize, and
# do_crawl appends Time.now before each call to crawl_the_queue.
# elasped_time_for_pass indexes into it.
def crawl_start_times
  @crawl_start_times
end

#crawl_timeout ⇒ Object

Returns the value of attribute crawl_timeout.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @crawl_timeout (20.minutes by default). timeout_if_too_long raises
# CrawlTimeout once a pass's elapsed time exceeds it.
def crawl_timeout
  @crawl_timeout
end

#failures ⇒ Object (readonly)

Returns the value of attribute failures.



16
17
18
# File 'lib/relevance/tarantula/crawler.rb', line 16

# Returns @failures, which initialize sets to an empty Array.
# NOTE(review): nothing in the methods shown on this page appends to it -
# confirm where (or whether) it is populated.
def failures
  @failures
end

#form_signatures_queued ⇒ Object

Returns the value of attribute form_signatures_queued.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @form_signatures_queued, the Set of form-submission signatures
# already queued. queue_form adds to it and should_skip_form_submission?
# consults it to avoid queuing the same submission twice.
def form_signatures_queued
  @form_signatures_queued
end

#fuzzers ⇒ Object

Returns the value of attribute fuzzers.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @fuzzers (by default [Relevance::Tarantula::FormSubmission]).
# queue_form calls +mutate+ on each entry to get the submissions to queue.
def fuzzers
  @fuzzers
end

#handlers ⇒ Object

Returns the value of attribute handlers.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @handlers (by default a one-element Array holding Result).
# handle_link_results and handle_form_results call +handle+ on each entry.
def handlers
  @handlers
end

#links_queued ⇒ Object

Returns the value of attribute links_queued.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @links_queued, the Set of links already queued. queue_link adds to
# it and should_skip_link? consults it to avoid queuing a link twice.
def links_queued
  @links_queued
end

#log_grabber ⇒ Object

Returns the value of attribute log_grabber.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @log_grabber. It is not assigned in initialize, so it is nil until
# set; grab_log! calls +grab!+ on it only when it is set.
def log_grabber
  @log_grabber
end

#max_url_length ⇒ Object

Returns the value of attribute max_url_length.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @max_url_length (1024 by default). should_skip_url? skips any URL
# whose length exceeds it.
def max_url_length
  @max_url_length
end

#proxy ⇒ Object

Returns the value of attribute proxy.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @proxy, the object that follow and submit dispatch requests to.
# It is not assigned in initialize, so it is nil until set.
def proxy
  @proxy
end

#referrers ⇒ Object (readonly)

Returns the value of attribute referrers.



16
17
18
# File 'lib/relevance/tarantula/crawler.rb', line 16

# Returns @referrers, a Hash in which queue_form records the referrer of each
# queued form action. crawl replaces it with an empty Hash between passes.
def referrers
  @referrers
end

#reporters ⇒ Object

Returns the value of attribute reporters.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @reporters (by default a single IOReporter writing to $stderr).
# save_result calls +report+ on each; generate_reports calls +finish_report+.
def reporters
  @reporters
end

#response_code_handler ⇒ Object

Returns the value of attribute response_code_handler.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @response_code_handler (Result by default), the receiver that
# method_missing forwards matching calls to.
def response_code_handler
  @response_code_handler
end

#skip_uri_patterns ⇒ Object

Returns the value of attribute skip_uri_patterns.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @skip_uri_patterns, the Array of patterns tested with =~ in
# should_skip_url?; a URL matching any of them is skipped. The defaults match
# javascript, mailto and http prefixes.
def skip_uri_patterns
  @skip_uri_patterns
end

#successes ⇒ Object (readonly)

Returns the value of attribute successes.



16
17
18
# File 'lib/relevance/tarantula/crawler.rb', line 16

# Returns @successes, which initialize sets to an empty Array.
# NOTE(review): nothing in the methods shown on this page appends to it -
# confirm where (or whether) it is populated.
def successes
  @successes
end

#test_name ⇒ Object

Returns the value of attribute test_name.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @test_name. It is not assigned in initialize; it is stamped onto
# every Result (:test_name) and passed to each reporter's finish_report.
def test_name
  @test_name
end

#times_to_crawl ⇒ Object

Returns the value of attribute times_to_crawl.



12
13
14
# File 'lib/relevance/tarantula/crawler.rb', line 12

# Returns @times_to_crawl (1 by default), the number of passes crawl makes.
def times_to_crawl
  @times_to_crawl
end

#transform_url_patterns ⇒ Object

Returns the value of attribute transform_url_patterns.



16
17
18
# File 'lib/relevance/tarantula/crawler.rb', line 16

# Returns @transform_url_patterns; transform_url applies each entry in turn
# by calling entry[url].
# NOTE(review): initialize assigns through the transform_url_patterns= writer,
# which is not shown here; it presumably converts [regexp, replacement] pairs
# into objects that respond to [] - confirm.
def transform_url_patterns
  @transform_url_patterns
end

Instance Method Details

#blip(number = 0) ⇒ Object



242
243
244
245
246
247
# File 'lib/relevance/tarantula/crawler.rb', line 242

# Progress tick, called after every crawled request.
#
# Does nothing in verbose mode. Otherwise it rewrites the progress line (only
# when stdout was a TTY at construction time) and then enforces the crawl
# timeout for pass +number+, which may raise CrawlTimeout.
#
# number - index of the current crawl pass (default 0).
def blip(number = 0)
  return if verbose

  if @stdout_tty
    # "\r" returns to column 0 so the line overwrites itself; the trailing
    # padding wipes leftovers from a longer previous line.
    print "\r #{links_completed_count} of #{total_links_count} links completed               "
  end
  timeout_if_too_long(number)
end

#crawl(url = "/") ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# File 'lib/relevance/tarantula/crawler.rb', line 56

# Crawls the site starting at +url+, repeating the crawl @times_to_crawl times.
#
# Each pass queues +url+ and runs do_crawl. After every pass except the last,
# the link set, form-signature set and crawl queue are reset to the state they
# had when crawl was called, and @referrers is emptied, so the next pass starts
# from scratch. A CrawlTimeout raised during a pass is printed and ends only
# that pass; Interrupt (CTRL-C) ends crawling. report_results always runs.
#
# url - starting URL handed to queue_link (default "/").
def crawl(url = "/")
  # Pristine snapshots of the pre-crawl state. They are never installed
  # directly, only copied, so they stay pristine for every pass.
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_crawl_queue = @crawl_queue.dup
  @times_to_crawl.times do |num|
    queue_link url

    begin
      do_crawl num
    rescue CrawlTimeout => e
      puts
      puts e.message
    end

    puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1

    if num + 1 < @times_to_crawl
      # Install copies: assigning the snapshot objects themselves would let the
      # next pass mutate them, so a third (or later) pass would begin with the
      # previous pass's links and forms already marked as queued.
      @links_queued = orig_links_queued.dup
      @form_signatures_queued = orig_form_signatures_queued.dup
      @crawl_queue = orig_crawl_queue.dup
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end

#crawl_the_queue(number = 0) ⇒ Object



97
98
99
100
101
102
# File 'lib/relevance/tarantula/crawler.rb', line 97

# Drains @crawl_queue from the end, crawling each request and emitting a
# progress tick (blip) after it. Stops as soon as pop yields nil/false, i.e.
# normally when the queue is empty. Requests queued while draining are picked
# up too, because the queue is re-read after every request.
#
# number - index of the current crawl pass, forwarded to blip (default 0).
def crawl_the_queue(number = 0)
  request = @crawl_queue.pop
  while request
    request.crawl
    blip(number)
    request = @crawl_queue.pop
  end
end

#do_crawl(number) ⇒ Object



89
90
91
92
93
94
95
# File 'lib/relevance/tarantula/crawler.rb', line 89

# Runs one crawl pass: until the crawler reports finished?, records a start
# timestamp, drains the queue, then records an end timestamp.
#
# number - index of the current crawl pass, forwarded to crawl_the_queue.
def do_crawl(number)
  until finished?
    @crawl_start_times.push(Time.now)
    crawl_the_queue(number)
    @crawl_end_times.push(Time.now)
  end
end

#elasped_time_for_pass(num) ⇒ Object



129
130
131
# File 'lib/relevance/tarantula/crawler.rb', line 129

# Seconds elapsed between the start time recorded at index +num+ of
# crawl_start_times and now.
#
# num - index into crawl_start_times.
def elasped_time_for_pass(num)
  started_at = crawl_start_times[num]
  Time.now - started_at
end

#finished? ⇒ Boolean

Returns:

  • (Boolean)


85
86
87
# File 'lib/relevance/tarantula/crawler.rb', line 85

# True when nothing is left in @crawl_queue; do_crawl loops until this holds.
def finished?
  @crawl_queue.empty?
end

#follow(method, url, data = nil) ⇒ Object



121
122
123
# File 'lib/relevance/tarantula/crawler.rb', line 121

# Issues a request by invoking +method+ on the proxy with (url, data) and
# returns whatever the proxy returns.
#
# method - name of the proxy method to call.
#          (presumably an HTTP verb such as :get - confirm against callers)
# url    - target URL.
# data   - optional payload (default nil).
def follow(method, url, data=nil)
  proxy.send(method, url, data)
end

#generate_reports ⇒ Object



211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/relevance/tarantula/crawler.rb', line 211

# Asks every reporter to finish its report for test_name.
#
# A RuntimeError from one reporter does not stop the others: all reporters are
# given their turn, and the collected error messages are then raised together
# as one RuntimeError, joined by newlines. Returns nil when none failed.
def generate_reports
  caught = reporters.each_with_object([]) do |reporter, collected|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => error
      collected << error
    end
  end
  return if caught.empty?

  raise caught.map(&:message).join("\n")
end

#grab_log! ⇒ Object



133
134
135
# File 'lib/relevance/tarantula/crawler.rb', line 133

# Returns the result of @log_grabber.grab! when a log grabber is configured.
# With no grabber it returns the (nil/false) value of @log_grabber itself.
def grab_log!
  grabber = @log_grabber
  return grabber unless grabber

  grabber.grab!
end

#handle_form_results(form, response) ⇒ Object



145
146
147
148
149
150
151
152
153
154
155
# File 'lib/relevance/tarantula/crawler.rb', line 145

# Wraps the outcome of a form submission in a frozen Result, passes it through
# every handler and saves whatever each handler returns.
#
# A fresh Result is built per handler, so grab_log! runs once per handler.
#
# form     - the submitted form; its method, action and data are read.
# response - the response received for the submission.
#
# NOTE(review): :referrer is set to the form's own action, not to a referring
# page - confirm this is intended.
def handle_form_results(form, response)
  handlers.each do |handler|
    attributes = {
      :method    => form.method,
      :url       => form.action,
      :response  => response,
      :log       => grab_log!,
      :referrer  => form.action,
      :data      => form.data.inspect,
      :test_name => test_name
    }
    outcome = handler.handle(Result.new(attributes).freeze)
    save_result outcome
  end
end


#handle_link_results(link, result) ⇒ Object

110
111
112
113
114
115
116
117
118
119
# File 'lib/relevance/tarantula/crawler.rb', line 110

# Passes +result+ through every handler and saves what each one returns.
#
# Handling is best-effort: an error raised by one handler (or by saving its
# result) is logged and the remaining handlers still run. Process-control
# exceptions - signals such as CTRL-C (Interrupt) and SystemExit - are
# re-raised instead of being swallowed.
#
# link   - the link that was crawled; used only in the log message.
# result - the result object handed to each handler.
def handle_link_results(link, result)
  handlers.each do |h|
    begin
      save_result h.handle(result)
    rescue SignalException, SystemExit
      # Let CTRL-C / exit requests reach the caller; logging them here
      # would make the crawl impossible to stop while handling a link.
      raise
    rescue Exception => e
      log "error handling #{link} #{e.message}"
      # TODO: pass to results
    end
  end
end


#links_completed_count ⇒ Object

238
239
240
# File 'lib/relevance/tarantula/crawler.rb', line 238

# Number of queued links and forms that have already been taken off the crawl
# queue: everything ever queued minus what is still waiting.
def links_completed_count
  queued = total_links_count
  waiting = links_remaining_count
  queued - waiting
end


#links_remaining_count ⇒ Object

234
235
236
# File 'lib/relevance/tarantula/crawler.rb', line 234

# Number of entries still waiting in @crawl_queue.
def links_remaining_count
  @crawl_queue.size
end

#make_result(options) ⇒ Object



137
138
139
140
141
142
143
# File 'lib/relevance/tarantula/crawler.rb', line 137

# Builds a frozen Result from +options+, filling in :log (from grab_log!) and
# :test_name unless the caller supplies them.
#
# options - Hash of Result attributes; its keys win over the defaults.
def make_result(options)
  attributes = { :log => grab_log!, :test_name => test_name }.merge(options)
  Result.new(attributes).freeze
end

#queue_form(form, referrer = nil) ⇒ Object



194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/relevance/tarantula/crawler.rb', line 194

# Queues the submissions that every fuzzer derives from +form+.
#
# For each submission the action URL is normalised with transform_url, the
# referrer (if given) is recorded under that action, the submission is pushed
# onto the crawl queue, and its signature is remembered so it is not queued
# again.
#
# form     - raw form, wrapped in Form.new(form, self, referrer).
# referrer - URL of the page the form was found on (default nil).
#
# NOTE(review): the first submission that should be skipped returns from the
# whole method, so remaining submissions and fuzzers are not examined - confirm
# this is intended rather than skipping just that one submission.
def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    submissions = fuzzer.mutate(Form.new(form, self, referrer))
    submissions.each do |submission|
      submission.action = transform_url(submission.action)
      return if should_skip_form_submission?(submission)

      @referrers[submission.action] = referrer if referrer
      @crawl_queue << submission
      @form_signatures_queued << submission.signature
    end
  end
end


#queue_link(dest, referrer = nil) ⇒ Object

186
187
188
189
190
191
192
# File 'lib/relevance/tarantula/crawler.rb', line 186

# Wraps +dest+ in a Link and queues it for crawling unless it should be
# skipped (see should_skip_link?).
#
# dest     - link target, passed to Link.new.
# referrer - page the link was found on (default nil).
#
# Returns the queued Link, or nil when the link was skipped.
def queue_link(dest, referrer = nil)
  link = Link.new(dest, self, referrer)
  unless should_skip_link?(link)
    @crawl_queue << link
    @links_queued << link
    link
  end
end

#report_dir ⇒ Object



207
208
209
# File 'lib/relevance/tarantula/crawler.rb', line 207

# Path of the report directory: <rails_root>/tmp/tarantula.
def report_dir
  File.join(rails_root, *%w[tmp tarantula])
end

#report_results ⇒ Object



225
226
227
228
# File 'lib/relevance/tarantula/crawler.rb', line 225

# Prints a one-line summary of how many links and forms were queued, then
# finalises the reporters. Errors raised by generate_reports propagate.
def report_results
  summary = "Crawled #{total_links_count} links and forms."
  puts summary
  generate_reports
end

#save_result(result) ⇒ Object



104
105
106
107
108
# File 'lib/relevance/tarantula/crawler.rb', line 104

# Hands +result+ to every configured reporter.
#
# result - the object passed to each reporter's +report+ method.
def save_result(result)
  reporters.each { |reporter| reporter.report(result) }
end

#should_skip_form_submission?(fs) ⇒ Boolean

Returns:

  • (Boolean)


173
174
175
# File 'lib/relevance/tarantula/crawler.rb', line 173

# Whether a form submission should not be queued: either its action URL is
# one to skip (see should_skip_url?) or a submission with the same signature
# has already been queued.
#
# fs - form submission; its +action+ and +signature+ are read.
def should_skip_form_submission?(fs)
  url_verdict = should_skip_url?(fs.action)
  return url_verdict if url_verdict

  @form_signatures_queued.member?(fs.signature)
end

#should_skip_link?(link) ⇒ Boolean

Returns:

  • (Boolean)


169
170
171
# File 'lib/relevance/tarantula/crawler.rb', line 169

# Whether a link should not be queued: either its href is a URL to skip (see
# should_skip_url?) or the link is already in @links_queued.
#
# link - link object; its +href+ is read.
def should_skip_link?(link)
  url_verdict = should_skip_url?(link.href)
  return url_verdict if url_verdict

  @links_queued.member?(link)
end

#should_skip_url?(url) ⇒ Boolean

Returns:

  • (Boolean)


157
158
159
160
161
162
163
164
165
166
167
# File 'lib/relevance/tarantula/crawler.rb', line 157

# Decides whether +url+ must not be crawled.
#
# Returns true when the URL is blank, matches one of @skip_uri_patterns, or is
# longer than max_url_length; the latter two cases are logged. Returns nil (not
# false) when the URL is acceptable.
#
# url - URL string to test.
def should_skip_url?(url)
  return true if url.blank?

  reason =
    if @skip_uri_patterns.any? { |pattern| pattern =~ url }
      "Skipping #{url}"
    elsif url.length > max_url_length
      "Skipping long url #{url}"
    end
  return unless reason

  log reason
  true
end

#submit(method, action, data) ⇒ Object



125
126
127
# File 'lib/relevance/tarantula/crawler.rb', line 125

# Submits a form by invoking +method+ on the proxy with (action, data) and
# returns whatever the proxy returns. Same dispatch as follow, but +data+ is
# required.
#
# method - name of the proxy method to call.
#          (presumably an HTTP verb such as :post - confirm against callers)
# action - form action URL.
# data   - form payload.
def submit(method, action, data)
  proxy.send(method, action, data)
end

#timeout_if_too_long(number = 0) ⇒ Object



249
250
251
252
253
# File 'lib/relevance/tarantula/crawler.rb', line 249

# Raises CrawlTimeout when pass +number+ has been running longer than
# crawl_timeout; otherwise returns nil.
#
# number - index of the current crawl pass (default 0).
def timeout_if_too_long(number = 0)
  return unless elasped_time_for_pass(number) > crawl_timeout

  raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
end


#total_links_count ⇒ Object

230
231
232
# File 'lib/relevance/tarantula/crawler.rb', line 230

# Total number of distinct links and form submissions queued so far (the
# sizes of @links_queued and @form_signatures_queued added together).
def total_links_count
  [@links_queued, @form_signatures_queued].inject(0) { |count, queued| count + queued.size }
end

#transform_url(url) ⇒ Object



177
178
179
180
181
182
183
184
# File 'lib/relevance/tarantula/crawler.rb', line 177

# Normalises +url+: decodes it with @decoder, then feeds it through every
# entry of @transform_url_patterns in order (each is invoked as entry[url]).
#
# url - URL string; nil/false yields nil.
#
# Returns the transformed URL.
def transform_url(url)
  return unless url

  decoded = @decoder.decode(url)
  @transform_url_patterns.inject(decoded) { |current, pattern| pattern[current] }
end