Class: Relevance::Tarantula::Crawler
Defined Under Namespace
Classes: CrawlTimeout
Instance Attribute Summary collapse
Instance Method Summary
collapse
#log, #rails_root, #tarantula_home, #verbose
Constructor Details
Returns a new instance of Crawler.
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
|
# File 'lib/relevance/tarantula/crawler.rb', line 18
# Builds a crawler with its default configuration: empty result lists and
# work queues, a 1024-character URL limit, a 20 minute crawl timeout, one
# crawl pass, a single reporter writing to $stderr and FormSubmission as the
# only fuzzer.
def initialize
  @max_url_length = 1024
  @successes = []
  @failures = []

  # Result is both the response-code handler and the only default handler.
  @response_code_handler = Result
  @handlers = [@response_code_handler]

  # The *_queued sets remember everything ever queued; the *_to_crawl arrays
  # hold the work that is still pending.
  @links_queued = Set.new
  @form_signatures_queued = Set.new
  @links_to_crawl = []
  @forms_to_crawl = []

  @crawl_start_times = []
  @crawl_end_times = []
  @crawl_timeout = 20.minutes
  @referrers = {}

  @skip_uri_patterns = [/^javascript/, /^mailto/, /^http/]

  # Goes through the writer rather than the ivar; presumably each entry is a
  # [pattern, replacement] pair (here: drop the "#fragment") -- confirm
  # against the transform_url_patterns= writer.
  self.transform_url_patterns = [[/#.*$/, '']]

  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]
end
|
Dynamic Method Handling
This class handles dynamic methods through the method_missing method
#method_missing(meth, *args) ⇒ Object
44
45
46
47
|
# File 'lib/relevance/tarantula/crawler.rb', line 44
# Forwards response-code configuration calls to the response code handler.
#
# Only method names matching Result::ALLOW_NNN_FOR are delegated; every other
# name is passed to the default method_missing, which raises as usual.
#
# @param meth [Symbol] name of the method that was not found
# @param args [Array] arguments forwarded unchanged to the handler
# @return [Object] whatever the response code handler returns
# @raise [NoMethodError] when the name does not match Result::ALLOW_NNN_FOR
def method_missing(meth, *args)
  return super unless Result::ALLOW_NNN_FOR =~ meth.to_s

  @response_code_handler.send(meth, *args)
end

# Keeps respond_to? (and Object#method) consistent with method_missing: the
# crawler claims to respond to exactly the names it is willing to delegate.
#
# @param meth [Symbol] method name being queried
# @param include_private [Boolean] passed through to the default implementation
# @return [Boolean]
def respond_to_missing?(meth, include_private = false)
  !!(Result::ALLOW_NNN_FOR =~ meth.to_s) || super
end
|
Instance Attribute Details
#crawl_end_times ⇒ Object
Returns the value of attribute crawl_end_times.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# @return [Array<Time>] end timestamps; #do_crawl appends one after every
#   round of draining the link and form queues
def crawl_end_times; @crawl_end_times; end
|
#crawl_start_times ⇒ Object
Returns the value of attribute crawl_start_times.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# @return [Array<Time>] start timestamps; #do_crawl appends one before every
#   round of draining the queues, and #elasped_time_for_pass reads them
def crawl_start_times; @crawl_start_times; end
|
#crawl_timeout ⇒ Object
Returns the value of attribute crawl_timeout.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Object] the limit #timeout_if_too_long compares the elapsed time
#   against; initialized to 20.minutes
def crawl_timeout; @crawl_timeout; end
|
#failures ⇒ Object
Returns the value of attribute failures.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# @return [Array] the failures list; starts out empty (nothing in the methods
#   shown on this page appends to it)
def failures; @failures; end
|
#form_signatures_queued ⇒ Object
Returns the value of attribute form_signatures_queued.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Set] signatures of every form submission queued so far; filled by
#   #queue_form and consulted by #should_skip_form_submission?
def form_signatures_queued; @form_signatures_queued; end
|
#forms_to_crawl ⇒ Object
Returns the value of attribute forms_to_crawl.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Array] form submissions still waiting to be sent; #queue_form
#   appends to it and #crawl_queued_forms pops from it
def forms_to_crawl; @forms_to_crawl; end
|
#fuzzers ⇒ Object
Returns the value of attribute fuzzers.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Array] objects whose +mutate+ turns a form into the submissions
#   that #queue_form queues; defaults to [Relevance::Tarantula::FormSubmission]
def fuzzers; @fuzzers; end
|
#handlers ⇒ Object
Returns the value of attribute handlers.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Array] objects whose +handle+ is called with every Result built by
#   #handle_link_results and #handle_form_results; defaults to [Result]
def handlers; @handlers; end
|
#links_queued ⇒ Object
Returns the value of attribute links_queued.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Set] every link queued so far; filled by #queue_link and consulted
#   by #should_skip_link? to avoid queueing a link twice
def links_queued; @links_queued; end
|
#links_to_crawl ⇒ Object
Returns the value of attribute links_to_crawl.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Array] links still waiting to be requested; #queue_link appends to
#   it and #crawl_queued_links pops from it
def links_to_crawl; @links_to_crawl; end
|
#log_grabber ⇒ Object
Returns the value of attribute log_grabber.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Object, nil] optional object whose +grab!+ is invoked by
#   #grab_log!; not assigned by the constructor
def log_grabber; @log_grabber; end
|
#max_url_length ⇒ Object
Returns the value of attribute max_url_length.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Integer] longest URL that is still crawled; #should_skip_url?
#   skips anything longer (default 1024)
def max_url_length; @max_url_length; end
|
#proxy ⇒ Object
Returns the value of attribute proxy.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Object] the object that receives the request calls issued by
#   #crawl_queued_links and #crawl_form; not assigned by the constructor
def proxy; @proxy; end
|
#referrers ⇒ Object
Returns the value of attribute referrers.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# @return [Hash] referrer of each queued item, keyed by the link (#queue_link)
#   or by the form action (#queue_form)
def referrers; @referrers; end
|
#reporters ⇒ Object
Returns the value of attribute reporters.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Array] objects receiving +report(result)+ from #save_result and
#   +finish_report(test_name)+ from #generate_reports
def reporters; @reporters; end
|
#response_code_handler ⇒ Object
Returns the value of attribute response_code_handler.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Object] target of the calls delegated by #method_missing;
#   defaults to Result
def response_code_handler; @response_code_handler; end
|
#skip_uri_patterns ⇒ Object
Returns the value of attribute skip_uri_patterns.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Array<Regexp>] patterns checked by #should_skip_url?; a URL
#   matching any of them is never crawled
def skip_uri_patterns; @skip_uri_patterns; end
|
#successes ⇒ Object
Returns the value of attribute successes.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# @return [Array] the successes list; starts out empty (nothing in the
#   methods shown on this page appends to it)
def successes; @successes; end
|
#test_name ⇒ Object
Returns the value of attribute test_name.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Object] name attached to every Result and handed to each
#   reporter's +finish_report+; not assigned by the constructor
def test_name; @test_name; end
|
#times_to_crawl ⇒ Object
Returns the value of attribute times_to_crawl.
12
13
14
|
# File 'lib/relevance/tarantula/crawler.rb', line 12
# @return [Integer] number of passes #crawl makes over the site (default 1)
def times_to_crawl; @times_to_crawl; end
|
#transform_url_patterns ⇒ Object
Returns the value of attribute transform_url_patterns.
16
17
18
|
# File 'lib/relevance/tarantula/crawler.rb', line 16
# @return [Array] transformations that #transform_url applies in order, each
#   one invoked as +pattern[url]+
def transform_url_patterns; @transform_url_patterns; end
|
Instance Method Details
#blip(number = 0) ⇒ Object
252
253
254
255
256
257
|
# File 'lib/relevance/tarantula/crawler.rb', line 252
# Prints a single-line progress indicator (rewritten in place via "\r") and
# then enforces the crawl timeout. Does nothing at all in verbose mode.
#
# @param number [Integer] index handed on to #timeout_if_too_long
# @return [nil] in verbose mode, otherwise the result of #timeout_if_too_long
# @raise [CrawlTimeout] propagated from #timeout_if_too_long
def blip(number = 0)
  return if verbose

  completed = links_completed_count
  total = total_links_count
  print "\r #{completed} of #{total} links completed "
  timeout_if_too_long(number)
end
|
#crawl(url = "/") ⇒ Object
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
# File 'lib/relevance/tarantula/crawler.rb', line 55
# Crawls the site starting at +url+, repeating the whole crawl
# @times_to_crawl times, and always finishes by reporting the results.
#
# The queue state present on entry is snapshotted so every pass after the
# first starts from the same state. A CrawlTimeout ends only the current
# pass; CTRL-C (Interrupt) ends the whole crawl. Reports are generated in
# both cases.
#
# @param url [String] where each pass starts
# @return [Object] the value of the pass loop; callers should not rely on it
def crawl(url = "/")
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_links_to_crawl = @links_to_crawl.dup
  orig_forms_to_crawl = @forms_to_crawl.dup
  @times_to_crawl.times do |num|
    queue_link url

    begin
      do_crawl num
    rescue CrawlTimeout => e
      puts e.message
    end

    puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1

    if num + 1 < @times_to_crawl
      # Restore *copies* of the snapshots. Handing out the snapshot objects
      # themselves would let the next pass mutate them, so with three or more
      # passes the later ones would find the start url already queued and
      # crawl nothing.
      @links_queued = orig_links_queued.dup
      @form_signatures_queued = orig_form_signatures_queued.dup
      @links_to_crawl = orig_links_to_crawl.dup
      @forms_to_crawl = orig_forms_to_crawl.dup
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
|
#crawl_form(form) ⇒ Object
129
130
131
132
133
134
135
136
|
# File 'lib/relevance/tarantula/crawler.rb', line 129
# Submits one form through the proxy and logs the response code.
#
# An ActiveRecord::RecordNotFound raised along the way is treated as an
# expected outcome and converted into a synthetic plain-text 404 response.
#
# @param form [Object] submission providing +method+, +action+ and +data+
# @return [Object] the proxy's response, or the synthetic 404 Response
def crawl_form(form)
  proxy.send(form.method, form.action, form.data).tap do |reply|
    log "Response #{reply.code} for #{form}"
  end
rescue ActiveRecord::RecordNotFound => missing
  log "Skipping #{form.action}, presumed ok that record is missing"
  Relevance::Tarantula::Response.new(:code => "404", :body => missing.message, :content_type => "text/plain")
end
|
#crawl_queued_forms(number = 0) ⇒ Object
138
139
140
141
142
143
144
|
# File 'lib/relevance/tarantula/crawler.rb', line 138
# Drains the form queue, most recently queued first: submits each form,
# passes the response to the handlers and updates the progress display.
#
# @param number [Integer] index forwarded to #blip
# @return [nil]
def crawl_queued_forms(number = 0)
  form = @forms_to_crawl.pop
  while form
    handle_form_results(form, crawl_form(form))
    blip(number)
    form = @forms_to_crawl.pop
  end
end
|
#crawl_queued_links(number = 0) ⇒ Object
98
99
100
101
102
103
104
105
|
# File 'lib/relevance/tarantula/crawler.rb', line 98
# Drains the link queue, most recently queued first: requests each link
# through the proxy, logs the response code, passes the response to the
# handlers and updates the progress display.
#
# @param number [Integer] index forwarded to #blip
# @return [nil]
def crawl_queued_links(number = 0)
  link = @links_to_crawl.pop
  while link
    reply = proxy.send(link.method, link.href)
    log "Response #{reply.code} for #{link}"
    handle_link_results(link, reply)
    blip(number)
    link = @links_to_crawl.pop
  end
end
|
#do_crawl(number) ⇒ Object
89
90
91
92
93
94
95
96
|
# File 'lib/relevance/tarantula/crawler.rb', line 89
# Runs one crawl pass: keeps draining the link queue and then the form queue
# until both are empty (handling a response may queue more work). A start
# and an end timestamp are recorded around every round.
#
# @param number [Integer] index forwarded to the queue-draining methods
# @return [nil]
def do_crawl(number)
  until finished?
    @crawl_start_times.push(Time.now)
    crawl_queued_links(number)
    crawl_queued_forms(number)
    @crawl_end_times.push(Time.now)
  end
end
|
#elasped_time_for_pass(num) ⇒ Object
146
147
148
|
# File 'lib/relevance/tarantula/crawler.rb', line 146
# Seconds elapsed since the start time recorded at index +num+.
#
# NOTE(review): #do_crawl appends one start time per queue-draining round,
# not per pass, so index +num+ may not be the start of pass +num+ once a
# pass needs more than one round -- confirm the intended indexing.
#
# @param num [Integer] index into crawl_start_times
# @return [Float] elapsed wall-clock seconds
def elasped_time_for_pass(num)
  now = Time.now
  now - crawl_start_times[num]
end
|
#finished? ⇒ Boolean
85
86
87
|
# File 'lib/relevance/tarantula/crawler.rb', line 85
# @return [Boolean] true once neither links nor forms are waiting to be crawled
def finished?
  [@links_to_crawl, @forms_to_crawl].all?(&:empty?)
end
|
#generate_reports ⇒ Object
222
223
224
225
226
227
228
229
230
231
232
233
234
|
# File 'lib/relevance/tarantula/crawler.rb', line 222
# Asks every reporter to finish its report. A RuntimeError from one reporter
# does not stop the others: the messages are collected and raised together,
# newline-separated, once all reporters have run.
#
# @return [nil] when no reporter failed
# @raise [RuntimeError] carrying the joined messages of all failed reporters
def generate_reports
  messages = []
  reporters.each do |reporter|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => problem
      messages << problem.message
    end
  end
  raise messages.join("\n") unless messages.empty?
end
|
#grab_log! ⇒ Object
150
151
152
|
# File 'lib/relevance/tarantula/crawler.rb', line 150
# Invokes +grab!+ on the log grabber when one is configured.
#
# @return [Object, nil] the result of +grab!+, or the falsy @log_grabber
#   itself when none is set
def grab_log!
  return @log_grabber unless @log_grabber

  @log_grabber.grab!
end
|
#handle_form_results(form, response) ⇒ Object
154
155
156
157
158
159
160
161
162
163
164
|
# File 'lib/relevance/tarantula/crawler.rb', line 154
# Wraps the response to a form submission in a frozen Result, runs it
# through every handler and hands each handler's output to the reporters.
#
# @param form [Object] submission providing +method+, +action+ and +data+
# @param response [Object] response obtained for the submission
# @return [Array] the handlers
def handle_form_results(form, response)
  # Report the referrer #queue_form recorded under the form's action, as
  # #handle_link_results does for links; when none was recorded fall back to
  # the action itself, which is what was always reported before.
  referrer = referrers[form.action] || form.action
  handlers.each do |h|
    save_result h.handle(Result.new(:method => form.method,
                                    :url => form.action,
                                    :response => response,
                                    :log => grab_log!,
                                    :referrer => referrer,
                                    :data => form.data.inspect,
                                    :test_name => test_name).freeze)
  end
end
|
#handle_link_results(link, response) ⇒ Object
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
# File 'lib/relevance/tarantula/crawler.rb', line 113
# Wraps the response to a link request in a frozen Result, runs it through
# every handler and hands each handler's output to the reporters.
#
# An error while handling is logged and the remaining handlers still run.
# Only StandardError is rescued, so Interrupt (CTRL-C) and SystemExit are no
# longer swallowed here and can reach the handling in #crawl.
#
# @param link [Object] link providing +method+ and +href+
# @param response [Object] response obtained for the link
# @return [Array] the handlers
def handle_link_results(link, response)
  handlers.each do |h|
    begin
      save_result h.handle(Result.new(:method => link.method,
                                      :url => link.href,
                                      :response => response,
                                      :log => grab_log!,
                                      :referrer => referrers[link],
                                      :test_name => test_name).freeze)
    rescue StandardError => e
      log "error handling #{link} #{e.message}"
    end
  end
end
|
#links_completed_count ⇒ Object
248
249
250
|
# File 'lib/relevance/tarantula/crawler.rb', line 248
# @return [Integer] how many queued items (links and form submissions) have
#   already been taken off the queues
def links_completed_count
  total = total_links_count
  remaining = links_remaining_count
  total - remaining
end
|
#links_remaining_count ⇒ Object
244
245
246
|
# File 'lib/relevance/tarantula/crawler.rb', line 244
# @return [Integer] number of links plus form submissions still waiting in
#   the queues
def links_remaining_count
  [@links_to_crawl, @forms_to_crawl].inject(0) { |count, queue| count + queue.size }
end
|
#queue_form(form, referrer = nil) ⇒ Object
205
206
207
208
209
210
211
212
213
214
215
216
|
# File 'lib/relevance/tarantula/crawler.rb', line 205
# Queues the submissions every fuzzer derives from +form+.
#
# Each submission gets its action run through #transform_url first. Note
# that the first submission that must be skipped ends the whole method: the
# remaining submissions and the remaining fuzzers are not looked at.
#
# @param form [Object] raw form, wrapped in Form before it is mutated
# @param referrer [Object, nil] recorded under the submission's action when given
# @return [Array, nil] the fuzzers, or nil when a submission was skipped
def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    submissions = fuzzer.mutate(Form.new(form))
    submissions.each do |submission|
      submission.action = transform_url(submission.action)
      return if should_skip_form_submission?(submission)

      @referrers[submission.action] = referrer if referrer
      @forms_to_crawl.push(submission)
      @form_signatures_queued.add(submission.signature)
    end
  end
end
|
#queue_link(dest, referrer = nil) ⇒ Object
195
196
197
198
199
200
201
202
203
|
# File 'lib/relevance/tarantula/crawler.rb', line 195
# Queues a link for crawling unless it has to be skipped.
#
# The destination is wrapped in a Link and its href run through
# #transform_url before the skip check. A queued link is added both to the
# pending list and to the set of links ever queued.
#
# @param dest [Object] destination handed to Link.new
# @param referrer [Object, nil] recorded for the link when given
# @return [Link, nil] the queued link, or nil when it was skipped
def queue_link(dest, referrer = nil)
  link = Link.new(dest)
  link.href = transform_url(link.href)
  return if should_skip_link?(link)

  @referrers[link] = referrer if referrer
  @links_to_crawl.push(link)
  @links_queued.add(link)
  link
end
|
#report_dir ⇒ Object
218
219
220
|
# File 'lib/relevance/tarantula/crawler.rb', line 218
# @return [String] path of the "tmp/tarantula" directory below rails_root
def report_dir
  File.join(rails_root, *%w[tmp tarantula])
end
|
#report_results ⇒ Object
236
237
238
|
# File 'lib/relevance/tarantula/crawler.rb', line 236
# Reports the outcome of the crawl; currently this only delegates to
# #generate_reports.
#
# @return [Object] whatever #generate_reports returns
# @raise [RuntimeError] propagated from #generate_reports
def report_results
  generate_reports()
end
|
#save_result(result) ⇒ Object
107
108
109
110
111
|
# File 'lib/relevance/tarantula/crawler.rb', line 107
# Hands one result to every configured reporter.
#
# @param result [Object] result passed to each reporter's +report+
# @return [Array] the reporters
def save_result(result)
  reporters.each { |reporter| reporter.report(result) }
end
|
#should_skip_form_submission?(fs) ⇒ Boolean
182
183
184
|
# File 'lib/relevance/tarantula/crawler.rb', line 182
# A submission is skipped when its action URL must be skipped or when a
# submission with the same signature has already been queued.
#
# @param fs [Object] submission providing +action+ and +signature+
# @return [Boolean]
def should_skip_form_submission?(fs)
  url_verdict = should_skip_url?(fs.action)
  return url_verdict if url_verdict

  @form_signatures_queued.include?(fs.signature)
end
|
#should_skip_link?(link) ⇒ Boolean
178
179
180
|
# File 'lib/relevance/tarantula/crawler.rb', line 178
# A link is skipped when its href must be skipped or when the link has
# already been queued.
#
# @param link [Object] link providing +href+
# @return [Boolean]
def should_skip_link?(link)
  url_verdict = should_skip_url?(link.href)
  return url_verdict if url_verdict

  @links_queued.include?(link)
end
|
#should_skip_url?(url) ⇒ Boolean
166
167
168
169
170
171
172
173
174
175
176
|
# File 'lib/relevance/tarantula/crawler.rb', line 166
# Decides whether a URL is off limits: blank URLs, URLs matching one of the
# skip patterns and URLs longer than max_url_length are skipped. The last
# two cases are logged.
#
# @param url [String, nil] candidate URL
# @return [true, nil] true when the URL must be skipped, nil otherwise
def should_skip_url?(url)
  return true if url.blank?

  reason =
    if @skip_uri_patterns.any? { |pattern| pattern =~ url }
      "Skipping #{url}"
    elsif url.length > max_url_length
      "Skipping long url #{url}"
    end
  return unless reason

  log reason
  true
end
|
#timeout_if_too_long(number = 0) ⇒ Object
259
260
261
262
263
|
# File 'lib/relevance/tarantula/crawler.rb', line 259
# Aborts the current crawl pass once it has been running longer than
# crawl_timeout.
#
# @param number [Integer] index handed to #elasped_time_for_pass
# @return [nil] while the timeout has not been exceeded
# @raise [CrawlTimeout] when the elapsed time exceeds crawl_timeout
def timeout_if_too_long(number = 0)
  return unless elasped_time_for_pass(number) > crawl_timeout

  raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
end
|
#total_links_count ⇒ Object
240
241
242
|
# File 'lib/relevance/tarantula/crawler.rb', line 240
# @return [Integer] number of links plus form signatures queued so far,
#   whether or not they have been crawled yet
def total_links_count
  [@links_queued, @form_signatures_queued].inject(0) { |count, seen| count + seen.size }
end
|
#transform_url(url) ⇒ Object
186
187
188
189
190
191
192
193
|
# File 'lib/relevance/tarantula/crawler.rb', line 186
# Normalizes a URL before it is queued: passes it through the decoder and
# then through every transform pattern in order, each one receiving the
# output of the previous one.
#
# @param url [String, nil] raw URL
# @return [String, nil] the transformed URL, or nil when no URL was given
def transform_url(url)
  return unless url

  @transform_url_patterns.inject(@decoder.decode(url)) do |current, pattern|
    pattern[current]
  end
end
|