Class: Relevance::Tarantula::Crawler
Defined Under Namespace
Classes: CrawlTimeout
Instance Attribute Summary
Instance Method Summary
Methods included from Relevance::Tarantula: #log, #rails_root, #tarantula_home, #verbose
Constructor Details
#initialize ⇒ Crawler
Returns a new instance of Crawler.
# File 'lib/relevance/tarantula/crawler.rb', line 18

def initialize
  @max_url_length = 1024
  @successes = []
  @failures = []
  @handlers = [@response_code_handler = Result]
  @links_queued = Set.new
  @form_signatures_queued = Set.new
  @crawl_queue = []
  @crawl_start_times, @crawl_end_times = [], []
  @crawl_timeout = 20.minutes
  @referrers = {}
  @skip_uri_patterns = [
    /^javascript/,
    /^mailto/,
    /^http/,
  ]
  self.transform_url_patterns = [
    [/#.*$/, '']
  ]
  @reporters = [Relevance::Tarantula::IOReporter.new($stderr)]
  @decoder = HTMLEntities.new
  @times_to_crawl = 1
  @fuzzers = [Relevance::Tarantula::FormSubmission]
  @stdout_tty = $stdout.tty?
end
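The defaults are conservative: a single crawl pass, a 20-minute timeout per pass, URLs longer than 1024 characters skipped, and javascript:, mailto: and absolute http links never followed. These are plain attributes, so a test can adjust them before crawling. A minimal configuration sketch (the specific pattern and values are illustrative, and crawling still requires proxy to be set, e.g. to a Rails integration-test session):

crawler = Relevance::Tarantula::Crawler.new
crawler.max_url_length = 512            # skip unusually long URLs
crawler.skip_uri_patterns << /logout/   # illustrative: never follow logout links
crawler.times_to_crawl = 2              # re-crawl once with restored queues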
Dynamic Method Handling
This class handles dynamic methods through the method_missing method.
#method_missing(meth, *args) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 45

def method_missing(meth, *args)
  super unless Result::ALLOW_NNN_FOR =~ meth.to_s
  @response_code_handler.send(meth, *args)
end
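Method names matching Result::ALLOW_NNN_FOR are forwarded to the response_code_handler (Result by default); anything else falls through to super and raises NoMethodError as usual. A sketch of the delegation, assuming the handler provides allow_404_for-style helpers (the kind matched by ALLOW_NNN_FOR); the logout path is illustrative:

crawler = Relevance::Tarantula::Crawler.new
crawler.allow_404_for %r{/logout}   # forwarded to Result via method_missing
# crawler.frobnicate                # would not match ALLOW_NNN_FOR and hits super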
Instance Attribute Details
#crawl_end_times ⇒ Object
Returns the value of attribute crawl_end_times.
# File 'lib/relevance/tarantula/crawler.rb', line 16

def crawl_end_times
  @crawl_end_times
end
#crawl_queue ⇒ Object
Returns the value of attribute crawl_queue.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def crawl_queue
  @crawl_queue
end
#crawl_start_times ⇒ Object
Returns the value of attribute crawl_start_times.
# File 'lib/relevance/tarantula/crawler.rb', line 16

def crawl_start_times
  @crawl_start_times
end
#crawl_timeout ⇒ Object
Returns the value of attribute crawl_timeout.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def crawl_timeout
  @crawl_timeout
end
#failures ⇒ Object
Returns the value of attribute failures.

# File 'lib/relevance/tarantula/crawler.rb', line 16

def failures
  @failures
end
#form_signatures_queued ⇒ Object
Returns the value of attribute form_signatures_queued.

# File 'lib/relevance/tarantula/crawler.rb', line 12

def form_signatures_queued
  @form_signatures_queued
end
#fuzzers ⇒ Object
Returns the value of attribute fuzzers.

# File 'lib/relevance/tarantula/crawler.rb', line 12

def fuzzers
  @fuzzers
end
#handlers ⇒ Object
Returns the value of attribute handlers.

# File 'lib/relevance/tarantula/crawler.rb', line 12

def handlers
  @handlers
end
#links_queued ⇒ Object
Returns the value of attribute links_queued.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def links_queued
  @links_queued
end
#log_grabber ⇒ Object
Returns the value of attribute log_grabber.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def log_grabber
  @log_grabber
end
#max_url_length ⇒ Object
Returns the value of attribute max_url_length.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def max_url_length
  @max_url_length
end
#proxy ⇒ Object
Returns the value of attribute proxy.

# File 'lib/relevance/tarantula/crawler.rb', line 12

def proxy
  @proxy
end
#referrers ⇒ Object
Returns the value of attribute referrers.
# File 'lib/relevance/tarantula/crawler.rb', line 16

def referrers
  @referrers
end
#reporters ⇒ Object
Returns the value of attribute reporters.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def reporters
  @reporters
end
#response_code_handler ⇒ Object
Returns the value of attribute response_code_handler.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def response_code_handler
  @response_code_handler
end
#skip_uri_patterns ⇒ Object
Returns the value of attribute skip_uri_patterns.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def skip_uri_patterns
  @skip_uri_patterns
end
#successes ⇒ Object
Returns the value of attribute successes.
# File 'lib/relevance/tarantula/crawler.rb', line 16

def successes
  @successes
end
#test_name ⇒ Object
Returns the value of attribute test_name.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def test_name
  @test_name
end
#times_to_crawl ⇒ Object
Returns the value of attribute times_to_crawl.
# File 'lib/relevance/tarantula/crawler.rb', line 12

def times_to_crawl
  @times_to_crawl
end
#transform_url_patterns ⇒ Object
Returns the value of attribute transform_url_patterns.

# File 'lib/relevance/tarantula/crawler.rb', line 16

def transform_url_patterns
  @transform_url_patterns
end
Instance Method Details
#blip(number = 0) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 242

def blip(number = 0)
  unless verbose
    print "\r #{links_completed_count} of #{total_links_count} links completed " if @stdout_tty
    timeout_if_too_long(number)
  end
end
#crawl(url = "/") ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 56

def crawl(url = "/")
  orig_links_queued = @links_queued.dup
  orig_form_signatures_queued = @form_signatures_queued.dup
  orig_crawl_queue = @crawl_queue.dup
  @times_to_crawl.times do |num|
    queue_link url
    begin
      do_crawl num
    rescue CrawlTimeout => e
      puts
      puts e.message
    end
    puts "#{(num+1).ordinalize} crawl" if @times_to_crawl > 1
    if num + 1 < @times_to_crawl
      @links_queued = orig_links_queued
      @form_signatures_queued = orig_form_signatures_queued
      @crawl_queue = orig_crawl_queue
      @referrers = {}
    end
  end
rescue Interrupt
  $stderr.puts "CTRL-C"
ensure
  report_results
end
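#crawl is the public entry point: it seeds the queue with the starting URL, runs up to times_to_crawl passes (restoring the original queues between passes), turns a CrawlTimeout into a printed message rather than a failure of the whole run, and always calls report_results, even after CTRL-C. A typical invocation sketch, assuming proxy is already wired to an HTTP driver such as a Rails integration-test session:

crawler = Relevance::Tarantula::Crawler.new
crawler.crawl_timeout = 5.minutes   # each pass raises CrawlTimeout past this
crawler.crawl("/login")             # defaults to "/" when no URL is given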
#crawl_the_queue(number = 0) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 97

def crawl_the_queue(number = 0)
  while (request = @crawl_queue.pop)
    request.crawl
    blip(number)
  end
end
#do_crawl(number) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 89

def do_crawl(number)
  while (!finished?)
    @crawl_start_times << Time.now
    crawl_the_queue(number)
    @crawl_end_times << Time.now
  end
end
#elasped_time_for_pass(num) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 129

def elasped_time_for_pass(num)
  Time.now - crawl_start_times[num]
end
#finished? ⇒ Boolean
# File 'lib/relevance/tarantula/crawler.rb', line 85

def finished?
  @crawl_queue.empty?
end
#follow(method, url, data = nil) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 121

def follow(method, url, data=nil)
  proxy.send(method, url, data)
end
#generate_reports ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 211

def generate_reports
  errors = []
  reporters.each do |reporter|
    begin
      reporter.finish_report(test_name)
    rescue RuntimeError => e
      errors << e
    end
  end
  unless errors.empty?
    raise errors.map(&:message).join("\n")
  end
end
#grab_log! ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 133

def grab_log!
  @log_grabber && @log_grabber.grab!
end
#handle_form_results(form, response) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 145

def handle_form_results(form, response)
  handlers.each do |h|
    save_result h.handle(Result.new(:method => form.method,
                                    :url => form.action,
                                    :response => response,
                                    :log => grab_log!,
                                    :referrer => form.action,
                                    :data => form.data.inspect,
                                    :test_name => test_name).freeze)
  end
end
#handle_link_results(link, result) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 110

def handle_link_results(link, result)
  handlers.each do |h|
    begin
      save_result h.handle(result)
    rescue Exception => e
      log "error handling #{link} #{e.message}"
    end
  end
end
#links_completed_count ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 238

def links_completed_count
  total_links_count - links_remaining_count
end
#links_remaining_count ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 234

def links_remaining_count
  @crawl_queue.size
end
#make_result(options) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 137

def make_result(options)
  defaults = {
    :log => grab_log!,
    :test_name => test_name
  }
  Result.new(defaults.merge(options)).freeze
end
#queue_form(form, referrer = nil) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 194

def queue_form(form, referrer = nil)
  fuzzers.each do |fuzzer|
    fuzzer.mutate(Form.new(form, self, referrer)).each do |fs|
      fs.action = transform_url(fs.action)
      return if should_skip_form_submission?(fs)
      @referrers[fs.action] = referrer if referrer
      @crawl_queue << fs
      @form_signatures_queued << fs.signature
    end
  end
end
#queue_link(dest, referrer = nil) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 186

def queue_link(dest, referrer = nil)
  dest = Link.new(dest, self, referrer)
  return if should_skip_link?(dest)
  @crawl_queue << dest
  @links_queued << dest
  dest
end
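Besides being called by #crawl for the starting URL, #queue_link can be used to seed extra entry points that are not discoverable by following links from the start page; queued links are recorded in links_queued so later duplicates can be filtered by #should_skip_link?. A sketch with an illustrative path:

crawler = Relevance::Tarantula::Crawler.new
crawler.queue_link("/hidden/report")   # returns the queued Link, or nil if skipped
crawler.crawl("/")                     # crawls "/" plus the manually queued entry point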
#report_dir ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 207

def report_dir
  File.join(rails_root, "tmp", "tarantula")
end
#report_results ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 225

def report_results
  puts "Crawled #{total_links_count} links and forms."
  generate_reports
end
#save_result(result) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 104

def save_result(result)
  reporters.each do |reporter|
    reporter.report(result)
  end
end
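Taken together, #save_result and #generate_reports define the reporter contract: every object in reporters must respond to report(result) for each crawled result and to finish_report(test_name) at the end of the run. A minimal custom reporter sketch built on that contract; the class name and counting logic are illustrative, not part of the gem:

class FailureCounter
  def initialize
    @failures = 0
  end

  # Called once per crawled link/form result (see #save_result).
  def report(result)
    @failures += 1 unless result.success   # assumes Result exposes #success
  end

  # Called once at the end of the crawl (see #generate_reports).
  def finish_report(test_name)
    puts "#{test_name}: #{@failures} failing requests"
  end
end

crawler = Relevance::Tarantula::Crawler.new
crawler.reporters << FailureCounter.new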
#should_skip_form_submission?(fs) ⇒ Boolean

# File 'lib/relevance/tarantula/crawler.rb', line 173

def should_skip_form_submission?(fs)
  should_skip_url?(fs.action) || @form_signatures_queued.member?(fs.signature)
end
#should_skip_link?(link) ⇒ Boolean
# File 'lib/relevance/tarantula/crawler.rb', line 169

def should_skip_link?(link)
  should_skip_url?(link.href) || @links_queued.member?(link)
end
#should_skip_url?(url) ⇒ Boolean
# File 'lib/relevance/tarantula/crawler.rb', line 157

def should_skip_url?(url)
  return true if url.blank?
  if @skip_uri_patterns.any? {|pattern| pattern =~ url}
    log "Skipping #{url}"
    return true
  end
  if url.length > max_url_length
    log "Skipping long url #{url}"
    return true
  end
end
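A URL is skipped when it is blank, matches any pattern in skip_uri_patterns (by default javascript:, mailto: and absolute http(s) links), or is longer than max_url_length. Extra patterns can be appended before crawling; the admin pattern below is illustrative:

crawler = Relevance::Tarantula::Crawler.new
crawler.skip_uri_patterns << %r{^/admin}   # keep the crawler out of admin pages
crawler.should_skip_url?("/admin/users")   # => true
crawler.should_skip_url?("/posts")         # => nil (falsy, so not skipped)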
#submit(method, action, data) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 125

def submit(method, action, data)
  proxy.send(method, action, data)
end
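#follow and #submit both delegate straight to proxy, so the crawler drives whatever HTTP client it is given. A sketch, assuming proxy has been assigned an object with Rails-integration-style get/post methods (the session variable here is hypothetical):

crawler = Relevance::Tarantula::Crawler.new
crawler.proxy = session                           # e.g. a Rails integration-test session
crawler.follow(:get, "/posts")                    # calls proxy.get("/posts", nil)
crawler.submit(:post, "/posts", :title => "hi")   # calls proxy.post("/posts", {:title => "hi"})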
#timeout_if_too_long(number = 0) ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 249

def timeout_if_too_long(number = 0)
  if elasped_time_for_pass(number) > crawl_timeout
    raise CrawlTimeout, "Exceeded crawl timeout of #{crawl_timeout} seconds - skipping to the next crawl..."
  end
end
#total_links_count ⇒ Object
# File 'lib/relevance/tarantula/crawler.rb', line 230

def total_links_count
  @links_queued.size + @form_signatures_queued.size
end
#transform_url(url) ⇒ Object

# File 'lib/relevance/tarantula/crawler.rb', line 177

def transform_url(url)
  return unless url
  url = @decoder.decode(url)
  @transform_url_patterns.each do |pattern|
    url = pattern[url]
  end
  url
end
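#transform_url first HTML-decodes the URL and then applies each entry of transform_url_patterns in order; the default set (installed in the constructor) only strips fragment identifiers. The transform_url_patterns= setter accepts [regexp, replacement] pairs, so additional rewrites can be configured the same way, assuming the setter wraps each pair so that applying it rewrites matching text (as the default fragment rule does); the cache-busting pattern below is illustrative:

crawler = Relevance::Tarantula::Crawler.new
crawler.transform_url_patterns = [
  [/#.*$/, ''],              # keep the default: drop fragments
  [/\?cachebust=\d+/, '']    # illustrative: drop a cache-busting parameter
]
crawler.transform_url("/posts?cachebust=123#comments")   # => "/posts"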