Class: Arachnid

Inherits:
Object
Defined in:
lib/arachnid.rb

Instance Method Summary

Constructor Details

#initialize(url, options = {}) ⇒ Arachnid

Returns a new instance of Arachnid.

# File 'lib/arachnid.rb', line 11

def initialize(url, options = {})
	@start_url = url
	@domain = parse_domain(url)

	# All options default to false when not supplied
	@split_url_at_hash = options[:split_url_at_hash] || false
	@exclude_urls_with_hash = options[:exclude_urls_with_hash] || false
	@exclude_urls_with_extensions = options[:exclude_urls_with_extensions] || false

	@debug = options[:debug] || false
end
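
Example (a minimal construction sketch; the URL and option values are illustrative, not defaults):

	require 'arachnid'

	# Skip URLs containing '#' and URLs ending in the listed extensions
	spider = Arachnid.new("http://www.example.com", {
		:exclude_urls_with_hash => true,
		:exclude_urls_with_extensions => ['.pdf', '.jpg', '.zip'],
		:debug => true
	})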

Instance Method Details

#crawl(options = {}) ⇒ Object

# File 'lib/arachnid.rb', line 22

def crawl(options = {})

	# Defaults to 1 thread so people don't do a stupid amount of crawling on unsuspecting domains
	threads = options[:threads] || 1
	# Defaults to nil so the crawl keeps running until it runs out of URLs
	max_urls = options[:max_urls]

	@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
	@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
	@global_queue = []

	@global_queue << @start_url

	while @global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls)
		# Work on a copy: deleting from @global_queue while iterating it directly would skip entries
		temp_queue = @global_queue.dup

		temp_queue.each do |q|

			begin
				request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true)

				request.on_complete do |response|

					yield response

					links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')

					links.each do |link|
						next unless internal_link?(link, response.effective_url)
						next if @global_visited.include?(make_absolute(link, response.effective_url))
						next unless no_hash_in_url?(link) && ignore_extensions(link)

						sanitized_link = sanitize_link(split_url_at_hash(link))
						next unless sanitized_link

						absolute_link = make_absolute(sanitized_link, response.effective_url)
						@global_queue << absolute_link if absolute_link
					end

				end

				@hydra.queue request

			rescue URI::InvalidURIError, NoMethodError => e
				puts "Exception caught: #{e}" if @debug
			end

			@global_visited.insert(q)
			@global_queue.delete(q)

		end

		@hydra.run

	end

end
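
Example (a usage sketch: #crawl yields each Typhoeus response to the block as pages complete; the thread count, URL cap, and handler are illustrative only):

	spider = Arachnid.new("http://www.example.com")

	spider.crawl(:threads => 4, :max_urls => 100) do |response|
		puts "#{response.code} #{response.effective_url}"
	end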

#ignore_extensions(url) ⇒ Object

# File 'lib/arachnid.rb', line 128

def ignore_extensions(url)
	# Returns true when the URL should be kept, i.e. it does NOT end in an excluded extension
	return true if url.to_s.length == 0
	return true unless @exclude_urls_with_extensions

	not_found = true

	@exclude_urls_with_extensions.each do |e|
		if url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase
			not_found = false
			puts "#{e} Found At URL: #{url}" if @debug
		end
	end

	not_found
end
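
Example (assuming the instance was built with :exclude_urls_with_extensions => ['.pdf']; URLs are hypothetical):

	spider.ignore_extensions("http://www.example.com/report.pdf")  # => false (excluded)
	spider.ignore_extensions("http://www.example.com/report.html") # => true (kept)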

#internal_link?(url, effective_url) ⇒ Boolean

Returns:

  • (Boolean)

# File 'lib/arachnid.rb', line 99

def internal_link?(url, effective_url)

	absolute_url = make_absolute(url, effective_url)

	# Internal when the link resolves to the same domain as the start URL
	@domain == parse_domain(absolute_url)
end
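
Example (assuming the spider was started at http://www.example.com; URLs are hypothetical):

	spider.internal_link?("/about", "http://www.example.com/index.html")        # => true
	spider.internal_link?("http://other.example.org/", "http://www.example.com/") # => false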

#make_absolute(href, root) ⇒ Object

# File 'lib/arachnid.rb', line 152

def make_absolute(href, root)
	begin
		URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
	rescue URI::InvalidURIError, URI::InvalidComponentError
		false
	end
end
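
Example (hypothetical inputs: whitespace in the href is escaped, then merged onto the page URL):

	spider.make_absolute("/about team", "http://www.example.com/index.html")
	# => "http://www.example.com/about%20team"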

#no_hash_in_url?(url) ⇒ Boolean

Returns:

  • (Boolean)

# File 'lib/arachnid.rb', line 118

def no_hash_in_url?(url)
	return true unless @exclude_urls_with_hash

	!url.to_s.include?('#')
end
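
Example (assuming :exclude_urls_with_hash was set; URLs are hypothetical):

	spider.no_hash_in_url?("http://www.example.com/page")          # => true
	spider.no_hash_in_url?("http://www.example.com/page#comments") # => false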

#parse_domain(url) ⇒ Object

# File 'lib/arachnid.rb', line 82

def parse_domain(url)
	puts "Parsing URL: #{url}" if @debug

	begin
		parsed_domain = Domainatrix.parse(url)

		if(parsed_domain.subdomain != "")
			parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
		else
			parsed_domain.domain + '.' + parsed_domain.public_suffix
		end
	rescue NoMethodError, Addressable::URI::InvalidURIError => e
		puts "URL Parsing Exception (#{url}): #{e}"
		return nil
	end
end
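
Example (hypothetical URLs; any subdomain is kept as part of the domain):

	spider.parse_domain("http://blog.example.co.uk/post/1") # => "blog.example.co.uk"
	spider.parse_domain("http://example.com/about")         # => "example.com"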

#sanitize_link(url) ⇒ Object

# File 'lib/arachnid.rb', line 144

def sanitize_link(url)
	begin
		return url.gsub(/\s+/, "%20")
	rescue
		return false
	end
end
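
Example (hypothetical input: whitespace is percent-escaped so the URL parses cleanly later):

	spider.sanitize_link("/contact us") # => "/contact%20us"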

#split_url_at_hash(url) ⇒ Object

# File 'lib/arachnid.rb', line 111

def split_url_at_hash(url)
	return url.to_s unless @split_url_at_hash

	url.to_s.split('#')[0]
end
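
Example (assuming :split_url_at_hash was set; the URL is hypothetical):

	spider.split_url_at_hash("http://www.example.com/page#section-2")
	# => "http://www.example.com/page"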