Class: Anemone::Page
- Includes:
- Arachni::UI::Output
- Defined in:
- lib/anemone/page.rb
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
The raw HTTP response body of the page.
-
#code ⇒ Object
Integer response code of the page.
-
#data ⇒ Object
OpenStruct for user-stored data.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#url ⇒ Object
readonly
The URL of the page.
-
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!.
Class Method Summary collapse
Instance Method Summary collapse
- #base ⇒ Object
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
- #dir(url) ⇒ Object
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
-
#extract_domain(url) ⇒ String
Extracts the domain from a URI object.
-
#fetched? ⇒ Boolean
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
-
#html? ⇒ Boolean
Returns true if the page is an HTML document, returns false otherwise.
-
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
-
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
-
#links ⇒ Array<URI>
Array of distinct links to follow.
- #marshal_dump ⇒ Object
- #marshal_load(ary) ⇒ Object
-
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
-
#redirect? ⇒ Boolean
Returns true if the page is an HTTP redirect, returns false otherwise.
-
#run_modules ⇒ Array
Runs all Spider (path extraction) modules and returns an array of paths.
-
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
- #to_hash ⇒ Object
Methods included from Arachni::UI::Output
#buffer, #debug!, #debug?, #flush_buffer, #mute!, #muted?, #only_positives!, #only_positives?, #print_debug, #print_debug_backtrace, #print_debug_pp, #print_error, #print_error_backtrace, #print_info, #print_line, #print_ok, #print_status, #print_verbose, #reroute_to_file, #reroute_to_file?, #unmute!, #verbose!, #verbose?
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/anemone/page.rb', line 89
#
# Create a new page.
#
# @param url [URI] location of the page (stored as-is in @url)
# @param params [Hash] optional response details, read through the keys
#   :code, :headers, :aka, :referer, :depth, :redirect_to,
#   :response_time, :body and :error
def initialize(url, params = {})
  @url = url
  @data = OpenStruct.new

  @code = params[:code]
  # Work on a copy so the default content-type below is never written
  # into the hash owned by the caller.
  @headers = (params[:headers] || {}).dup
  @headers['content-type'] ||= ['']
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0
  # Resolved before @body is assigned (same ordering as before).
  @redirect_to = to_absolute(params[:redirect_to])
  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]

  # The page counts as fetched as soon as a response code is present.
  @fetched = !params[:code].nil?
end
Instance Attribute Details
#body ⇒ Object (readonly)
The raw HTTP response body of the page
64 65 66 |
# File 'lib/anemone/page.rb', line 64 def body @body end |
#code ⇒ Object
Integer response code of the page
75 76 77 |
# File 'lib/anemone/page.rb', line 75 def code @code end |
#data ⇒ Object
OpenStruct for user-stored data
73 74 75 |
# File 'lib/anemone/page.rb', line 73 def data @data end |
#depth ⇒ Object
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
80 81 82 |
# File 'lib/anemone/page.rb', line 80 def depth @depth end |
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
70 71 72 |
# File 'lib/anemone/page.rb', line 70 def error @error end |
#headers ⇒ Object (readonly)
Headers of the HTTP response
66 67 68 |
# File 'lib/anemone/page.rb', line 66 def headers @headers end |
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
68 69 70 |
# File 'lib/anemone/page.rb', line 68 def redirect_to @redirect_to end |
#referer ⇒ Object
URL of the page that brought us to this page
82 83 84 |
# File 'lib/anemone/page.rb', line 82 def referer @referer end |
#response_time ⇒ Object
Response time of the request for this page in milliseconds
84 85 86 |
# File 'lib/anemone/page.rb', line 84 def response_time @response_time end |
#url ⇒ Object (readonly)
The URL of the page
62 63 64 |
# File 'lib/anemone/page.rb', line 62 def url @url end |
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
77 78 79 |
# File 'lib/anemone/page.rb', line 77 def visited @visited end |
Class Method Details
.from_hash(hash) ⇒ Object
318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 |
# File 'lib/anemone/page.rb', line 318
#
# Rebuilds a page from a hash shaped like the one returned by #to_hash.
#
# NOTE(review): 'headers' and 'data' are restored with Marshal.load, which
# must only ever be fed data written by #to_hash -- never untrusted input.
#
# @param hash [Hash] string-keyed page attributes
# @return [Page] the reconstructed page
def self.from_hash(hash)
  page = new(URI(hash['url']))

  # #to_hash stores a missing redirect as "", so map blank values back to
  # nil instead of building an empty (but truthy) URI.
  redirect = hash['redirect_to'].to_s

  {
    '@headers'       => Marshal.load(hash['headers']),
    '@data'          => Marshal.load(hash['data']),
    '@body'          => hash['body'],
    '@links'         => hash['links'].map { |link| URI(link) },
    '@code'          => hash['code'].to_i,
    '@visited'       => hash['visited'],
    '@depth'         => hash['depth'].to_i,
    '@referer'       => hash['referer'],
    '@redirect_to'   => redirect.empty? ? nil : URI(redirect),
    '@response_time' => hash['response_time'].to_i,
    '@fetched'       => hash['fetched']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end

  page
end
Instance Method Details
#base ⇒ Object
253 254 255 256 257 258 259 260 |
# File 'lib/anemone/page.rb', line 253
#
# Looks up the first <base> element carrying an href attribute in the
# parsed document.
#
# @return [String, nil] a copy of the href value, or nil when the lookup
#   fails for any reason (no document, no such element, ...)
def base
  base_nodes = doc.search( '//base[@href]' )
  base_nodes[0]['href'].dup
rescue
  nil
end
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
201 202 203 |
# File 'lib/anemone/page.rb', line 201
#
# The content-type returned by the HTTP request for this page.
#
# @return [Object] first entry stored under the 'content-type' header
def content_type
  values = headers['content-type']
  values.first
end
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
194 195 196 |
# File 'lib/anemone/page.rb', line 194
#
# Array of cookies received with this page as WEBrick::Cookie objects.
#
# @return [Array] cookies parsed from the 'Set-Cookie' header, or an empty
#   array when the header cannot be read or parsed
def cookies
  WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie'])
rescue
  # Best effort: any failure while parsing yields "no cookies".
  []
end
#dir(url) ⇒ Object
133 134 135 |
# File 'lib/anemone/page.rb', line 133
#
# Returns the directory portion of a URL's path as a URI that always ends
# in a single slash.
#
# Follows File.dirname semantics, so "/foo/" resolves to its parent "/".
#
# @param url [#to_s] URL whose path is inspected
# @return [URI] path-only URI of the containing directory
def dir( url )
  path = URI( url.to_s ).path
  # A URL without a path component lives at the root.
  path = '/' if path.nil? || path.empty?

  parent = File.dirname( path )
  # File.dirname already returns "/" for top-level paths; appending another
  # slash would produce "//", which parses as an empty authority.
  parent += '/' unless parent.end_with?( '/' )

  URI( parent )
end
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
178 179 180 181 |
# File 'lib/anemone/page.rb', line 178
#
# Delete the Nokogiri document and response body to conserve memory.
#
# @return [nil]
def discard_doc!
  # Links are extracted first, while the document is still available.
  links
  @body = nil
  @doc = nil
end
#doc ⇒ Object
Nokogiri document for the HTML body
167 168 169 170 171 172 173 |
# File 'lib/anemone/page.rb', line 167
#
# Nokogiri document for the HTML body.
#
# @return [Object, nil] the memoized parsed document; nil when the content
#   type is a String that does not contain 'text', when there is no body,
#   or when parsing raises
def doc
  type = Arachni::HTTP.content_type( @headers )
  non_text = type.is_a?( String ) && !type.substring?( 'text' )
  return nil if non_text

  return @doc if @doc
  return nil unless @body

  begin
    @doc = Nokogiri::HTML( @body )
  rescue
    nil
  end
end
#extract_domain(url) ⇒ String
Extracts the domain from a URI object
283 284 285 286 287 288 289 290 291 292 |
# File 'lib/anemone/page.rb', line 283
#
# Extracts the domain from a URI object.
#
# The domain is taken to be the last two dot-separated labels of the host.
# Single-label hosts (e.g. "localhost") and dotted IPv4 addresses are
# returned whole so that unrelated hosts never compare as equal.
# Multi-part public suffixes such as "co.uk" are not recognised.
#
# @param url [URI] URI whose host is inspected
# @return [String, false] the domain, or false when the URI has no host
def extract_domain( url )
  host = url.host
  return false if host.nil? || host.empty?

  # Truncating an IP address to two octets would make distinct machines
  # look like the same domain.
  return host if host =~ /\A\d{1,3}(?:\.\d{1,3}){3}\z/

  labels = host.split( '.' )
  return host if labels.length < 2

  labels.last( 2 ).join( '.' )
end
#fetched? ⇒ Boolean
Was the page successfully fetched? true if the page was fetched with no error, false otherwise.
187 188 189 |
# File 'lib/anemone/page.rb', line 187
#
# Was the page successfully fetched?
#
# @return [Boolean] the stored @fetched flag
def fetched?
  @fetched
end
#html? ⇒ Boolean
Returns true if the page is an HTML document, returns false otherwise.
209 210 211 |
# File 'lib/anemone/page.rb', line 209
#
# Returns true if the page is an HTML document, returns false otherwise.
#
# @return [Boolean] whether the content type starts with text/html or
#   application/xhtml+xml
def html?
  # The "+" must be escaped; unescaped it is a quantifier and the literal
  # "application/xhtml+xml" would never match.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
The added code enables optional subdomain crawling.
268 269 270 271 272 273 274 |
# File 'lib/anemone/page.rb', line 268
#
# Returns true if uri is in the same domain as the page, returns false
# otherwise.
#
# When the follow_subdomains option is set the comparison uses
# #extract_domain on both sides; otherwise the hosts must match exactly.
#
# @param uri [URI] candidate URI
# @return [Boolean]
def in_domain?( uri )
  return uri.host == @url.host unless Arachni::Options.instance.follow_subdomains

  extract_domain( uri ) == extract_domain( @url )
end
#links ⇒ Array<URI>
Array of distinct links to follow
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
# File 'lib/anemone/page.rb', line 142
#
# Array of distinct links to follow.
#
# The result is computed once and memoized in @links. Paths reported by
# #run_modules are made absolute and kept only when they pass #in_domain?;
# paths that cannot be converted are skipped.
#
# @return [Array<URI>]
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  run_modules.each do |path|
    next if path.nil? || path.empty?

    begin
      absolute = to_absolute( URI( path ) )
    rescue
      next
    end

    # Forcing a directory listing for every link is intentionally disabled.
    @links << absolute if in_domain?( absolute )
  end

  @links.uniq!
  @links
end
#marshal_dump ⇒ Object
295 296 297 |
# File 'lib/anemone/page.rb', line 295
#
# State handed to Marshal when the page is dumped.
#
# @return [Array] the twelve persisted instance variables, in the order
#   expected by #marshal_load
def marshal_dump
  [
    @url, @headers, @data, @body,
    @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched
  ]
end
#marshal_load(ary) ⇒ Object
299 300 301 |
# File 'lib/anemone/page.rb', line 299
#
# Restores the instance variables from the array built by #marshal_dump.
#
# @param ary [Array] values in #marshal_dump order
def marshal_load(ary)
  @url, @headers, @data, @body,
    @links, @code, @visited, @depth,
    @referer, @redirect_to, @response_time, @fetched = ary
end
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
225 226 227 |
# File 'lib/anemone/page.rb', line 225
#
# Returns true if the page was not found (returned 404 code), returns
# false otherwise.
#
# @return [Boolean]
def not_found?
  missing_code = 404
  missing_code == @code
end
#redirect? ⇒ Boolean
Returns true if the page is an HTTP redirect, returns false otherwise.
217 218 219 |
# File 'lib/anemone/page.rb', line 217
#
# Returns true if the page is an HTTP redirect, returns false otherwise.
#
# Covers status codes 300 through 308; 308 (Permanent Redirect) was
# previously left out of the range.
#
# @return [Boolean]
def redirect?
  (300..308).include?(@code)
end
#run_modules ⇒ Array
Runs all Spider (path extraction) modules and returns an array of paths
112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/anemone/page.rb', line 112
#
# Runs all Spider (path extraction) modules and returns an array of paths.
#
# The component manager is created once and cached in @@manager. Failures
# inside the extractors are reported through print_error /
# print_debug_backtrace and produce an empty result, so callers can always
# iterate over the return value.
#
# @return [Array] flattened, de-duplicated extractor output; [] on failure
def run_modules
  opts = Arachni::Options.instance
  # Lazy-loaded on purpose: the library path is only known from the options.
  require opts.dir['lib'] + 'component_manager'

  lib = opts.dir['root'] + 'path_extractors/'

  begin
    @@manager ||= ::Arachni::ComponentManager.new( lib, Extractors )

    @@manager.available.map { |name| @@manager[name].new.run( doc ) }.flatten.uniq
  rescue ::StandardError, ::ScriptError => e
    # Deliberately narrower than ::Exception so that interrupts and exit
    # requests are no longer swallowed; load/syntax errors raised while
    # loading extractors are still treated as best-effort failures.
    print_error( e.to_s )
    print_debug_backtrace( e )
    []
  end
end
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
# File 'lib/anemone/page.rb', line 233
#
# Converts relative URL link into an absolute URL based on the location of
# the page.
#
# The fragment is dropped, the rest is escaped, and the result is resolved
# against the document's <base href> when #base reports one (itself
# resolved against the page URL, so relative base hrefs work), otherwise
# against the page URL.
#
# @param link [#to_s, nil] link to resolve
# @return [URI, nil] absolute URI with a non-empty path; nil when link is nil
def to_absolute(link)
  return nil if link.nil?

  # URI.encode no longer exists on Ruby 3.0+; the RFC 2396 parser provides
  # the same escaping on every Ruby version.
  escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER

  # Strip the whole fragment; leaving any of it behind would get the "#"
  # escaped into the path as "%23".
  link = escaper.escape(link.to_s.sub(/#.*\z/m, ''))

  base_href = base
  base_url = base_href ? @url.merge(URI(base_href)) : @url.dup

  absolute = base_url.merge(URI(link))
  absolute.path = '/' if absolute.path.empty?
  absolute
end
#to_hash ⇒ Object
303 304 305 306 307 308 309 310 311 312 313 314 315 316 |
# File 'lib/anemone/page.rb', line 303
#
# Serializes the page into a string-keyed hash.
#
# Headers and user data are stored as Marshal dumps; URLs and links are
# stored as strings.
#
# @return [Hash]
def to_hash
  serialized = {}
  serialized['url']           = @url.to_s
  serialized['headers']       = Marshal.dump(@headers)
  serialized['data']          = Marshal.dump(@data)
  serialized['body']          = @body
  serialized['links']         = links.map { |link| link.to_s }
  serialized['code']          = @code
  serialized['visited']       = @visited
  serialized['depth']         = @depth
  serialized['referer']       = @referer.to_s
  serialized['redirect_to']   = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched']       = @fetched
  serialized
end