Class: Medusa::Page
- Inherits:
-
Object
- Object
- Medusa::Page
- Defined in:
- lib/medusa/page.rb
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
The raw HTTP response body of the page.
-
#code ⇒ Object
Integer response code of the page.
-
#data ⇒ Object
OpenStruct for user-stored data.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#url ⇒ Object
readonly
The URL of the page.
Class Method Summary collapse
Instance Method Summary collapse
-
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE.
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
-
#fetched? ⇒ Boolean
Was the page successfully fetched?
true
if the page was fetched with no error, false
otherwise. -
#html? ⇒ Boolean
Returns
true
if the page is a HTML document, returns false
otherwise. -
#in_domain?(uri) ⇒ Boolean
Returns
true
if uri is in the same domain as the page, returns false
otherwise. -
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
-
#links ⇒ Object
Array of distinct A tag HREFs from the page.
- #marshal_dump ⇒ Object
- #marshal_load(ary) ⇒ Object
-
#not_found? ⇒ Boolean
Returns
true
if the page was not found (returned 404 code), returns false
otherwise. -
#redirect? ⇒ Boolean
Returns
true
if the page is a HTTP redirect, returns false
otherwise. -
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
- #to_hash ⇒ Object
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# File 'lib/medusa/page.rb', line 36
#
# Create a new page.
#
# @param url [Object] stored as @url; other methods call #host and #merge on
#   it, so presumably a URI (TODO confirm against callers)
# @param params [Hash] optional values read by key: :code, :headers, :aka,
#   :referer, :depth, :redirect_to, :response_time, :body, :error
def initialize(url, params = {})
  @url = url
  @data = OpenStruct.new

  # Lazily populated state starts out empty.
  @links = nil
  @body = nil
  @doc = nil
  @base = nil

  @code = params[:code]
  @headers = params[:headers] || {}
  # Guarantee a content-type entry so html? always has a string to match.
  @headers['content-type'] ||= ''
  @aliases = Array(params[:aka]).compact
  @referer = params[:referer]
  @depth = params[:depth] || 0

  # Keep this before @body is assigned: with @body still nil, doc (and so
  # base) yields nil and the redirect target is resolved against @url.
  @redirect_to = to_absolute(params[:redirect_to])

  @response_time = params[:response_time]
  @body = params[:body]
  @error = params[:error]
  # A page counts as fetched as soon as a response code was supplied.
  @fetched = !params[:code].nil?
end
Instance Attribute Details
#body ⇒ Object (readonly)
The raw HTTP response body of the page
13 14 15 |
# File 'lib/medusa/page.rb', line 13
#
# The raw HTTP response body of the page.
#
# @return [Object] the current value of @body (nil once it has been discarded)
def body
  @body
end
#code ⇒ Object
Integer response code of the page
24 25 26 |
# File 'lib/medusa/page.rb', line 24
#
# Integer response code of the page.
#
# @return [Object] the current value of @code
def code
  @code
end
#data ⇒ Object
OpenStruct for user-stored data
22 23 24 |
# File 'lib/medusa/page.rb', line 22
#
# OpenStruct for user-stored data.
#
# @return [Object] the current value of @data
def data
  @data
end
#depth ⇒ Object
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
27 28 29 |
# File 'lib/medusa/page.rb', line 27
#
# Depth of this page from the root of the crawl. This is not necessarily the
# shortest path; use PageStore#shortest_paths! to find that value.
#
# @return [Object] the current value of @depth
def depth
  @depth
end
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
19 20 21 |
# File 'lib/medusa/page.rb', line 19
#
# Exception object, if one was raised during HTTP#fetch_page.
#
# @return [Object] the current value of @error
def error
  @error
end
#headers ⇒ Object (readonly)
Headers of the HTTP response
15 16 17 |
# File 'lib/medusa/page.rb', line 15
#
# Headers of the HTTP response.
#
# @return [Object] the current value of @headers
def headers
  @headers
end
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
17 18 19 |
# File 'lib/medusa/page.rb', line 17
#
# URL of the page this one redirected to, if any.
#
# @return [Object] the current value of @redirect_to
def redirect_to
  @redirect_to
end
#referer ⇒ Object
URL of the page that brought us to this page
29 30 31 |
# File 'lib/medusa/page.rb', line 29
#
# URL of the page that brought us to this page.
#
# @return [Object] the current value of @referer
def referer
  @referer
end
#response_time ⇒ Object
Response time of the request for this page in milliseconds
31 32 33 |
# File 'lib/medusa/page.rb', line 31
#
# Response time of the request for this page in milliseconds.
#
# @return [Object] the current value of @response_time
def response_time
  @response_time
end
#url ⇒ Object (readonly)
The URL of the page
11 12 13 |
# File 'lib/medusa/page.rb', line 11
#
# The URL of the page.
#
# @return [Object] the current value of @url
def url
  @url
end
Class Method Details
.from_hash(hash) ⇒ Object
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
# File 'lib/medusa/page.rb', line 207
#
# Rebuild a page from the string-keyed Hash layout produced by #to_hash.
#
# A record without a 'links' entry yields an empty link list instead of
# raising NoMethodError on nil.
#
# NOTE(review): 'headers' and 'data' are restored with Marshal.load, which can
# instantiate arbitrary objects. Only feed this method records that this
# application wrote itself.
#
# @param hash [Hash] stored record with the keys 'url', 'headers', 'data',
#   'body', 'links', 'code', 'visited', 'depth', 'referer', 'redirect_to',
#   'response_time' and 'fetched'
# @return [Object] the instance created by +new+, with its state restored
def self.from_hash(hash)
  page = self.new(URI(hash['url']))
  redirect_target = hash['redirect_to']

  {
    '@headers' => Marshal.load(hash['headers']),
    '@data' => Marshal.load(hash['data']),
    '@body' => hash['body'],
    # Array() tolerates a missing (nil) 'links' entry.
    '@links' => Array(hash['links']).map { |link| URI(link) },
    '@code' => hash['code'].to_i,
    '@visited' => hash['visited'],
    '@depth' => hash['depth'].to_i,
    '@referer' => hash['referer'],
    # to_hash stores a missing redirect as '', so treat empty like absent.
    '@redirect_to' => (redirect_target && !redirect_target.empty?) ? URI(redirect_target) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched' => hash['fetched']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end

  page
end
Instance Method Details
#base ⇒ Object
Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE
144 145 146 147 148 149 150 151 152 |
# File 'lib/medusa/page.rb', line 144
#
# Base URI from the HTML doc head element
# www.w3.org/TR/html4/struct/links.html#edef-BASE
#
# The value is cached in @base once a truthy one has been computed.
#
# @return [Object, nil] the URI built from the <base href>, or nil when there
#   is no document, the href cannot be turned into a URI, or it is empty
def base
  unless @base
    document = doc
    @base =
      if document
        href = document.search('//head/base/@href')
        begin
          URI(href.to_s) unless href.nil?
        rescue StandardError
          # An unparsable href is treated the same as a missing one.
          nil
        end
      end
  end

  return nil if @base && @base.to_s.empty?

  @base
end
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
112 113 114 |
# File 'lib/medusa/page.rb', line 112
#
# The content-type returned by the HTTP request for this page.
#
# @return [Object] the 'content-type' entry of #headers
def content_type
  headers['content-type']
end
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
105 106 107 |
# File 'lib/medusa/page.rb', line 105
#
# Array of cookies received with this page as WEBrick::Cookie objects.
#
# Parsing is best-effort: any StandardError (for example a missing
# 'set-cookie' header) results in an empty Array.
#
# @return [Array] parsed cookies, or [] when parsing raises
def cookies
  WEBrick::Cookie.parse_set_cookies(@headers['set-cookie']) rescue []
end
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
89 90 91 92 |
# File 'lib/medusa/page.rb', line 89
#
# Delete the Nokogiri document and response body to conserve memory.
#
# @return [nil]
def discard_doc!
  # Force parsing of page links before the document is thrown away.
  links
  @body = nil
  @doc = nil
end
#doc ⇒ Object
Nokogiri document for the HTML body
81 82 83 84 |
# File 'lib/medusa/page.rb', line 81
#
# Nokogiri document for the HTML body.
#
# The parsed document is cached in @doc. Nothing is parsed unless a body is
# present and html? is true, and a parse failure is swallowed.
#
# @return [Object, nil] the parsed document, or nil when it is unavailable
def doc
  return @doc if @doc

  begin
    @doc = Nokogiri::HTML(@body) if @body && html?
  rescue StandardError
    # Best-effort parse: a failure simply means "no document".
    nil
  end
end
#fetched? ⇒ Boolean
Was the page successfully fetched? true
if the page was fetched with no error, false
otherwise.
98 99 100 |
# File 'lib/medusa/page.rb', line 98
#
# Was the page successfully fetched?
#
# @return [Object] the current value of @fetched
def fetched?
  @fetched
end
#html? ⇒ Boolean
Returns true
if the page is a HTML document, returns false
otherwise.
120 121 122 |
# File 'lib/medusa/page.rb', line 120
#
# Returns +true+ if the page is a HTML document, returns +false+ otherwise.
#
# The content type must start with text/html or application/xhtml+xml.
# The "+" is escaped so it matches a literal plus sign; unescaped it is a
# quantifier and the real XHTML media type never matches. Matching is
# anchored to the start of the string and ignores case.
#
# @return [Boolean]
def html?
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b}i)
end
#in_domain?(uri) ⇒ Boolean
Returns true
if uri is in the same domain as the page, returns false
otherwise
180 181 182 |
# File 'lib/medusa/page.rb', line 180
#
# Returns +true+ if *uri* is in the same domain as the page, returns +false+
# otherwise. Only the hosts are compared.
#
# @param uri [#host] the URI to test
# @return [Boolean]
def in_domain?(uri)
  candidate_host = uri.host
  candidate_host == @url.host
end
#links ⇒ Object
Array of distinct A tag HREFs from the page
62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# File 'lib/medusa/page.rb', line 62
#
# Array of distinct A tag HREFs from the page.
#
# The list is computed once and cached in @links. Anchors are skipped when
# they carry a data-method other than 'get', have an empty href, cannot be
# made absolute, or point outside the page's domain.
#
# @return [Array] absolute links found in the document ([] when there is none)
def links
  return @links unless @links.nil?

  @links = []
  document = doc
  return @links unless document

  document.search("//a[@href]").each do |anchor|
    http_method = anchor['data-method']
    next if http_method && http_method != 'get'

    href = anchor['href']
    next if href.nil? || href.empty?

    begin
      absolute = to_absolute(href)
    rescue StandardError
      # Skip hrefs that cannot be converted to an absolute URI.
      next
    end

    @links << absolute if in_domain?(absolute)
  end

  @links.uniq!
  @links
end
#marshal_dump ⇒ Object
184 185 186 |
# File 'lib/medusa/page.rb', line 184
#
# State captured for serialization. The element order must stay in step with
# #marshal_load.
#
# @return [Array] the twelve instance variables listed below, in this order
def marshal_dump
  [
    @url,
    @headers,
    @data,
    @body,
    @links,
    @code,
    @visited,
    @depth,
    @referer,
    @redirect_to,
    @response_time,
    @fetched
  ]
end
#marshal_load(ary) ⇒ Object
188 189 190 |
# File 'lib/medusa/page.rb', line 188
#
# Restore the state produced by #marshal_dump.
#
# @param ary [Array] values in the same order that #marshal_dump emits them
# @return [Array] the array that was passed in
def marshal_load(ary)
  @url,
  @headers,
  @data,
  @body,
  @links,
  @code,
  @visited,
  @depth,
  @referer,
  @redirect_to,
  @response_time,
  @fetched = ary
end
#not_found? ⇒ Boolean
Returns true
if the page was not found (returned 404 code), returns false
otherwise.
136 137 138 |
# File 'lib/medusa/page.rb', line 136
#
# Returns +true+ if the page was not found (returned 404 code), returns
# +false+ otherwise.
#
# @return [Boolean]
def not_found?
  @code == 404
end
#redirect? ⇒ Boolean
Returns true
if the page is a HTTP redirect, returns false
otherwise.
128 129 130 |
# File 'lib/medusa/page.rb', line 128
#
# Returns +true+ if the page is a HTTP redirect, returns +false+ otherwise.
#
# Status codes 300 through 308 count as redirects; the upper bound includes
# 308 Permanent Redirect.
#
# @return [Boolean]
def redirect?
  (300..308).include?(@code)
end
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/medusa/page.rb', line 159
#
# Converts relative URL *link* into an absolute URL based on the location of
# the page.
#
# The link is merged onto #base when the document declares one, otherwise
# onto @url. An empty path on the result is replaced with '/'.
#
# @param link [#to_s, nil] the link to convert
# @return [Object, nil] the absolute URI, or nil when *link* is nil
def to_absolute(link)
  return nil if link.nil?

  # remove anchor
  without_fragment = link.to_s.gsub(/#.*$/,'')

  # Only Rubies older than 2.5 re-encode the link before parsing.
  legacy_ruby = Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
  without_fragment = URI.encode(URI.decode(without_fragment)) if legacy_ruby

  relative = URI(without_fragment)
  root = base || @url
  absolute = root.merge(relative)
  absolute.path = '/' if absolute.path.empty?
  absolute
end
#to_hash ⇒ Object
192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
# File 'lib/medusa/page.rb', line 192
#
# Flatten the page into a string-keyed Hash. Headers and user data are
# Marshal-dumped; the URL, links, referer and redirect target are converted
# to strings.
#
# @return [Hash] the serialized page
def to_hash
  {
    'url' => @url.to_s,
    'headers' => Marshal.dump(@headers),
    'data' => Marshal.dump(@data),
    'body' => @body,
    'links' => links.map(&:to_s),
    'code' => @code,
    'visited' => @visited,
    'depth' => @depth,
    'referer' => @referer.to_s,
    'redirect_to' => @redirect_to.to_s,
    'response_time' => @response_time,
    'fetched' => @fetched
  }
end