Class: Anemone::Page
- Inherits:
-
Object
- Object
- Anemone::Page
- Defined in:
- lib/anemone/page.rb
Instance Attribute Summary collapse
-
#body ⇒ Object
readonly
HTML body.
-
#code ⇒ Object
Integer response code of the page.
-
#data ⇒ Object
OpenStruct for user-stored data.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#error ⇒ Object
readonly
Exception object, if one was raised during HTTP#fetch_page.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#redirect_to ⇒ Object
readonly
URL of the page this one redirected to, if any.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#url ⇒ Object
readonly
The URL of the page.
-
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!.
Instance Method Summary collapse
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
-
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
-
#fetched? ⇒ Boolean
Was the page successfully fetched?
true
if the page was fetched with no error, false
otherwise. -
#html? ⇒ Boolean
Returns
true
if the page is an HTML document, returns false
otherwise. -
#in_domain?(uri) ⇒ Boolean
Returns
true
if uri is in the same domain as the page, returns false
otherwise. -
#initialize(url, params = {}) ⇒ Page
constructor
Create a new page.
-
#links ⇒ Object
Array of distinct A tag HREFs from the page.
- #marshal_dump ⇒ Object
- #marshal_load(ary) ⇒ Object
-
#not_found? ⇒ Boolean
Returns
true
if the page was not found (returned 404 code), returns false
otherwise. -
#redirect? ⇒ Boolean
Returns
true
if the page is an HTTP redirect, returns false
otherwise. -
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
Constructor Details
#initialize(url, params = {}) ⇒ Page
Create a new page
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/anemone/page.rb', line 38 def initialize(url, params = {}) @url = url @data = OpenStruct.new @code = params[:code] @headers = params[:headers] || {} @headers['content-type'] ||= [''] @aliases = Array(params[:aka]).compact @referer = params[:referer] @depth = params[:depth] || 0 @redirect_to = to_absolute(params[:redirect_to]) @response_time = params[:response_time] @body = params[:body] @error = params[:error] @fetched = !params[:code].nil? end |
Instance Attribute Details
#body ⇒ Object (readonly)
HTML body
11 12 13 |
# File 'lib/anemone/page.rb', line 11 def body @body end |
#code ⇒ Object
Integer response code of the page
24 25 26 |
# File 'lib/anemone/page.rb', line 24 def code @code end |
#data ⇒ Object
OpenStruct for user-stored data
22 23 24 |
# File 'lib/anemone/page.rb', line 22 def data @data end |
#depth ⇒ Object
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.
29 30 31 |
# File 'lib/anemone/page.rb', line 29 def depth @depth end |
#error ⇒ Object (readonly)
Exception object, if one was raised during HTTP#fetch_page
17 18 19 |
# File 'lib/anemone/page.rb', line 17 def error @error end |
#headers ⇒ Object (readonly)
Headers of the HTTP response
13 14 15 |
# File 'lib/anemone/page.rb', line 13 def headers @headers end |
#redirect_to ⇒ Object (readonly)
URL of the page this one redirected to, if any
15 16 17 |
# File 'lib/anemone/page.rb', line 15 def redirect_to @redirect_to end |
#referer ⇒ Object
URL of the page that brought us to this page
31 32 33 |
# File 'lib/anemone/page.rb', line 31 def referer @referer end |
#response_time ⇒ Object
Response time of the request for this page in milliseconds
33 34 35 |
# File 'lib/anemone/page.rb', line 33 def response_time @response_time end |
#url ⇒ Object (readonly)
The URL of the page
9 10 11 |
# File 'lib/anemone/page.rb', line 9 def url @url end |
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
26 27 28 |
# File 'lib/anemone/page.rb', line 26 def visited @visited end |
Instance Method Details
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
108 109 110 |
# File 'lib/anemone/page.rb', line 108 def content_type headers['content-type'].first end |
#cookies ⇒ Object
Array of cookies received with this page as WEBrick::Cookie objects.
101 102 103 |
# File 'lib/anemone/page.rb', line 101 def cookies WEBrick::Cookie.parse_set_cookies(@headers['Set-Cookie']) rescue [] end |
#discard_doc! ⇒ Object
Delete the Nokogiri document and response body to conserve memory
85 86 87 88 |
# File 'lib/anemone/page.rb', line 85 def discard_doc! links # force parsing of page links before we trash the document @doc = @body = nil end |
#doc ⇒ Object
Nokogiri document for the HTML body
77 78 79 80 |
# File 'lib/anemone/page.rb', line 77 def doc return @doc if @doc @doc = Nokogiri::HTML(@body) if @body && html? rescue nil end |
#fetched? ⇒ Boolean
Was the page successfully fetched? true
if the page was fetched with no error, false
otherwise.
94 95 96 |
# File 'lib/anemone/page.rb', line 94 def fetched? @fetched end |
#html? ⇒ Boolean
Returns true
if the page is an HTML document, returns false
otherwise.
116 117 118 |
# File 'lib/anemone/page.rb', line 116 def html? !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b}) end |
#in_domain?(uri) ⇒ Boolean
Returns true
if uri is in the same domain as the page, returns false
otherwise
158 159 160 |
# File 'lib/anemone/page.rb', line 158 def in_domain?(uri) uri.host == @url.host end |
#links ⇒ Object
Array of distinct A tag HREFs from the page
59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/anemone/page.rb', line 59 def links return @links unless @links.nil? @links = [] return @links if !doc doc.css('a').each do |a| u = a.attributes['href'].content rescue nil next if u.nil? or u.empty? abs = to_absolute(URI(u)) rescue next @links << abs if in_domain?(abs) end @links.uniq! @links end |
#marshal_dump ⇒ Object
162 163 164 |
# File 'lib/anemone/page.rb', line 162 def marshal_dump [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched] end |
#marshal_load(ary) ⇒ Object
166 167 168 |
# File 'lib/anemone/page.rb', line 166 def marshal_load(ary) @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary end |
#not_found? ⇒ Boolean
Returns true
if the page was not found (returned 404 code), returns false
otherwise.
132 133 134 |
# File 'lib/anemone/page.rb', line 132 def not_found? 404 == @code end |
#redirect? ⇒ Boolean
Returns true
if the page is an HTTP redirect, returns false
otherwise.
124 125 126 |
# File 'lib/anemone/page.rb', line 124 def redirect? (300..399).include?(@code) end |
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
140 141 142 143 144 145 146 147 148 149 150 151 152 |
# File 'lib/anemone/page.rb', line 140 def to_absolute(link) return nil if link.nil? # remove anchor link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')) relative = URI(link) absolute = @url.merge(relative) absolute.path = '/' if absolute.path.empty? return absolute end |