Class: Anemone::Page
- Inherits: Object
  - Object
  - Anemone::Page
- Defined in: lib/anemone/page.rb
Instance Attribute Summary collapse
-
#aliases ⇒ Object
Array of redirect-aliases for the page.
-
#code ⇒ Object
Integer response code of the page.
-
#data ⇒ Object
OpenStruct for user-stored data.
-
#depth ⇒ Object
Depth of this page from the root of the crawl.
-
#doc ⇒ Object
Nokogiri document for the HTML body.
-
#headers ⇒ Object
readonly
Headers of the HTTP response.
-
#referer ⇒ Object
URL of the page that brought us to this page.
-
#response_time ⇒ Object
Response time of the request for this page in milliseconds.
-
#url ⇒ Object
readonly
The URL of the page.
-
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!.
Instance Method Summary collapse
-
#add_alias!(aka) ⇒ Object
Add a redirect-alias String aka to the list of the page’s aliases.
-
#alias_clone(url) ⇒ Object
Return a new page with the same response and url, but with a 200 response code.
-
#content_type ⇒ Object
The content-type returned by the HTTP request for this page.
-
#discard_doc! ⇒ Object
Forces the page's links to be parsed, then discards the Nokogiri document.
-
#html? ⇒ Boolean
Returns true if the page is an HTML document, returns false otherwise.
-
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
-
#initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil) ⇒ Page
constructor
Create a new page.
-
#links ⇒ Object
Array of distinct A tag HREFs from the page.
-
#links_and_their_aliases(page_hash) ⇒ Object
Returns an Array of all links from this page, and all the redirect-aliases of those pages, as String objects.
-
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
-
#redirect? ⇒ Boolean
Returns true if the page is an HTTP redirect, returns false otherwise.
-
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page.
Constructor Details
#initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil) ⇒ Page
Create a new page
33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/anemone/page.rb', line 33 def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil) @url = url @code = code @headers = headers || {} @headers['content-type'] ||= [''] @aliases = Array(aka) @data = OpenStruct.new @referer = referer @depth = depth || 0 @response_time = response_time @doc = Nokogiri::HTML(body) if body && html? rescue nil end |
Instance Attribute Details
#aliases ⇒ Object
Array of redirect-aliases for the page
19 20 21 |
# File 'lib/anemone/page.rb', line 19 def aliases @aliases end |
#code ⇒ Object
Integer response code of the page
17 18 19 |
# File 'lib/anemone/page.rb', line 17 def code @code end |
#data ⇒ Object
OpenStruct for user-stored data
13 14 15 |
# File 'lib/anemone/page.rb', line 13 def data @data end |
#depth ⇒ Object
Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageHash#shortest_paths! to find that value.
24 25 26 |
# File 'lib/anemone/page.rb', line 24 def depth @depth end |
#doc ⇒ Object
Nokogiri document for the HTML body
15 16 17 |
# File 'lib/anemone/page.rb', line 15 def doc @doc end |
#headers ⇒ Object (readonly)
Headers of the HTTP response
10 11 12 |
# File 'lib/anemone/page.rb', line 10 def headers @headers end |
#referer ⇒ Object
URL of the page that brought us to this page
26 27 28 |
# File 'lib/anemone/page.rb', line 26 def referer @referer end |
#response_time ⇒ Object
Response time of the request for this page in milliseconds
28 29 30 |
# File 'lib/anemone/page.rb', line 28 def response_time @response_time end |
#url ⇒ Object (readonly)
The URL of the page
8 9 10 |
# File 'lib/anemone/page.rb', line 8 def url @url end |
#visited ⇒ Object
Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
21 22 23 |
# File 'lib/anemone/page.rb', line 21 def visited @visited end |
Instance Method Details
#add_alias!(aka) ⇒ Object
Add a redirect-alias String aka to the list of the page’s aliases
Returns self
83 84 85 86 |
# File 'lib/anemone/page.rb', line 83 def add_alias!(aka) @aliases << aka if !@aliases.include?(aka) self end |
#alias_clone(url) ⇒ Object
Return a new page with the same response and url, but with a 200 response code
71 72 73 74 75 76 |
# File 'lib/anemone/page.rb', line 71 def alias_clone(url) p = clone p.add_alias!(@aka) if !@aka.nil? p.code = 200 p end |
#content_type ⇒ Object
The content-type returned by the HTTP request for this page
103 104 105 |
# File 'lib/anemone/page.rb', line 103 def content_type headers['content-type'].first end |
#discard_doc! ⇒ Object
Forces the page's links to be parsed, then sets the Nokogiri document to nil.
62 63 64 65 |
# File 'lib/anemone/page.rb', line 62 def discard_doc! links # force parsing of page links before we trash the document @doc = nil end |
#html? ⇒ Boolean
Returns true if the page is an HTML document, returns false otherwise.
111 112 113 |
# File 'lib/anemone/page.rb', line 111 def html? !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b}) end |
#in_domain?(uri) ⇒ Boolean
Returns true if uri is in the same domain as the page, returns false otherwise.
151 152 153 |
# File 'lib/anemone/page.rb', line 151 def in_domain?(uri) uri.host == @url.host end |
#links ⇒ Object
Array of distinct links from the page's A tag HREFs, converted to absolute URLs and limited to links on the same host as the page.
47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# File 'lib/anemone/page.rb', line 47 def links return @links unless @links.nil? @links = [] return @links if !doc doc.css('a').each do |a| u = a.attributes['href'].content rescue nil next if u.nil? or u.empty? abs = to_absolute(URI(u)) rescue next @links << abs if in_domain?(abs) end @links.uniq! @links end |
#links_and_their_aliases(page_hash) ⇒ Object
Returns an Array of all links from this page, and all the redirect-aliases of those pages, as String objects.
page_hash is a PageHash object with the results of the current crawl.
94 95 96 97 98 |
# File 'lib/anemone/page.rb', line 94 def links_and_their_aliases(page_hash) links.inject([]) do |results, link| results.concat([link].concat(page_hash[link].aliases)) end end |
#not_found? ⇒ Boolean
Returns true if the page was not found (returned 404 code), returns false otherwise.
127 128 129 |
# File 'lib/anemone/page.rb', line 127 def not_found? 404 == @code end |
#redirect? ⇒ Boolean
Returns true if the page is an HTTP redirect, returns false otherwise.
119 120 121 |
# File 'lib/anemone/page.rb', line 119 def redirect? (300..399).include?(@code) end |
#to_absolute(link) ⇒ Object
Converts relative URL link into an absolute URL based on the location of the page
135 136 137 138 139 140 141 142 143 144 145 |
# File 'lib/anemone/page.rb', line 135 def to_absolute(link) # remove anchor link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')) relative = URI(link) absolute = @url.merge(relative) absolute.path = '/' if absolute.path.empty? return absolute end |