Class: Anemone::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/anemone/page.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil) ⇒ Page

Create a new page



33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/anemone/page.rb', line 33

# Create a new page.
#
# url           - URL of the page, stored as given
# body          - response body; parsed with Nokogiri only when present and
#                 the content type is HTML
# code          - response code
# headers       - response headers Hash; when given it is used directly (not
#                 copied), so the default 'content-type' entry is written
#                 into the caller's hash
# aka           - redirect alias(es); coerced to an Array
# referer       - URL of the page that led here
# depth         - crawl depth; nil is treated as 0
# response_time - response time of the request
def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
  @url, @code, @referer, @response_time = url, code, referer, response_time
  @depth   = depth || 0
  @aliases = Array(aka)
  @data    = OpenStruct.new

  # html? reads the content-type header, so the headers must be in place
  # (with a guaranteed 'content-type' entry) before the body is parsed.
  @headers = headers || {}
  @headers['content-type'] ||= ['']

  # Best-effort parse: any StandardError raised while checking the content
  # type or parsing the body leaves @doc unset (nil).
  begin
    @doc = Nokogiri::HTML(body) if body && html?
  rescue StandardError
    nil
  end
end

Instance Attribute Details

#aliasesObject

Array of redirect-aliases for the page



19
20
21
# File 'lib/anemone/page.rb', line 19

# Reader for the page's redirect aliases, an Array the constructor builds
# from its +aka+ argument.
def aliases
  @aliases
end

#codeObject

Integer response code of the page



17
18
19
# File 'lib/anemone/page.rb', line 17

# Reader for the page's response code; initially the constructor's +code+
# argument (nil when none was given).
def code
  @code
end

#dataObject

OpenStruct for user-stored data



13
14
15
# File 'lib/anemone/page.rb', line 13

# Reader for the OpenStruct the constructor creates for user-stored data.
def data
  @data
end

#depthObject

Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageHash#shortest_paths! to find that value.



24
25
26
# File 'lib/anemone/page.rb', line 24

# Reader for the page's crawl depth; the constructor stores 0 when it is
# given nil.
def depth
  @depth
end

#docObject

Nokogiri document for the HTML body



15
16
17
# File 'lib/anemone/page.rb', line 15

# Reader for the parsed Nokogiri document. It is nil when the constructor
# did not parse a body (no body, a non-HTML content type, or an error while
# parsing).
def doc
  @doc
end

#headersObject (readonly)

Headers of the HTTP response



10
11
12
# File 'lib/anemone/page.rb', line 10

# Reader for the response headers Hash; the constructor ensures it has a
# 'content-type' entry.
def headers
  @headers
end

#refererObject

URL of the page that brought us to this page



26
27
28
# File 'lib/anemone/page.rb', line 26

# Reader for the referring URL; initially the constructor's +referer+
# argument (nil by default).
def referer
  @referer
end

#response_timeObject

Response time of the request for this page in milliseconds



28
29
30
# File 'lib/anemone/page.rb', line 28

# Reader for the request's response time (documented as milliseconds);
# initially the constructor's +response_time+ argument (nil by default).
def response_time
  @response_time
end

#urlObject (readonly)

The URL of the page



8
9
10
# File 'lib/anemone/page.rb', line 8

# Reader for the page URL given to the constructor.
def url
  @url
end

#visitedObject

Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!



21
22
23
# File 'lib/anemone/page.rb', line 21

# Reader for the visited flag. The constructor does not assign it, so it is
# nil until something sets it.
def visited
  @visited
end

Instance Method Details

#add_alias!(aka) ⇒ Object

Add a redirect-alias String aka to the list of the page’s aliases

Returns self



83
84
85
86
# File 'lib/anemone/page.rb', line 83

# Records +aka+ as a redirect alias of this page unless it is already
# listed.
#
# Returns self so calls can be chained.
def add_alias!(aka)
  already_known = @aliases.include?(aka)
  @aliases << aka unless already_known
  self
end

#alias_clone(url) ⇒ Object

Return a new page with the same response and url, but with a 200 response code



71
72
73
74
75
76
# File 'lib/anemone/page.rb', line 71

# Returns a shallow copy (via #clone) of this page whose response code is
# set to 200.
#
# NOTE(review): the +url+ argument is not used, and @aka is not assigned by
# the visible constructor, so the alias step below looks unreachable —
# confirm the intended behaviour before relying on it.
def alias_clone(url)
  copy = clone
  copy.add_alias!(@aka) unless @aka.nil?
  copy.code = 200
  copy
end

#content_typeObject

The content-type returned by the HTTP request for this page



103
104
105
# File 'lib/anemone/page.rb', line 103

# The content type of the response: the first value stored under the
# 'content-type' header.
def content_type
  type_values = headers['content-type']
  type_values.first
end

#discard_doc!Object



62
63
64
65
# File 'lib/anemone/page.rb', line 62

# Drops the parsed document by setting @doc to nil. #links is called first
# so the link list is computed while the document is still available.
def discard_doc!
  links # force parsing of page links before we trash the document
  @doc = nil
end

#html?Boolean

Returns true if the page is an HTML document, returns false otherwise.

Returns:

  • (Boolean)


111
112
113
# File 'lib/anemone/page.rb', line 111

# Returns true if the page's content type marks it as an HTML document
# (text/html or application/xhtml+xml), false otherwise.
def html?
  # The "+" in "xhtml+xml" has to be escaped: unescaped it is a regex
  # quantifier, so the pattern would reject the real XHTML media type.
  # \A anchors at the start of the string rather than at any line start.
  !!(content_type =~ %r{\A(text/html|application/xhtml\+xml)\b})
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


151
152
153
# File 'lib/anemone/page.rb', line 151

# Returns true when +uri+ has the same host as this page's URL, false
# otherwise.
def in_domain?(uri)
  candidate_host = uri.host
  candidate_host == @url.host
end

#links ⇒ Object

Array of distinct A tag HREFs from the page



47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/anemone/page.rb', line 47

# Array of distinct in-domain link targets taken from the page's A tags.
#
# The list is computed on first use and memoized in @links. When there is no
# parsed document the result is an empty Array.
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  doc.css('a').each do |anchor|
    # An A tag without an href raises here; treat it as "no link".
    href = begin
      anchor.attributes['href'].content
    rescue StandardError
      nil
    end
    next if href.nil? || href.empty?

    # Skip hrefs that cannot be parsed or resolved into an absolute URL.
    begin
      target = to_absolute(URI(href))
    rescue StandardError
      next
    end

    @links << target if in_domain?(target)
  end

  @links.uniq!
  @links
end

#links_and_their_aliases(page_hash) ⇒ Object

Returns an Array of all links from this page, and all the redirect-aliases of those pages, as String objects.

page_hash is a PageHash object with the results of the current crawl.



94
95
96
97
98
# File 'lib/anemone/page.rb', line 94

# Returns a flat Array holding each of this page's links immediately
# followed by the aliases of the page stored for that link in +page_hash+.
#
# page_hash - object indexed by link whose entries respond to #aliases;
#             every link must have an entry, since #aliases is called on the
#             lookup result unconditionally.
def links_and_their_aliases(page_hash)
  links.flat_map do |link|
    known_aliases = page_hash[link].aliases
    [link] + known_aliases
  end
end

#not_found?Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


127
128
129
# File 'lib/anemone/page.rb', line 127

# Returns true if the page's response code is 404, false otherwise.
def not_found?
  @code == 404
end

#redirect?Boolean

Returns true if the page is an HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


119
120
121
# File 'lib/anemone/page.rb', line 119

# Returns true if the page's response code lies in the 300..399 range,
# false otherwise.
def redirect?
  redirect_codes = 300..399
  redirect_codes.cover?(@code)
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



135
136
137
138
139
140
141
142
143
144
145
# File 'lib/anemone/page.rb', line 135

# Converts the (possibly relative) URL +link+ into an absolute URI resolved
# against this page's URL.
#
# link - a String or URI; it is converted with #to_s before processing.
#
# Returns the merged URI, with an empty path replaced by "/".
def to_absolute(link)
  # remove anchor (a trailing "#fragment" made of letters, digits, "_" or "-")
  stripped = link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')

  # URI.encode was removed in Ruby 3.0. The RFC 2396 parser performs the same
  # escaping; use the explicit RFC2396_PARSER constant where the uri library
  # defines it and fall back to DEFAULT_PARSER otherwise.
  escaper = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  relative = URI(escaper.escape(stripped))
  absolute = @url.merge(relative)

  absolute.path = '/' if absolute.path.empty?

  absolute
end