Class: Medusa::Page

Inherits:
Object
  • Object
show all
Defined in:
lib/medusa/page.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, params = {}) ⇒ Page

Create a new page



36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/medusa/page.rb', line 36

#
# Build a page for +url+ from the (optional) details of its HTTP response.
#
# Recognised +params+ keys: :code, :headers, :aka, :referer, :depth,
# :redirect_to, :response_time, :body and :error.
#
def initialize(url, params = {})
  status = params[:code]

  @url = url
  @data = OpenStruct.new

  # lazily derived state starts out empty
  @links = @body = @doc = @base = nil

  # NOTE: when a headers hash is supplied it is used as-is, so the
  # content-type default below is written into the caller's hash.
  @headers = params[:headers] || {}
  @headers['content-type'] ||= ''

  @code = status
  @fetched = !status.nil?
  @error = params[:error]
  @referer = params[:referer]
  @response_time = params[:response_time]
  @depth = params[:depth] || 0
  @aliases = Array(params[:aka]).compact

  # The redirect target is resolved while @body is still nil: to_absolute
  # consults #base, which only looks at a parsed document once a body is set.
  @redirect_to = to_absolute(params[:redirect_to])
  @body = params[:body]
end

Instance Attribute Details

#bodyObject (readonly)

The raw HTTP response body of the page



13
14
15
# File 'lib/medusa/page.rb', line 13

# The raw HTTP response body of the page (read-only).
# Taken from params[:body] at construction and reset to nil by #discard_doc!.
def body
  @body
end

#codeObject

Integer response code of the page



24
25
26
# File 'lib/medusa/page.rb', line 24

# Integer response code of the page.
# nil when no :code was supplied to the constructor.
def code
  @code
end

#dataObject

OpenStruct for user-stored data



22
23
24
# File 'lib/medusa/page.rb', line 22

# OpenStruct for user-stored data.
# A fresh, empty OpenStruct is created for every new page.
def data
  @data
end

#depthObject

Depth of this page from the root of the crawl. This is not necessarily the shortest path; use PageStore#shortest_paths! to find that value.



27
28
29
# File 'lib/medusa/page.rb', line 27

# Depth of this page from the root of the crawl (0 when no :depth was given).
# This is not necessarily the shortest path; use PageStore#shortest_paths!
# to find that value.
def depth
  @depth
end

#errorObject (readonly)

Exception object, if one was raised during HTTP#fetch_page



19
20
21
# File 'lib/medusa/page.rb', line 19

# Exception object, if one was raised during HTTP#fetch_page (read-only).
# Holds whatever was passed as params[:error]; nil otherwise.
def error
  @error
end

#headersObject (readonly)

Headers of the HTTP response



15
16
17
# File 'lib/medusa/page.rb', line 15

# Headers of the HTTP response (read-only).
# The constructor guarantees a 'content-type' entry, using an empty string
# when the response did not provide one.
def headers
  @headers
end

#redirect_toObject (readonly)

URL of the page this one redirected to, if any



17
18
19
# File 'lib/medusa/page.rb', line 17

# URL of the page this one redirected to, if any (read-only).
# The constructor resolves it with #to_absolute; nil when there is no redirect.
def redirect_to
  @redirect_to
end

#refererObject

URL of the page that brought us to this page



29
30
31
# File 'lib/medusa/page.rb', line 29

# URL of the page that brought us to this page.
# Holds whatever was passed as params[:referer]; nil otherwise.
def referer
  @referer
end

#response_timeObject

Response time of the request for this page in milliseconds



31
32
33
# File 'lib/medusa/page.rb', line 31

# Response time of the request for this page in milliseconds.
# Holds whatever was passed as params[:response_time]; nil otherwise.
def response_time
  @response_time
end

#urlObject (readonly)

The URL of the page



11
12
13
# File 'lib/medusa/page.rb', line 11

# The URL of the page (read-only), exactly as given to the constructor.
def url
  @url
end

Class Method Details

.from_hash(hash) ⇒ Object



207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# File 'lib/medusa/page.rb', line 207

#
# Rebuild a Page from the string-keyed hash produced by #to_hash.
#
# 'code', 'depth' and 'response_time' are coerced with to_i, 'links' and a
# non-empty 'redirect_to' are turned back into URIs, and 'headers' / 'data'
# are restored with Marshal.load.
#
# NOTE(security): Marshal.load can instantiate arbitrary objects. Only feed
# this method hashes that come from storage you trust.
#
# A hash without a 'links' entry leaves the page's links unset (nil) instead
# of raising, so #links can derive them from the body on demand.
#
def self.from_hash(hash)
  page = new(URI(hash['url']))

  stored_links = hash['links']
  redirect = hash['redirect_to']

  {
    '@headers' => Marshal.load(hash['headers']),
    '@data' => Marshal.load(hash['data']),
    '@body' => hash['body'],
    '@links' => stored_links && stored_links.map { |link| URI(link) },
    '@code' => hash['code'].to_i,
    '@visited' => hash['visited'],
    '@depth' => hash['depth'].to_i,
    '@referer' => hash['referer'],
    '@redirect_to' => (redirect && !redirect.empty?) ? URI(redirect) : nil,
    '@response_time' => hash['response_time'].to_i,
    '@fetched' => hash['fetched']
  }.each do |ivar, value|
    page.instance_variable_set(ivar, value)
  end

  page
end

Instance Method Details

#baseObject

Base URI from the HTML doc head element www.w3.org/TR/html4/struct/links.html#edef-BASE



144
145
146
147
148
149
150
151
152
# File 'lib/medusa/page.rb', line 144

#
# Base URI taken from the <base href> of the document head, or nil when
# there is no document, no usable href, or the href is empty.
# The parsed value is cached in @base.
#
def base
  @base ||= if doc
    href_nodes = doc.search('//head/base/@href')
    begin
      # an unparsable href is treated the same as a missing one
      href_nodes.nil? ? nil : URI(href_nodes.to_s)
    rescue StandardError
      nil
    end
  end

  blank = @base.nil? || @base.to_s.empty?
  blank ? nil : @base
end

#content_typeObject

The content-type returned by the HTTP request for this page



112
113
114
# File 'lib/medusa/page.rb', line 112

# The content-type returned by the HTTP request for this page.
# Reads the 'content-type' entry of #headers, which the constructor
# defaults to an empty string.
def content_type
  headers['content-type']
end

#cookiesObject

Array of cookies received with this page as WEBrick::Cookie objects.



105
106
107
# File 'lib/medusa/page.rb', line 105

#
# Array of cookies received with this page as WEBrick::Cookie objects.
# Any failure while parsing the 'set-cookie' header yields an empty array.
#
def cookies
  WEBrick::Cookie.parse_set_cookies(@headers['set-cookie'])
rescue StandardError
  []
end

#discard_doc!Object

Delete the Nokogiri document and response body to conserve memory



89
90
91
92
# File 'lib/medusa/page.rb', line 89

#
# Drop the parsed document and the response body to conserve memory.
# The links are extracted first, since they cannot be derived afterwards.
#
def discard_doc!
  links
  @body = nil
  @doc = nil
end

#docObject

Nokogiri document for the HTML body



81
82
83
84
# File 'lib/medusa/page.rb', line 81

#
# Nokogiri document for the HTML body, parsed on first use and cached.
# Returns nil when there is no body, the page is not HTML, or anything
# goes wrong while checking or parsing.
#
def doc
  return @doc if @doc
  return nil unless @body && html?

  @doc = Nokogiri::HTML(@body)
rescue StandardError
  nil
end

#fetched?Boolean

Was the page successfully fetched? true if the page was fetched with no error, false otherwise.

Returns:

  • (Boolean)


98
99
100
# File 'lib/medusa/page.rb', line 98

# Was the page fetched?
# The flag is set by the constructor: true when a :code was supplied,
# false otherwise.
def fetched?
  @fetched
end

#html?Boolean

Returns true if the page is an HTML document, returns false otherwise.

Returns:

  • (Boolean)


120
121
122
# File 'lib/medusa/page.rb', line 120

#
# Returns true if the page is an HTML document, false otherwise.
# A page counts as HTML when its content type starts with text/html or
# application/xhtml+xml (parameters such as "; charset=..." are allowed).
#
def html?
  # '+' must be escaped: unescaped it is a quantifier on the preceding 'l',
  # so the literal "application/xhtml+xml" would never match.
  !!(content_type =~ %r{^(text/html|application/xhtml\+xml)\b})
end

#in_domain?(uri) ⇒ Boolean

Returns true if uri is in the same domain as the page, returns false otherwise

Returns:

  • (Boolean)


180
181
182
# File 'lib/medusa/page.rb', line 180

#
# Returns true if +uri+ is in the same domain as the page, false otherwise.
# Host names are compared case-insensitively, so EXAMPLE.com and example.com
# are the same domain. A URI without a host only matches a page URL that
# has no host either.
#
def in_domain?(uri)
  theirs = uri.host
  ours = @url.host
  return theirs == ours if theirs.nil? || ours.nil?

  theirs.downcase == ours.downcase
end

#linksObject

Array of distinct A tag HREFs from the page



62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/medusa/page.rb', line 62

#
# Array of distinct in-domain A tag HREFs from the page, as absolute URLs.
# Computed once and cached; empty when the page has no parsed document.
#
def links
  return @links unless @links.nil?

  @links = []
  return @links unless doc

  doc.search("//a[@href]").each do |anchor|
    # anchors that declare a non-GET data-method are not followed
    verb = anchor['data-method']
    next if verb && verb != 'get'

    href = anchor['href']
    next if href.nil? || href.empty?

    # hrefs that cannot be made absolute are skipped
    target = begin
      to_absolute(href)
    rescue StandardError
      next
    end

    @links << target if in_domain?(target)
  end

  @links.uniq!
  @links
end

#marshal_dumpObject



184
185
186
# File 'lib/medusa/page.rb', line 184

#
# State handed to Marshal when a page is dumped: a positional array of the
# persisted instance variables. The order here must match #marshal_load.
# @doc, @base, @error and @aliases are not part of the dump.
#
def marshal_dump
  %i[
    @url @headers @data @body @links @code
    @visited @depth @referer @redirect_to @response_time @fetched
  ].map { |slot| instance_variable_get(slot) }
end

#marshal_load(ary) ⇒ Object



188
189
190
# File 'lib/medusa/page.rb', line 188

#
# Restore the state produced by #marshal_dump. +ary+ is read positionally,
# in the same slot order that #marshal_dump writes.
#
def marshal_load(ary)
  slots = %i[
    @url @headers @data @body @links @code
    @visited @depth @referer @redirect_to @response_time @fetched
  ]
  slots.each_with_index do |slot, position|
    instance_variable_set(slot, ary[position])
  end
  ary
end

#not_found?Boolean

Returns true if the page was not found (returned 404 code), returns false otherwise.

Returns:

  • (Boolean)


136
137
138
# File 'lib/medusa/page.rb', line 136

#
# Returns true if the page was not found (response code 404),
# false otherwise.
#
def not_found?
  @code == 404
end

#redirect?Boolean

Returns true if the page is an HTTP redirect, returns false otherwise.

Returns:

  • (Boolean)


128
129
130
# File 'lib/medusa/page.rb', line 128

#
# Returns true if the page is an HTTP redirect, false otherwise.
# Any response code from 300 through 308 counts; 308 (Permanent Redirect)
# is included alongside the older 3xx codes.
#
def redirect?
  (300..308).include?(@code)
end

#to_absolute(link) ⇒ Object

Converts relative URL link into an absolute URL based on the location of the page



159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/medusa/page.rb', line 159

#
# Converts relative URL +link+ into an absolute URL based on the location
# of the page. The fragment is dropped, the document's <base href> (when
# there is one) takes precedence over the page URL, and an empty path is
# normalised to '/'. Returns nil for a nil +link+.
#
def to_absolute(link)
  return nil if link.nil?

  # strip the anchor
  cleaned = link.to_s.gsub(/#.*$/, '')

  # the decode/encode round trip only runs on Rubies older than 2.5
  legacy_ruby = Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
  cleaned = URI.encode(URI.decode(cleaned)) if legacy_ruby

  target = URI(cleaned)
  origin = base || @url
  resolved = origin.merge(target)

  resolved.path = '/' if resolved.path.empty?
  resolved
end

#to_hashObject



192
193
194
195
196
197
198
199
200
201
202
203
204
205
# File 'lib/medusa/page.rb', line 192

#
# Flatten the page into a string-keyed hash (the counterpart of
# Page.from_hash). Headers and user data are Marshal-dumped; the URL,
# links, referer and redirect target are stored as strings.
#
def to_hash
  record = {}
  record['url'] = @url.to_s
  record['headers'] = Marshal.dump(@headers)
  record['data'] = Marshal.dump(@data)
  record['body'] = @body
  record['links'] = links.map(&:to_s)
  record['code'] = @code
  record['visited'] = @visited
  record['depth'] = @depth
  record['referer'] = @referer.to_s
  record['redirect_to'] = @redirect_to.to_s
  record['response_time'] = @response_time
  record['fetched'] = @fetched
  record
end