Class: WebpageArchivist::Fetcher::WebpageRequest
- Inherits:
-
Object
- Object
- WebpageArchivist::Fetcher::WebpageRequest
- Defined in:
- lib/webpage-archivist/fetcher/webpage_request.rb
Overview
Requesting a webpage.
Instance Attribute Summary collapse
-
#instance ⇒ Object
readonly
Returns the value of attribute instance.
-
#result_code ⇒ Object
readonly
Returns the value of attribute result_code.
-
#status ⇒ Object
readonly
Returns the value of attribute status.
-
#uri ⇒ Object
readonly
Returns the value of attribute uri.
-
#webpage ⇒ Object
readonly
Returns the value of attribute webpage.
Instance Method Summary collapse
-
#after_requests ⇒ Object
Process the response once all the elements have been fetched.
-
#initialize(webpage, fetcher_watcher) ⇒ WebpageRequest
constructor
- Create a request.
- webpage: the Webpage we want to fetch.
- fetcher_watcher: the object to be notified when the request is over.
-
#make_absolute_if_modified(element, link_property) ⇒ Object
- Make an element’s uri absolute.
- element: the element.
- link_property: the property holding the uri.
-
#process_content ⇒ Object
Process the content.
-
#process_response ⇒ Object
Process the response.
-
#request_over(uri) ⇒ Object
- Called by a request when it is over.
- uri: the request uri.
-
#start(retries = 3) ⇒ Object
- Start the request. Not done in initialize so that the number of connections can be throttled.
- retries: number of retries in case of error.
Constructor Details
#initialize(webpage, fetcher_watcher) ⇒ WebpageRequest
Create a request
- webpage
-
the Webpage we want to fetch
- fetcher_watcher
-
to be notified when the request is over
15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 15
# Create a request. Nothing is fetched here; the HTTP request itself is
# issued by #start.
# webpage:: the Webpage we want to fetch
# fetcher_watcher:: the object to be notified when the request is over
def initialize(webpage, fetcher_watcher)
  @webpage = webpage
  @fetcher_watcher = fetcher_watcher

  # Number of element requests (stylesheets, scripts, images) still pending.
  @waiting_requests = 0
  @status = :fetching

  @uri = Addressable::URI.parse(webpage.uri)
  @plumber = RequestsPlumber.new(self)
end
Instance Attribute Details
#instance ⇒ Object (readonly)
Returns the value of attribute instance.
10 11 12 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 10
# Read-only accessor: returns the current value of @instance
# (nil until it has been assigned).
def instance
  @instance
end
#result_code ⇒ Object (readonly)
Returns the value of attribute result_code.
10 11 12 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 10
# Read-only accessor: returns the current value of @result_code
# (nil until it has been assigned).
def result_code
  @result_code
end
#status ⇒ Object (readonly)
Returns the value of attribute status.
10 11 12 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 10
# Read-only accessor: returns the current value of @status
# (nil until it has been assigned).
def status
  @status
end
#uri ⇒ Object (readonly)
Returns the value of attribute uri.
10 11 12 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 10
# Read-only accessor: returns the current value of @uri
# (nil until it has been assigned).
def uri
  @uri
end
#webpage ⇒ Object (readonly)
Returns the value of attribute webpage.
10 11 12 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 10
# Read-only accessor: returns the current value of @webpage
# (nil until it has been assigned).
def webpage
  @webpage
end
Instance Method Details
#after_requests ⇒ Object
Process the response once all the elements have been fetched
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 162
# Process the response once all the elements have been fetched.
#
# When the page was modified, every stylesheet, script and image whose request
# yielded an element has its link replaced by that element's file name, and the
# resulting HTML is saved on the webpage. In all cases the repository is then
# committed, an Instance is recorded with the same timestamp, and the fetcher
# watcher is told the request ended successfully.
def after_requests
  ::WebpageArchivist.debug "After requests #{@uri}" if ::WebpageArchivist.log

  if @modified
    # replace elements with the local uris of the elements
    @content.each_stylesheet do |stylesheet|
      # andand yields nil when the plumber has no request for this uri
      element = @plumber[stylesheet['href']].andand.element
      stylesheet['href'] = element.file_name if element
    end
    @content.each_script do |script|
      element = @plumber[script['src']].andand.element
      script['src'] = element.file_name if element
    end
    @content.each_image do |img|
      element = @plumber[img['src']].andand.element
      img['src'] = element.file_name if element
    end
    webpage.save_content @content.to_html
  end

  # One timestamp shared by the repository commit and the Instance record.
  commit_timestamp = DateTime.now.strftime('%Y/%m/%d %H:%M:%S')

  # store the content, clean the repo and commit
  webpage.update_repo_commit_changes @plumber.requests_files, commit_timestamp
  @instance = WebpageArchivist::Instance.create(:webpage => webpage, :commit_timestamp => commit_timestamp)
  @fetcher_watcher.end_request self, true
end
#make_absolute_if_modified(element, link_property) ⇒ Object
Make an element’s uri absolute
- element
-
the element
- link_property
-
the property holding the uri
142 143 144 145 146 147 148 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 142
# Make an element's uri absolute.
# When the page is not flagged as modified the element is left untouched;
# otherwise its link is overwritten with the result of uri.absolutize.
# element:: the element
# link_property:: the property holding the uri
# Returns the link value (rewritten if the page was modified).
def make_absolute_if_modified(element, link_property)
  current_link = element[link_property]
  return current_link unless @modified

  # NOTE(review): absolutize is not visible here; presumably resolves the link
  # against the page uri -- confirm where it is defined.
  element[link_property] = uri.absolutize(current_link)
end
#process_content ⇒ Object
Process the content
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 77
# Process the content.
#
# Builds @content from the fresh HTTP response when the page was modified, or
# from the webpage's last stored content otherwise. Then asks the plumber for
# each distinct stylesheet, script and image uri (duplicates are only skipped
# within one kind of element), counting every issued request in
# @waiting_requests. For a modified page the links are made absolute and the
# webpage record is updated. If nothing had to be requested, #after_requests
# runs immediately.
def process_content
  @content =
    if @modified
      encoding = @plumber.response_charset(@http)
      WebpageArchivist::HtmlDocument.new(@http.response, @uri, encoding)
    else
      # NOTE(review): this reads webpage.last_encoding while the update below
      # writes :last_charset -- confirm both names exist on the webpage model.
      WebpageArchivist::HtmlDocument.new(webpage.last_content, @uri, webpage.last_encoding)
    end

  # Elements from the stylesheets
  seen = Set.new
  @content.each_stylesheet do |node|
    target = make_absolute_if_modified(node, 'href')
    next unless seen.add?(target)
    @plumber.request_element self, WebpageArchivist::Stylesheet, target
    @waiting_requests += 1
  end

  # Elements from the scripts
  seen = Set.new
  @content.each_script do |node|
    target = make_absolute_if_modified(node, 'src')
    next unless seen.add?(target)
    @plumber.request_element self, WebpageArchivist::Script, target
    @waiting_requests += 1
  end

  # Elements from the images
  seen = Set.new
  @content.each_image do |node|
    target = make_absolute_if_modified(node, 'src')
    next unless seen.add?(target)
    @plumber.request_element self, WebpageArchivist::Image, target
    @waiting_requests += 1
  end

  if @modified
    # Make links absolute
    @content.each_link { |link| make_absolute_if_modified(link, 'href') }

    @webpage.update(
      :last_modified => @plumber.last_modified(@http),
      :last_content => @content.to_html,
      :last_charset => @content.charset)
  end

  # Only from this point on may #request_over trigger #after_requests.
  @status = :fetching_requests

  # No external resource -> end here
  after_requests if @waiting_requests == 0
end
#process_response ⇒ Object
Process the response
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 58
# Process the response.
#
# Stores the HTTP status in @result_code and dispatches on it:
# * 304, 408 and 0 are handled as "not modified" and the content is processed
#   with @modified set to false;
# * 200 is handled as "modified" and the content is processed with @modified
#   set to true;
# * any other status ends the request, reporting failure to the fetcher watcher.
def process_response
  @result_code = @http.response_header.status
  ::WebpageArchivist.debug "[#{@uri}] returned #{@result_code}" if ::WebpageArchivist.log

  case result_code
  when 304, 408, 0
    # Not changed
    ::WebpageArchivist.debug "[#{@uri}] not modified" if ::WebpageArchivist.log
    @modified = false
  when 200
    ::WebpageArchivist.debug "[#{@uri}] modified" if ::WebpageArchivist.log
    @modified = true
  else
    ::WebpageArchivist.debug "Error #{@uri} #{@result_code}" if ::WebpageArchivist.log
    return @fetcher_watcher.end_request(self, false)
  end

  process_content
end
#request_over(uri) ⇒ Object
Called by a request when it is over
- uri
-
the request uri
152 153 154 155 156 157 158 159 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 152
# Called by a request when it is over.
# Decrements the pending-request counter and, once the page is in the
# :fetching_requests state with nothing left pending, runs #after_requests.
# uri:: the request uri
def request_over(uri)
  @waiting_requests -= 1

  if ::WebpageArchivist.log
    ::WebpageArchivist.debug "Request over for [#{@uri}] on [#{uri}], missing #{@waiting_requests}"
  end

  collecting = (@status == :fetching_requests)
  drained = (@waiting_requests <= 0)
  after_requests if collecting && drained
end
#start(retries = 3) ⇒ Object
Start the request. Not done in initialize so that the number of connections can be throttled.
- retries
-
number of retries in case of error
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# File 'lib/webpage-archivist/fetcher/webpage_request.rb', line 31
# Start the request.
# Not done in initialize so that the number of connections can be throttled.
#
# The request is re-issued while retries remain when the server answers 500 or
# 503, or when the request itself errors. Otherwise the response is handed to
# #process_response.
# retries:: number of retries in case of error
def start retries = 3
  # NOTE(review): 'compressed' is sent as-is; the registered content-coding
  # token is 'compress' -- confirm this value is intentional.
  @http = EventMachine::HttpRequest.new(@uri).get :redirects => 5, :timeout => 30, :head => {'If-Modified-Since' => webpage.last_modified, 'accept-encoding' => 'gzip, compressed'}

  # Shared by the success and the error paths. Only StandardError is rescued so
  # that signals, exits and other non-StandardError exceptions are no longer
  # swallowed inside the callbacks.
  # NOTE(review): when processing fails the fetcher watcher is not notified
  # here -- confirm whether that is intended.
  handle_response = lambda do
    begin
      process_response
    rescue StandardError => e
      ::WebpageArchivist.error e if ::WebpageArchivist.log
    end
  end

  @http.callback do
    if ([500, 503].include? @http.response_header.status) && (retries > 0)
      start(retries - 1)
    else
      handle_response.call
    end
  end

  @http.errback do
    if retries > 0
      start(retries - 1)
    else
      handle_response.call
    end
  end
end