Class: WebpageArchivist::Fetcher::ElementRequest

Inherits:
Object
  • Object
show all
Defined in:
lib/webpage-archivist/fetcher/element_request.rb

Overview

Requesting a webpage

Direct Known Subclasses

StyleSheetRequest

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(request_element_type, uri, plumber) ⇒ ElementRequest

Returns a new instance of ElementRequest.



10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 10

# Build a request for one element.
#
# Looks up an already stored element for +uri+ through
# +request_element_type.filter+ and keeps the first match (possibly nil).
# The request starts in the :fetching status with no requesters registered.
#
# @param request_element_type the element type, must respond to +filter+
# @param uri the uri of the element to request
# @param plumber the plumber kept for later use by the request
def initialize(request_element_type, uri, plumber)
  if ::WebpageArchivist.log
    ::WebpageArchivist.debug "Creating request [#{uri}]"
  end

  @requesters = []
  @requesters_notified = false
  @status = :fetching

  @request_element_type = request_element_type
  @uri = uri
  @plumber = plumber

  # First stored element matching the uri, nil when there is none
  @element = request_element_type.filter(:uri => uri).first
end

Instance Attribute Details

#element ⇒ Object (readonly)

Returns the value of attribute element.



6
7
8
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 6

# Reader for the @element instance variable.
#
# @return the value of @element
def element
  @element
end

#status ⇒ Object

Returns the value of attribute status.



8
9
10
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 8

# Reader for the @status instance variable.
#
# @return the value of @status
def status
  @status
end

#uri ⇒ Object (readonly)

Returns the value of attribute uri.



6
7
8
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 6

# Reader for the @uri instance variable.
#
# @return the value of @uri
def uri
  @uri
end

Instance Method Details

#add_requester(requester) ⇒ Object

Add a requester to be notified when the request is over



66
67
68
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 66

# Register a requester on this request.
#
# @param requester the object appended to the list of requesters
# @return the list of requesters, including the new one
def add_requester(requester)
  @requesters.push(requester)
end

#content_not_changed(http) ⇒ Object

Content has not changed since last fetch



102
103
104
105
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 102

# Handle content that has not changed since the last fetch: the request
# is marked as :over and the requesters are notified.
#
# @param http the http request (not used here)
# @return the result of notify_requesters
def content_not_changed(http)
  self.status = :over
  notify_requesters
end

#notify_requesters ⇒ Object



116
117
118
119
120
121
122
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 116

# Tell every registered requester that this request is over.
#
# Each requester receives +request_over+ with the uri of the request.
# The notification is sent only once: later calls do nothing.
#
# @return [true, nil] true when the requesters were notified by this call,
#   nil when they had already been notified
def notify_requesters
  return if @requesters_notified

  ::WebpageArchivist.debug "[#{@uri}] notify #{@requesters.length}" if ::WebpageArchivist.log
  @requesters.each { |r| r.request_over(@uri) }
  # Must be the very variable checked by the guard above, otherwise the
  # requesters are notified again on every call
  @requesters_notified = true
end

#process_response(http) ⇒ Object

Process the response



71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 71

# Process the response according to its status code.
#
# * 304, 408 or 0 (not changed or connection error): the fetch date of the
#   known element, if any, is refreshed, then content_not_changed is called.
# * 200: the known element is updated, or a new element is created through
#   the element type and the plumber, then save_content_end_request is called.
# * anything else: the request is marked as :over and requesters are notified.
#
# @param http the http request, must expose +response_header+
def process_response(http)
  code = http.response_header.status
  ::WebpageArchivist.debug "[#{@uri}] returned #{code}" if ::WebpageArchivist.log

  case code
  when 304, 408, 0
    # Not changed or connection error
    element.update(:last_fetched => DateTime.now) if element
    content_not_changed http
  when 200
    if element
      # Fall back to the current date when the server sent no modification date
      element.update(
        :last_fetched => DateTime.now,
        :last_modified => http.response_header.last_modified || DateTime.now.rfc2822
      )
    else
      content_type = http.response_header[EventMachine::HttpClient::CONTENT_TYPE]
      extension = @request_element_type.extention(@uri, content_type)
      @element = @request_element_type.create(
        :webpage => @plumber.webpage,
        :uri => @uri,
        :file_hash => @plumber.get_hash(@request_element_type, @uri, extension),
        :extension => extension,
        :last_fetched => DateTime.now,
        :last_modified => @plumber.last_modified(http)
      )
    end
    save_content_end_request http
  else
    self.status = :over
    notify_requesters
  end
end

#save_content_end_request(http) ⇒ Object

Content has changed: save the content and end the request



108
109
110
111
112
113
114
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 108

# Content has changed: hand the response body to the element so it is
# saved, mark the request as :over and notify the requesters.
#
# @param http the http request, must expose +response+
# @return the result of notify_requesters
def save_content_end_request(http)
  if ::WebpageArchivist.log
    ::WebpageArchivist.debug "[#{@uri}] writing content to #{element.file_name}"
  end

  element.save_content(http.response)
  self.status = :over
  notify_requesters
end

#start(retries = 3) ⇒ Object

Start the request. This is not done in initialize so we can register the request before executing it and throttle the number of connections.

retries

number of retries in case of error



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 26

# Start the request.
#
# When an element is already known, its last modification date is sent in
# an If-Modified-Since header and its file name is registered with the
# plumber. A 500 or 503 response, or a failed request, is retried while
# retries are left; otherwise the response goes to will_process_response.
#
# @param retries number of retries in case of error
def start(retries = 3)
  ::WebpageArchivist.debug "Starting request [#{uri}]" if ::WebpageArchivist.log

  headers = {'accept-encoding' => 'gzip, compressed'}
  if element
    headers['If-Modified-Since'] = element.last_modified
    @plumber.register_file_name element.file_name
  end

  # Original author's note: don't overflow the servers or they will kick us out
  http = EventMachine::HttpRequest.new(uri).get :redirects => 5, :timeout => 30, :head => headers

  # Either start again with one retry less or hand the response over
  retry_or_process = lambda do |try_again|
    if try_again
      start(retries - 1)
    else
      will_process_response http
    end
  end

  http.callback do
    retry_or_process.call([500, 503].include?(http.response_header.status) && retries > 0)
  end
  http.errback do
    ::WebpageArchivist.debug "[#{@uri}] errback" if ::WebpageArchivist.log
    retry_or_process.call(retries > 0)
  end
end

#will_process_response(http) ⇒ Object

Call process_response and ensure management is done



54
55
56
57
58
59
60
61
62
63
# File 'lib/webpage-archivist/fetcher/element_request.rb', line 54

# Call process_response and make sure the bookkeeping is always done.
#
# Errors raised while processing are logged (when logging is enabled) and
# the requesters are still notified. Only StandardError is rescued, so that
# interpreter-level exceptions (SystemExit, Interrupt, NoMemoryError, ...)
# are not swallowed. In every case, including when an exception propagates,
# the plumber is told that the request has ended.
#
# @param http the http request passed on to process_response
def will_process_response(http)
  process_response http
rescue StandardError => e
  ::WebpageArchivist.error e if ::WebpageArchivist.log
  notify_requesters
ensure
  @plumber.request_ended
end