Class: WebpageArchivist::Fetcher::RequestsPlumber

Inherits:
Object
  • Object
show all
Defined in:
lib/webpage-archivist/fetcher/requests_plumber.rb

Overview

Contains the plumbing for the fetching code

Constant Summary collapse

MAX_RUNNING_REQUESTS =
(ENV['ARCHIVIST_MAX_RUNNING_REQUESTS'].andand.to_i || 20)
@@next_tick =
false
@@waiting_requests =
[]
@@running_requests =
0

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(webpage_request) ⇒ RequestsPlumber

Returns a new instance of RequestsPlumber.



16
17
18
19
20
21
22
23
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 16

# Build the plumbing for one webpage fetch.
#
# webpage_request:: the request for the page itself; it is kept for later
#                   access and appended to the shared waiting queue
#
# After queuing, RequestsPlumber.new_request is invoked (defined outside this
# method; presumably it schedules the waiting requests -- confirm).
def initialize(webpage_request)
  @webpage_request = webpage_request

  # Per-page bookkeeping: element requests by uri, issued hashes, file names.
  @requests = {}
  @requests_hashes = Set.new
  @requests_files = Set.new

  @@waiting_requests << webpage_request
  RequestsPlumber.new_request
end

Instance Attribute Details

#requests_filesObject (readonly)

Returns the value of attribute requests_files.



10
11
12
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 10

# Read-only access to the collection of file names registered for this page.
# Entries are added by get_hash and register_file_name.
def requests_files
  @requests_files
end

Instance Method Details

#[](key) ⇒ Object

Access an element request by its uri



31
32
33
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 31

# Look up the element request registered under a uri.
#
# key:: the uri the request was registered with
#
# Returns whatever is stored in the requests table for +key+
# (nil when nothing was registered, since the table has no default).
def [](key)
  @requests[key]
end

#get_hash(type, uri, extension) ⇒ Object

Get the hash corresponding to a URI and make sure there is no collision



77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 77

# Compute the file hash for a uri, making sure it does not collide with a hash
# already issued by this plumber or already stored for the element type.
#
# type:: object answering filter(:file_hash => hash) with something that
#        responds to count (presumably a Sequel model -- confirm)
# uri:: the uri to hash
# extension:: appended to the hash to build the registered file name
#
# The first candidate is the SHA1 of the uri; on collision the SHA1 of the uri
# followed by 0, 1, 2... is tried until a free value is found.
#
# Returns the chosen hex digest. As a side effect the digest is remembered and
# "<digest><extension>" is added to the page's file names.
def get_hash(type, uri, extension)
  # A candidate is taken when this plumber already issued it or when +type+
  # already has a record using it.
  taken = lambda do |candidate|
    @requests_hashes.include?(candidate) || (type.filter(:file_hash => candidate).count > 0)
  end

  file_hash = Digest::SHA1.hexdigest(uri)
  suffix = 0
  while taken.call(file_hash)
    file_hash = Digest::SHA1.hexdigest("#{uri}#{suffix}")
    suffix += 1
  end

  # Remember the issued hash; without this the in-memory check above could
  # never match anything.
  @requests_hashes << file_hash
  @requests_files << "#{file_hash}#{extension}"
  file_hash
end

#last_modified(http) ⇒ Object



107
108
109
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 107

# Last-modified value to use for a response.
#
# http:: object whose response_header answers last_modified
#
# Returns the header's last_modified value when it is set, otherwise the
# current date and time formatted as RFC 2822.
def last_modified(http)
  from_header = http.response_header.last_modified
  from_header || DateTime.now.rfc2822
end

#register_file_name(file_name) ⇒ Object

Register a filename so it is considered part of the webpage



92
93
94
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 92

# Record a file name as belonging to the webpage being fetched.
#
# file_name:: the name to add to the page's file names
#
# Returns the collection of file names.
def register_file_name(file_name)
  @requests_files.add(file_name)
end

#request_element(requester, request_element_type, uri) ⇒ Object

Request an element to be fetched. Once the fetch is over, request_over will be called on the requester with the uri.

requester

the element doing the request

request_element_type

the type of the requested element

uri

the requested uri



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 40

# Ask for an element to be fetched on behalf of a requester.
#
# requester:: the element doing the request; it must answer uri, and
#             request_over(uri) is called on it when the element is
#             already fetched
# request_element_type:: the type of the requested element; stylesheets get
#                        a StyleSheetRequest, anything else an ElementRequest
# uri:: the requested uri
#
# Three cases:
# * a request for this uri exists and is over: the requester is notified now
# * a request exists but is not over: the requester is added to it
# * no request exists: one is created, stored, queued, and
#   RequestsPlumber.new_request is invoked
def request_element(requester, request_element_type, uri)
  ::WebpageArchivist.debug "Requesting [#{uri}] for [#{requester.uri}]" if ::WebpageArchivist.log

  known = @requests[uri]
  if known
    if known.status == :over
      ::WebpageArchivist.debug "Request already done" if ::WebpageArchivist.log
      return requester.request_over(uri)
    end

    ::WebpageArchivist.debug "Adding to requesters" if ::WebpageArchivist.log
    return known.add_requester(requester)
  end

  ::WebpageArchivist.debug "Creating new request" if ::WebpageArchivist.log
  created =
    if request_element_type == WebpageArchivist::Stylesheet
      StyleSheetRequest.new(uri, self)
    else
      ElementRequest.new(request_element_type, uri, self)
    end
  @requests[uri] = created
  created.add_requester(requester)

  @@waiting_requests << created

  # try registering for the next tick
  RequestsPlumber.new_request
end

#request_endedObject

Notify that a request has ended so that another one can be started



69
70
71
72
73
74
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 69

# Signal that one running request has finished.
#
# The shared running-requests counter is decremented, and when requests are
# still waiting RequestsPlumber.new_request is invoked.
def request_ended
  @@running_requests -= 1
  RequestsPlumber.new_request unless @@waiting_requests.empty?
end

#response_charset(http) ⇒ Object

Get the charset of a response, may be nil



97
98
99
100
101
102
103
104
105
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 97

# Extract the charset announced in a response's content type header.
#
# http:: object whose response_header can be indexed with
#        EventMachine::HttpClient::CONTENT_TYPE
#
# Returns the charset in upper case, or nil when the header is absent or
# carries no charset parameter.
def response_charset(http)
  content_type = http.response_header[EventMachine::HttpClient::CONTENT_TYPE]
  return nil unless content_type

  # Capture only the charset token: optional quotes are skipped and the value
  # stops at whitespace or at the next ";" so that trailing parameters
  # (e.g. "; boundary=x") do not end up in the result.
  match = /;\s*charset\s*=\s*["']?([^\s;"']+)/i.match(content_type)
  match && match[1].upcase
end

#webpageObject

The page being fetched



26
27
28
# File 'lib/webpage-archivist/fetcher/requests_plumber.rb', line 26

# The page being fetched: delegates to the webpage of the request that was
# passed to the constructor.
def webpage
  @webpage_request.webpage
end