Class: Scruber::QueueAdapters::AbstractAdapter::Page
- Defined in:
- lib/scruber/queue_adapters/abstract_adapter.rb
Overview
Queue page wrapper
Direct Known Subclasses
Instance Attribute Summary collapse
-
#body ⇒ Object
Returns the value of attribute body.
-
#enqueued_at ⇒ Integer
Timestamp at which the page was added to the queue.
-
#fetched_at ⇒ Integer
Download completion timestamp.
-
#fetcher_agent_id ⇒ Object
ID of the FetcherAgent assigned to this page.
-
#headers ⇒ Hash
Headers for requesting this page.
-
#id ⇒ Object
ID of page.
-
#max_retry_times ⇒ Integer
Max number of download attempts.
-
#method ⇒ String
Request method: post, get, or head.
-
#options ⇒ Hash
All options.
-
#page_type ⇒ String
Page type.
-
#priority ⇒ Integer
Priority of page in queue for fetcher.
-
#processed_at ⇒ Integer
Timestamp at which the page was processed by the parser.
-
#proxy_id ⇒ Object
ID of the proxy assigned to this page.
-
#queue ⇒ Scruber::QueueAdapters::AbstractAdapter::Page
Queue object.
-
#response_body ⇒ String
Response body.
-
#response_code ⇒ Integer
Response code.
-
#response_headers ⇒ Hash
Response headers.
-
#response_total_time ⇒ Float
Response total time.
-
#retry_at ⇒ Integer
Earliest timestamp at which the next retry may happen.
-
#retry_count ⇒ Integer
Number of download attempts.
-
#url ⇒ String
URL of page.
-
#user_agent ⇒ String
Fixed User-Agent for requesting this page.
Instance Method Summary collapse
- #[](k) ⇒ Object
-
#delete ⇒ void
Delete page from queue.
-
#fetcher_agent ⇒ Scruber::Helpers::FetcherAgent
Returns the FetcherAgent assigned to this page.
-
#initialize(queue, options = {}) ⇒ Page
constructor
A new instance of Page.
-
#processed! ⇒ void
Mark page as processed by parser and save it.
-
#proxy ⇒ Proxy
Returns the proxy assigned to this page.
-
#redownload!(new_retry_count = nil) ⇒ void
Mark page as pending and return to queue.
-
#response_cookies ⇒ Array
Returns cookies from response headers.
- #save ⇒ Object
-
#sent_to_redownload? ⇒ Boolean
Whether the page has been marked for redownloading.
-
#url_join(link_url) ⇒ String
Join url of current page with another path or url.
Constructor Details
#initialize(queue, options = {}) ⇒ Page
Returns a new instance of Page.
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 60
#
# Builds a page from its owning queue object and a hash of attributes.
#
# The options hash is converted with +with_indifferent_access+ and kept in
# @options; every attribute is then read from it with +fetch+, falling back
# to the default given in the block when the key is absent.
#
# @param queue [Object] queue object this page belongs to (stored in @queue)
# @param options [Hash] page attributes; only :url is mandatory
# @raise [RuntimeError] "URL not provided" when options has no :url key
def initialize(queue, options = {})
  @queue = queue
  options = options.with_indifferent_access
  @options = options
  @id = options.fetch(:id) { generate_page_id }
  @url = options.fetch(:url) { raise "URL not provided" }
  @method = options.fetch(:method) { :get }
  @user_agent = options.fetch(:user_agent) { nil }
  @body = options.fetch(:body) { nil }
  @headers = options.fetch(:headers) { {} }
  @fetcher_agent_id = options.fetch(:fetcher_agent_id) { nil }
  @proxy_id = options.fetch(:proxy_id) { nil }
  @response_body = options.fetch(:response_body) { nil }
  @response_code = options.fetch(:response_code) { nil }
  @response_headers = options.fetch(:response_headers) { {} }
  @response_total_time = options.fetch(:response_total_time) { nil }
  @retry_at = options.fetch(:retry_at) { 0 }
  @fetched_at = options.fetch(:fetched_at) { 0 }
  @retry_count = options.fetch(:retry_count) { 0 }
  @max_retry_times = options.fetch(:max_retry_times) { nil }
  @enqueued_at = options.fetch(:enqueued_at) { 0 }
  @page_type = options.fetch(:page_type) { :seed }
  # @queue = options.fetch(:queue) { 'default' }
  @priority = options.fetch(:priority) { 0 }
  @processed_at = options.fetch(:processed_at) { 0 }

  # `false` (rather than nil) marks "not looked up yet" for the lazy
  # fetcher-agent and proxy caches, so a looked-up nil can be cached too.
  @_fetcher_agent = false
  @_proxy = false
  @_redownload = false
end
Instance Attribute Details
#body ⇒ Object
Returns the value of attribute body.
37 38 39 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 37
#
# Returns the value of attribute body.
#
# @return [Object] current value of @body
def body
  @body
end
#enqueued_at ⇒ Integer
Timestamp added to the queue
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Timestamp at which the page was added to the queue.
#
# @return [Integer] current value of @enqueued_at
def enqueued_at
  @enqueued_at
end
#fetched_at ⇒ Integer
Download completion timestamp
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Download completion timestamp.
#
# @return [Integer] current value of @fetched_at
def fetched_at
  @fetched_at
end
#fetcher_agent_id ⇒ Object
ID of FetcherAgent, assigned to this page
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# ID of the FetcherAgent assigned to this page.
#
# @return [Object] current value of @fetcher_agent_id
def fetcher_agent_id
  @fetcher_agent_id
end
#headers ⇒ Hash
Headers for requesting this page
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Headers for requesting this page.
#
# @return [Hash] current value of @headers
def headers
  @headers
end
#id ⇒ Object
ID of page. Will be autogenerated if not passed
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# ID of page. Will be autogenerated if not passed.
#
# @return [Object] current value of @id
def id
  @id
end
#max_retry_times ⇒ Integer
Max number of download attempts
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Max number of download attempts.
#
# @return [Integer] current value of @max_retry_times
def max_retry_times
  @max_retry_times
end
#method ⇒ String
Request method, post, get, head
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Request method: post, get, or head.
#
# @return [String] current value of @method
def method
  @method
end
#options ⇒ Hash
All options
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# All options.
#
# @return [Hash] current value of @options
def options
  @options
end
#page_type ⇒ String
Page type
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Page type.
#
# @return [String] current value of @page_type
def page_type
  @page_type
end
#priority ⇒ Integer
Priority of page in queue for fetcher
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Priority of page in queue for fetcher.
#
# @return [Integer] current value of @priority
def priority
  @priority
end
#processed_at ⇒ Integer
Processed by parser timestamp
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Timestamp at which the page was processed by the parser.
#
# @return [Integer] current value of @processed_at
def processed_at
  @processed_at
end
#proxy_id ⇒ Object
ID of proxy, assigned to this page
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# ID of the proxy assigned to this page.
#
# @return [Object] current value of @proxy_id
def proxy_id
  @proxy_id
end
#queue ⇒ Scruber::QueueAdapters::AbstractAdapter::Page
Queue object
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Queue object.
#
# @return [Object] current value of @queue
def queue
  @queue
end
#response_body ⇒ String
Response body
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Response body.
#
# @return [String] current value of @response_body
def response_body
  @response_body
end
#response_code ⇒ Integer
Response code
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Response code.
#
# @return [Integer] current value of @response_code
def response_code
  @response_code
end
#response_headers ⇒ Hash
Response headers
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Response headers.
#
# @return [Hash] current value of @response_headers
def response_headers
  @response_headers
end
#response_total_time ⇒ Float
Response total time
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Response total time.
#
# @return [Float] current value of @response_total_time
def response_total_time
  @response_total_time
end
#retry_at ⇒ Integer
Minimal timestamp of next retry
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Minimal timestamp of next retry.
#
# @return [Integer] current value of @retry_at
def retry_at
  @retry_at
end
#retry_count ⇒ Integer
Number of download attempts
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Number of download attempts.
#
# @return [Integer] current value of @retry_count
def retry_count
  @retry_count
end
#url ⇒ String
URL of page
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# URL of page.
#
# @return [String] current value of @url
def url
  @url
end
#user_agent ⇒ String
Fixed User-Agent for requesting this page
36 37 38 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 36
#
# Fixed User-Agent for requesting this page.
#
# @return [String] current value of @user_agent
def user_agent
  @user_agent
end
Instance Method Details
#[](k) ⇒ Object
147 148 149 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 147
#
# Hash-style read access to the page's instance variables.
#
# @param k [String, Symbol] attribute name without the leading "@"
# @return [Object] value of the instance variable named "@<k>"
def [](k)
  # Interpolation already calls #to_s on k, so no explicit conversion is needed.
  instance_variable_get("@#{k}")
end
#delete ⇒ void
This method returns an undefined value.
Delete page from queue
155 156 157 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 155
#
# Delete page from queue.
#
# This implementation only raises; it does no deletion itself.
#
# @return [void]
# @raise [NotImplementedError] always
def delete
  raise NotImplementedError
end
#fetcher_agent ⇒ Scruber::Helpers::FetcherAgent
Returns the FetcherAgent assigned to this page
96 97 98 99 100 101 102 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 96
#
# Returns the FetcherAgent assigned to this page, looking it up on first use.
#
# @_fetcher_agent uses `false` as the "not looked up yet" marker so that a
# nil result (no @fetcher_agent_id, or whatever +find+ returned) is cached
# and the lookup is not repeated.
#
# @return [Scruber::Helpers::FetcherAgent, nil] cached lookup result
def fetcher_agent
  # Explicit comparison with false is intentional: nil is a valid cached value.
  return @_fetcher_agent unless @_fetcher_agent == false

  @_fetcher_agent = @fetcher_agent_id ? Scruber::Helpers::FetcherAgent.find(@fetcher_agent_id) : nil
end
#processed! ⇒ void
This method returns an undefined value.
Mark page as processed by parser and save it
163 164 165 166 167 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 163
#
# Mark page as processed by parser and save it.
#
# Stamps @processed_at with the current Unix time in seconds, clears the
# redownload flag, then calls #save.
#
# @return [void]
def processed!
  @processed_at = Time.now.to_i
  @_redownload = false
  save
end
#proxy ⇒ Proxy
Returns the proxy assigned to this page
108 109 110 111 112 113 114 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 108
#
# Returns the proxy assigned to this page, looking it up on first use.
#
# @_proxy uses `false` as the "not looked up yet" marker so that a nil
# result (no @proxy_id, or whatever +find+ returned) is cached and the
# lookup is not repeated.
#
# @return [Proxy, nil] cached lookup result
def proxy
  # Explicit comparison with false is intentional: nil is a valid cached value.
  return @_proxy unless @_proxy == false

  @_proxy = @proxy_id ? Scruber::Helpers::ProxyRotator.find(@proxy_id) : nil
end
#redownload!(new_retry_count = nil) ⇒ void
This method returns an undefined value.
Mark page as pending and return to queue
175 176 177 178 179 180 181 182 183 184 185 186 187 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 175
#
# Mark page as pending and return to queue.
#
# Sets the redownload flag, resets @processed_at and @fetched_at to 0,
# clears @response_body, updates @retry_count and then calls #save.
#
# @param new_retry_count [Integer, nil] value to store in @retry_count;
#   when nil (or false) the current @retry_count is incremented by one
# @return [void]
def redownload!(new_retry_count = nil)
  @_redownload = true
  @processed_at = 0
  # 0 is truthy in Ruby, so passing 0 explicitly resets the counter.
  @retry_count = new_retry_count || (@retry_count + 1)
  @fetched_at = 0
  @response_body = nil
  save
end
#response_cookies ⇒ Array
Returns cookies from response headers
120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 120
#
# Returns cookies from response headers.
#
# Reads the 'Set-Cookie' entry of #response_headers and always hands back an
# Array: empty when the entry is blank, the entry itself when it is already
# an Array, otherwise the single value wrapped in an Array.
#
# @return [Array] cookie values taken from the 'Set-Cookie' response header
def response_cookies
  cookies = response_headers['Set-Cookie']
  return [] if cookies.blank?

  cookies.is_a?(Array) ? cookies : [cookies]
end
#save ⇒ Object
133 134 135 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 133
#
# Persist the page. This implementation only raises; it stores nothing.
#
# @raise [NotImplementedError] always
def save
  raise NotImplementedError
end
#sent_to_redownload? ⇒ Boolean
Whether the page has been marked for redownloading
193 194 195 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 193
#
# Whether the page has been marked for redownloading.
#
# @return [Boolean] current value of the @_redownload flag
def sent_to_redownload?
  @_redownload
end
#url_join(link_url) ⇒ String
Join url of current page with another path or url
143 144 145 |
# File 'lib/scruber/queue_adapters/abstract_adapter.rb', line 143
#
# Join url of current page with another path or url.
#
# Resolution is delegated to URI.join with this page's #url as the base.
#
# @param link_url [String] relative path or absolute URL to resolve
# @return [String] the joined URL as a String
def url_join(link_url)
  URI.join(url, link_url).to_s
end