Class: Creepin::CollectionCreeper
- Inherits: Object
  - Object
    - Creepin::CollectionCreeper
- Defined in: lib/creepin/collection_creeper.rb
Instance Attribute Summary
-
#finished_at ⇒ Object
Returns the value of attribute finished_at.
-
#loaded_collection ⇒ Object
Returns the value of attribute loaded_collection.
-
#requested_urls ⇒ Object
Returns the value of attribute requested_urls.
-
#started_at ⇒ Object
Returns the value of attribute started_at.
-
#stats ⇒ Object
Returns the value of attribute stats.
-
#total_pages ⇒ Object
Returns the value of attribute total_pages.
-
#total_records ⇒ Object
Returns the value of attribute total_records.
Instance Method Summary
- #after_collection_loaded_callbacks? ⇒ Boolean
- #after_crawl_finished_callbacks? ⇒ Boolean
- #before_crawl_finished_callbacks? ⇒ Boolean
- #build_request_params(param_string) ⇒ Object
- #crawl_next_page ⇒ Object
- #default_params? ⇒ Boolean
- #full_request_url(base_url, request_params) ⇒ Object
- #initialize(params = {}) ⇒ CollectionCreeper (constructor) — Returns a new instance of CollectionCreeper.
- #load_resource(collected_attributes_hash, resource_klass) ⇒ Object
- #load_response_collection ⇒ Object
- #map_response_collection ⇒ Object
- #next_page? ⇒ Boolean
- #parse_response ⇒ Object
- #resource_load_strategy? ⇒ Boolean
- #resource_save_strategy? ⇒ Boolean
- #response_collection? ⇒ Boolean
- #run_after_collection_loaded_callbacks ⇒ Object
- #run_after_crawl_callbacks ⇒ Object
- #run_after_crawl_finished_callbacks ⇒ Object
- #save_resource(collected_attributes_hash, resource) ⇒ Object
- #skip_resource_save? ⇒ Boolean
- #transmit ⇒ Object
Constructor Details
#initialize(params = {}) ⇒ CollectionCreeper
Returns a new instance of CollectionCreeper.
6 7 8 9 10 11 12 13 |
# File 'lib/creepin/collection_creeper.rb', line 6
#
# Sets up the request parameters and the crawl bookkeeping for a new creeper.
# Each piece of state only receives its starting value when it is not
# already set.
#
# @param params [Hash] request parameters; a blank value keeps whatever
#   @params already holds, or an empty hash when it holds nothing
# @return [CollectionCreeper] a new instance of CollectionCreeper
def initialize(params = {})
  @params = params.present? ? params : (@params || {})
  @total_records ||= 0
  @total_pages ||= 0
  @loaded_collection ||= []
  @requested_urls ||= []
end
Instance Attribute Details
#finished_at ⇒ Object
Returns the value of attribute finished_at.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the finished_at attribute.
#
# @return [Object] the current value of @finished_at (no assignment to it is
#   visible in this listing, so it is set elsewhere)
def finished_at
  @finished_at
end
#loaded_collection ⇒ Object
Returns the value of attribute loaded_collection.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the loaded_collection attribute. The constructor starts it as an
# empty array and #map_response_collection appends one entry per mapped
# element.
#
# @return [Object] the current value of @loaded_collection
def loaded_collection
  @loaded_collection
end
#requested_urls ⇒ Object
Returns the value of attribute requested_urls.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the requested_urls attribute. The constructor starts it as an
# empty array and #transmit appends the full URL of every request it makes.
#
# @return [Object] the current value of @requested_urls
def requested_urls
  @requested_urls
end
#started_at ⇒ Object
Returns the value of attribute started_at.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the started_at attribute.
#
# @return [Object] the current value of @started_at (no assignment to it is
#   visible in this listing, so it is set elsewhere)
def started_at
  @started_at
end
#stats ⇒ Object
Returns the value of attribute stats.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the stats attribute.
#
# @return [Object] the current value of @stats (no assignment to it is
#   visible in this listing, so it is set elsewhere)
def stats
  @stats
end
#total_pages ⇒ Object
Returns the value of attribute total_pages.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the total_pages attribute. The constructor starts it at 0 and
# #transmit adds one for every request it makes.
#
# @return [Object] the current value of @total_pages
def total_pages
  @total_pages
end
#total_records ⇒ Object
Returns the value of attribute total_records.
4 5 6 |
# File 'lib/creepin/collection_creeper.rb', line 4
#
# Reader for the total_records attribute. The constructor starts it at 0 and
# #map_response_collection adds one for every element it maps.
#
# @return [Object] the current value of @total_records
def total_records
  @total_records
end
Instance Method Details
#after_collection_loaded_callbacks? ⇒ Boolean
37 38 39 |
# File 'lib/creepin/collection_creeper.rb', line 37
#
# Tells whether any after-collection-loaded callbacks are available.
#
# @return [Boolean] true only when the object responds to
#   after_collection_loaded_callbacks and that collection is not empty
def after_collection_loaded_callbacks?
  return false unless respond_to?(:after_collection_loaded_callbacks)

  !after_collection_loaded_callbacks.empty?
end
#after_crawl_finished_callbacks? ⇒ Boolean
29 30 31 |
# File 'lib/creepin/collection_creeper.rb', line 29
#
# Tells whether any after-crawl-finished callbacks are available.
#
# @return [Boolean] true only when the object responds to
#   after_crawl_finished_callbacks and that collection is not empty
def after_crawl_finished_callbacks?
  return false unless respond_to?(:after_crawl_finished_callbacks)

  !after_crawl_finished_callbacks.empty?
end
#before_crawl_finished_callbacks? ⇒ Boolean
33 34 35 |
# File 'lib/creepin/collection_creeper.rb', line 33
#
# Tells whether any before-crawl-finished callbacks are available.
#
# @return [Boolean] true only when the object responds to
#   before_crawl_finished_callbacks and that collection is not empty
def before_crawl_finished_callbacks?
  return false unless respond_to?(:before_crawl_finished_callbacks)

  !before_crawl_finished_callbacks.empty?
end
#build_request_params(param_string) ⇒ Object
57 58 59 60 |
# File 'lib/creepin/collection_creeper.rb', line 57
#
# Derives the parameters for the next request from a URL or query string.
# Everything after the last '?' (or the whole string when there is no '?')
# is parsed as a query string.
#
# @param param_string [String] URL or query string to read parameters from
# @return [Hash, nil] the new @request_params ({ :query => ... }), or nil
#   when nothing was parsed, in which case @request_params is left untouched
def build_request_params(param_string)
  query_string = param_string.split('?').last
  parsed = Rack::Utils.parse_query(query_string)
  return unless parsed.present?

  @request_params = { :query => parsed.with_indifferent_access }
end
#crawl_next_page ⇒ Object
41 42 43 44 45 46 47 |
# File 'lib/creepin/collection_creeper.rb', line 41
#
# Decides what happens after a page has been processed: crawl again when
# #next_page? reports another page, otherwise call collection_loaded.
# Both crawl and collection_loaded are defined outside this listing.
#
# @return [Object] whatever crawl or collection_loaded returns
def crawl_next_page
  return crawl if next_page?

  collection_loaded
end
#default_params? ⇒ Boolean
108 109 110 |
# File 'lib/creepin/collection_creeper.rb', line 108
#
# Tells whether a default_params method is available on this object.
# Only the method's existence is checked, not the value it returns.
#
# @return [Boolean] true when the object responds to default_params
def default_params?
  respond_to?(:default_params)
end
#full_request_url(base_url, request_params) ⇒ Object
62 63 64 |
# File 'lib/creepin/collection_creeper.rb', line 62
#
# Builds the complete URL of a request: the base URL followed by the query
# parameters as a form-encoded query string.
#
# Keys and values are percent-encoded (a value containing '&', '=' or a
# space no longer corrupts the URL, and array values become repeated keys).
# A missing or empty query returns the base URL without a dangling '?', and
# a base URL that already carries a query string is extended with '&'.
#
# @param base_url [String] URL the request is sent to
# @param request_params [Hash] request options; only the :query entry is read
# @return [String] the base URL plus the encoded query string
def full_request_url(base_url, request_params)
  require 'uri' # standard library; only loads on the first call

  query = request_params[:query]
  return base_url.dup if query.nil? || query.empty?

  joiner = base_url.include?('?') ? '&' : '?'
  "#{base_url}#{joiner}#{URI.encode_www_form(query)}"
end
#load_resource(collected_attributes_hash, resource_klass) ⇒ Object
80 81 82 83 84 85 86 |
# File 'lib/creepin/collection_creeper.rb', line 80
#
# Produces the resource object for one set of collected attributes. When a
# resource_load_strategy is available it is called with the attributes and
# the class; otherwise the class is instantiated with the attributes.
#
# @param collected_attributes_hash [Hash] attributes collected for one element
# @param resource_klass [Class] class of the resource to load
# @return [Object] the strategy's return value, or a new resource_klass
def load_resource(collected_attributes_hash, resource_klass)
  return resource_klass.new(collected_attributes_hash) unless resource_load_strategy?

  resource_load_strategy.call(collected_attributes_hash, resource_klass)
end
#load_response_collection ⇒ Object
72 73 74 |
# File 'lib/creepin/collection_creeper.rb', line 72
#
# Runs the CSS selector (selector is defined outside this listing) against
# the parsed response and stores the matches in @response_collection.
#
# @return [Object] the matches returned by the document's css call
def load_response_collection
  document = @response_html.document
  @response_collection = document.css(selector)
end
#map_response_collection ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/creepin/collection_creeper.rb', line 112
#
# Turns every element of @response_collection into a resource. For each
# element it evaluates every block in element_mappings (in the context of
# this object, with the element as argument) to collect an attributes hash,
# loads a resource from it, counts it in @total_records, saves it and
# appends it to loaded_collection.
#
# element_mappings and resource_class are defined outside this listing.
#
# @return [Object] @response_collection, as returned by its #each
def map_response_collection
  @response_collection.each do |element|
    attributes = {}
    element_mappings.each_pair do |attribute, extractor|
      attributes[attribute] = instance_exec(element, &extractor)
    end

    resource = load_resource(attributes, resource_class.constantize)
    @total_records += 1

    saved = save_resource(attributes, resource)
    # Without a custom save strategy, save_resource returns the result of
    # resource.save (or nil when saving is skipped), not the record.
    # Collecting that value filled loaded_collection with booleans/nils, so
    # only a custom strategy's return value replaces the loaded resource.
    loaded_collection << (resource_save_strategy? ? saved : resource)
  end
end
#next_page? ⇒ Boolean
126 127 128 129 130 131 132 133 134 135 136 137 138 |
# File 'lib/creepin/collection_creeper.rb', line 126
#
# Looks for a following page in the current response and, when one exists,
# prepares the request parameters for it via #build_request_params.
#
# next_page_selector (defined outside this listing) is either a Proc, which
# is evaluated in the context of this object with the parsed document and
# must return the next page's URL, or a CSS selector for the link to the
# next page.
#
# An element matched by a selector string has no String methods, so handing
# it straight to #build_request_params (which calls split on its argument)
# raised; the URL is now read from the element's href attribute.
# NOTE(review): assumes the selector targets an element whose href holds the
# next page's URL — confirm against the creepers that use a string selector.
#
# @return [Boolean] true when a next-page URL was found, false otherwise
def next_page?
  return false if next_page_selector.nil?

  document = @response_html.document
  next_page_url =
    if next_page_selector.is_a?(Proc)
      instance_exec(document, &next_page_selector)
    else
      link = document.at_css(next_page_selector)
      link && link['href']
    end

  build_request_params(next_page_url) if next_page_url.present?
  @has_next_page = next_page_url.present?
end
#parse_response ⇒ Object
66 67 68 69 70 |
# File 'lib/creepin/collection_creeper.rb', line 66
#
# Parses the body of @response as an HTML document, extracts the matching
# elements and, when any were found, maps them to resources.
#
# @return [Object, nil] the result of #map_response_collection, or nil when
#   the response held no matching elements
def parse_response
  body = @response.body
  @response_html = Nokogiri::HTML::Document.parse(body)
  load_response_collection
  return unless response_collection?

  map_response_collection
end
#resource_load_strategy? ⇒ Boolean
100 101 102 |
# File 'lib/creepin/collection_creeper.rb', line 100
#
# Tells whether a resource_load_strategy method is available on this object.
# Only the method's existence is checked, not the value it returns.
#
# @return [Boolean] true when the object responds to resource_load_strategy
def resource_load_strategy?
  respond_to?(:resource_load_strategy)
end
#resource_save_strategy? ⇒ Boolean
104 105 106 |
# File 'lib/creepin/collection_creeper.rb', line 104
#
# Tells whether a resource_save_strategy method is available on this object.
# Only the method's existence is checked, not the value it returns.
#
# @return [Boolean] true when the object responds to resource_save_strategy
def resource_save_strategy?
  respond_to?(:resource_save_strategy)
end
#response_collection? ⇒ Boolean
76 77 78 |
# File 'lib/creepin/collection_creeper.rb', line 76
#
# Tells whether the last response yielded any elements to map.
#
# @return [Boolean] the result of present? on @response_collection
def response_collection?
  @response_collection.present?
end
#run_after_collection_loaded_callbacks ⇒ Object
25 26 27 |
# File 'lib/creepin/collection_creeper.rb', line 25
#
# Calls every after-collection-loaded callback, passing this object to each.
# Does nothing when no such callbacks are available.
#
# @return [Object, nil] the callbacks collection, or nil when there are none
def run_after_collection_loaded_callbacks
  return unless after_collection_loaded_callbacks?

  after_collection_loaded_callbacks.each do |hook|
    hook.call(self)
  end
end
#run_after_crawl_callbacks ⇒ Object
15 16 17 |
# File 'lib/creepin/collection_creeper.rb', line 15
#
# Hook for the step after a crawl: it sends the request by calling #transmit.
#
# @return [Object] whatever #transmit returns
def run_after_crawl_callbacks
  transmit
end
#run_after_crawl_finished_callbacks ⇒ Object
19 20 21 22 23 |
# File 'lib/creepin/collection_creeper.rb', line 19
#
# Hook for the step after a request has finished: parses the response, calls
# every after-crawl-finished callback with this object, then calls crawl_next.
#
# NOTE(review): crawl_next is not defined in this listing (only
# #crawl_next_page is) — presumably it is provided elsewhere; confirm.
#
# @return [Object] whatever crawl_next returns
def run_after_crawl_finished_callbacks
  parse_response
  if after_crawl_finished_callbacks?
    after_crawl_finished_callbacks.each { |hook| hook.call(self) }
  end
  crawl_next
end
#save_resource(collected_attributes_hash, resource) ⇒ Object
88 89 90 91 92 93 94 |
# File 'lib/creepin/collection_creeper.rb', line 88
#
# Persists one loaded resource. When a resource_save_strategy is available it
# is called with the attributes and the resource; otherwise the resource's
# own save is called, unless saving is skipped.
#
# @param collected_attributes_hash [Hash] attributes collected for the element
# @param resource [Object] the resource produced by #load_resource
# @return [Object, nil] the strategy's return value, the result of
#   resource.save, or nil when saving is skipped
def save_resource(collected_attributes_hash, resource)
  return resource_save_strategy.call(collected_attributes_hash, resource) if resource_save_strategy?
  return if skip_resource_save?

  resource.save
end
#skip_resource_save? ⇒ Boolean
96 97 98 |
# File 'lib/creepin/collection_creeper.rb', line 96
#
# Tells whether saving resources is skipped. Only the existence of a
# skip_resource_save method is checked, not the value it returns.
#
# @return [Boolean] true when the object responds to skip_resource_save
def skip_resource_save?
  respond_to?(:skip_resource_save)
end
#transmit ⇒ Object
49 50 51 52 53 54 55 |
# File 'lib/creepin/collection_creeper.rb', line 49
#
# Sends one GET request and records it. The request parameters are built
# once — @params merged over default_params when those are available —
# and reused until #build_request_params replaces them. The response is kept
# in @response, the full URL is appended to @requested_urls, @total_pages is
# incremented and crawl_finished is called.
#
# base_url and crawl_finished are defined outside this listing.
#
# @return [Object] whatever crawl_finished returns
def transmit
  @request_params ||= begin
    query = default_params? ? default_params.merge(@params) : @params
    { :query => query }
  end

  @response = HTTParty.get(base_url, @request_params)
  @requested_urls << full_request_url(base_url, @request_params)
  @total_pages += 1
  crawl_finished
end