Class: Creepin::CollectionCreeper

Inherits:
Object
  • Object
show all
Defined in:
lib/creepin/collection_creeper.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(params = {}) ⇒ CollectionCreeper

Returns a new instance of CollectionCreeper.



6
7
8
9
10
11
12
13
# File 'lib/creepin/collection_creeper.rb', line 6

# Sets up request parameters plus the bookkeeping counters and collections.
#
# A non-blank +params+ always replaces @params; a blank one leaves any
# existing @params alone and falls back to an empty hash. Every other
# instance variable is only seeded when it has no truthy value yet.
#
# @param params [Hash] request parameters for the crawl (optional)
def initialize(params = {})
  @params = params.present? ? params : (@params || {})

  # Counters first, collections second; each keeps a pre-existing value.
  @total_pages ||= 0
  @total_records ||= 0
  @loaded_collection ||= []
  @requested_urls ||= []
end

Instance Attribute Details

#finished_at ⇒ Object

Returns the value of attribute finished_at.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for the @finished_at instance variable; yields nil while
# nothing has assigned it.
def finished_at
  @finished_at
end

#loaded_collection ⇒ Object

Returns the value of attribute loaded_collection.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for @loaded_collection. The constructor seeds it with an
# empty array when unset, and #map_response_collection appends one entry
# per processed element.
def loaded_collection
  @loaded_collection
end

#requested_urls ⇒ Object

Returns the value of attribute requested_urls.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for @requested_urls. The constructor seeds it with an empty
# array when unset, and #transmit appends the URL string of each request.
def requested_urls
  @requested_urls
end

#started_at ⇒ Object

Returns the value of attribute started_at.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for the @started_at instance variable; yields nil while
# nothing has assigned it.
def started_at
  @started_at
end

#stats ⇒ Object

Returns the value of attribute stats.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for the @stats instance variable; yields nil while nothing
# has assigned it.
def stats
  @stats
end

#total_pages ⇒ Object

Returns the value of attribute total_pages.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for @total_pages. The constructor seeds it with 0 when
# unset, and #transmit adds 1 for every request it performs.
def total_pages
  @total_pages
end

#total_records ⇒ Object

Returns the value of attribute total_records.



4
5
6
# File 'lib/creepin/collection_creeper.rb', line 4

# Plain reader for @total_records. The constructor seeds it with 0 when
# unset, and #map_response_collection adds 1 per processed element.
def total_records
  @total_records
end

Instance Method Details

#after_collection_loaded_callbacks? ⇒ Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/creepin/collection_creeper.rb', line 37

# Tells whether any after-collection-loaded callbacks are registered.
#
# @return [Boolean] false when this object does not respond to
#   +after_collection_loaded_callbacks+ or when that method returns an
#   empty collection; true otherwise
def after_collection_loaded_callbacks?
  return false unless respond_to?(:after_collection_loaded_callbacks)

  !after_collection_loaded_callbacks.empty?
end

#after_crawl_finished_callbacks? ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
# File 'lib/creepin/collection_creeper.rb', line 29

# Tells whether any after-crawl-finished callbacks are registered.
#
# @return [Boolean] false when this object does not respond to
#   +after_crawl_finished_callbacks+ or when that method returns an empty
#   collection; true otherwise
def after_crawl_finished_callbacks?
  return false unless respond_to?(:after_crawl_finished_callbacks)

  !after_crawl_finished_callbacks.empty?
end

#before_crawl_finished_callbacks? ⇒ Boolean

Returns:

  • (Boolean)


33
34
35
# File 'lib/creepin/collection_creeper.rb', line 33

# Tells whether any before-crawl-finished callbacks are registered.
#
# @return [Boolean] false when this object does not respond to
#   +before_crawl_finished_callbacks+ or when that method returns an empty
#   collection; true otherwise
def before_crawl_finished_callbacks?
  return false unless respond_to?(:before_crawl_finished_callbacks)

  !before_crawl_finished_callbacks.empty?
end

#build_request_params(param_string) ⇒ Object



57
58
59
60
# File 'lib/creepin/collection_creeper.rb', line 57

# Derives the request options for the next request from a URL-like string.
#
# The text after the last '?' (or the whole string when it contains no '?')
# is handed to Rack::Utils.parse_query. When the parsed hash is non-blank it
# replaces @request_params as { :query => <indifferent-access hash> };
# otherwise @request_params is left untouched.
#
# @param param_string [String] URL or query string to read parameters from
# @return [Hash, nil] the new @request_params, or nil when nothing was parsed
def build_request_params(param_string)
  query_string = param_string.split('?').last
  parsed = Rack::Utils.parse_query(query_string)
  return unless parsed.present?

  @request_params = { :query => parsed.with_indifferent_access }
end

#crawl_next_page ⇒ Object



41
42
43
44
45
46
47
# File 'lib/creepin/collection_creeper.rb', line 41

# Continues the crawl while #next_page? reports another page, and signals
# +collection_loaded+ once it does not.
#
# @return [Object] whatever +crawl+ or +collection_loaded+ returns
def crawl_next_page
  next_page? ? crawl : collection_loaded
end

#default_params? ⇒ Boolean

Returns:

  • (Boolean)


108
109
110
# File 'lib/creepin/collection_creeper.rb', line 108

# Reports whether this object responds to +default_params+. Only the
# method's existence is checked; it is not called here.
#
# @return [Boolean]
def default_params?
  respond_to?(:default_params)
end

#full_request_url(base_url, request_params) ⇒ Object



62
63
64
# File 'lib/creepin/collection_creeper.rb', line 62

# Builds a display form of the requested URL from a base URL and the
# request options.
#
# Keys and values of the :query hash are interpolated as-is (no escaping)
# and joined as "key=value" pairs separated by "&". When :query is missing
# or empty a copy of the base URL is returned without a dangling "?".
#
# @param base_url [String] URL without a query string
# @param request_params [Hash] request options; only :query is read
# @return [String] base URL, followed by "?key=value&..." when a query exists
def full_request_url(base_url, request_params)
  query = request_params[:query]
  return base_url.dup if query.nil? || query.empty?

  pairs = query.map { |key, value| "#{key}=#{value}" }
  "#{base_url}?#{pairs.join('&')}"
end

#load_resource(collected_attributes_hash, resource_klass) ⇒ Object



80
81
82
83
84
85
86
# File 'lib/creepin/collection_creeper.rb', line 80

# Turns one set of collected attributes into a resource object.
#
# When #resource_load_strategy? is true the strategy is called with the
# attributes and the class and its result is returned; otherwise the class
# is instantiated directly with the attributes.
#
# @param collected_attributes_hash [Hash] attribute name => extracted value
# @param resource_klass [Class] class to build when no strategy is present
# @return [Object] the strategy's result or a new +resource_klass+ instance
def load_resource(collected_attributes_hash, resource_klass)
  return resource_klass.new(collected_attributes_hash) unless resource_load_strategy?

  resource_load_strategy.call(collected_attributes_hash, resource_klass)
end

#load_response_collection ⇒ Object



72
73
74
# File 'lib/creepin/collection_creeper.rb', line 72

# Runs the configured CSS +selector+ against the parsed response document
# and stores the matches in @response_collection.
#
# @return [Object] the matches returned by +css+
def load_response_collection
  html_document = @response_html.document
  @response_collection = html_document.css(selector)
end

#map_response_collection ⇒ Object



112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/creepin/collection_creeper.rb', line 112

# Converts every element of @response_collection into a resource.
#
# For each element, every block in +element_mappings+ is evaluated in the
# context of this object with the element as argument to build an attribute
# hash. The hash is passed to #load_resource and #save_resource,
# @total_records is incremented, and one entry is appended to
# +loaded_collection+.
#
# The appended entry is the loaded resource. Only when a custom save
# strategy is present (#resource_save_strategy?) is the strategy's return
# value stored instead. On the default path #save_resource returns the
# result of +resource.save+, or nil when saving is skipped, and that value
# must not replace the resource in the collection.
#
# @return [Object] @response_collection (the receiver of +each+)
def map_response_collection
  @response_collection.each do |ele|
    collected_attributes_hash = {}
    element_mappings.each_pair do |attribute, block|
      collected_attributes_hash[attribute] = instance_exec(ele, &block)
    end

    resource = load_resource(collected_attributes_hash, resource_class.constantize)
    @total_records += 1

    saved = save_resource(collected_attributes_hash, resource)
    # A custom strategy may hand back a different object on purpose.
    resource = saved if resource_save_strategy?
    loaded_collection << resource
  end
end

#next_page? ⇒ Boolean

Returns:

  • (Boolean)


126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'lib/creepin/collection_creeper.rb', line 126

# Determines whether the current response points at a further page and, if
# so, prepares the request parameters for it.
#
# +next_page_selector+ may be:
# * nil  - pagination is disabled and false is returned immediately;
# * Proc - evaluated in the context of this object with the parsed
#   document; its result is used as the next-page URL;
# * anything else - passed to +at_css+; the +href+ attribute of the matched
#   node is used as the next-page URL. The node itself cannot be used,
#   because #build_request_params needs a string to split.
#
# Side effects: stores the outcome in @has_next_page and, when a URL was
# found, calls #build_request_params with it.
#
# @return [Boolean] true when a non-blank next-page URL was found
def next_page?
  return false if next_page_selector.nil?

  document = @response_html.document
  next_page_url =
    if next_page_selector.is_a?(Proc)
      instance_exec(document, &next_page_selector)
    else
      next_link = document.at_css(next_page_selector)
      next_link && next_link['href']
    end

  @has_next_page = next_page_url.present?
  build_request_params(next_page_url) if @has_next_page
  @has_next_page
end

#parse_response ⇒ Object



66
67
68
69
70
# File 'lib/creepin/collection_creeper.rb', line 66

# Parses the body of the last response into @response_html, loads the
# matching elements, and maps them to resources when any were found.
#
# @return [Object, nil] result of #map_response_collection, or nil when the
#   response collection is blank
def parse_response
  raw_body = @response.body
  @response_html = Nokogiri::HTML::Document.parse(raw_body)
  load_response_collection
  return unless response_collection?

  map_response_collection
end

#resource_load_strategy? ⇒ Boolean

Returns:

  • (Boolean)


100
101
102
# File 'lib/creepin/collection_creeper.rb', line 100

# Reports whether this object responds to +resource_load_strategy+, which
# #load_resource uses to decide between the strategy and a plain +new+.
#
# @return [Boolean]
def resource_load_strategy?
  respond_to?(:resource_load_strategy)
end

#resource_save_strategy? ⇒ Boolean

Returns:

  • (Boolean)


104
105
106
# File 'lib/creepin/collection_creeper.rb', line 104

# Reports whether this object responds to +resource_save_strategy+, which
# #save_resource uses to decide between the strategy and a plain +save+.
#
# @return [Boolean]
def resource_save_strategy?
  respond_to?(:resource_save_strategy)
end

#response_collection? ⇒ Boolean

Returns:

  • (Boolean)


76
77
78
# File 'lib/creepin/collection_creeper.rb', line 76

# Reports whether @response_collection is non-blank according to
# +present?+.
# NOTE(review): +present?+ is not core Ruby; presumably it comes from
# ActiveSupport - confirm it is loaded wherever this class is used.
#
# @return [Boolean]
def response_collection?
  @response_collection.present?
end

#run_after_collection_loaded_callbacks ⇒ Object



25
26
27
# File 'lib/creepin/collection_creeper.rb', line 25

# Invokes every after-collection-loaded callback with this object as its
# only argument. Does nothing when no such callbacks are registered.
#
# @return [Object, nil] the callback collection, or nil when none are
#   registered
def run_after_collection_loaded_callbacks
  return unless after_collection_loaded_callbacks?

  after_collection_loaded_callbacks.each do |hook|
    hook.call(self)
  end
end

#run_after_crawl_callbacks ⇒ Object



15
16
17
# File 'lib/creepin/collection_creeper.rb', line 15

# Delegates straight to #transmit and returns whatever it returns.
def run_after_crawl_callbacks
  transmit
end

#run_after_crawl_finished_callbacks ⇒ Object



19
20
21
22
23
# File 'lib/creepin/collection_creeper.rb', line 19

# Post-processes a finished request: parses the response, invokes every
# after-crawl-finished callback with this object as its argument (when any
# are registered), then calls +crawl_next+.
#
# @return [Object] whatever +crawl_next+ returns
def run_after_crawl_finished_callbacks
  parse_response

  if after_crawl_finished_callbacks?
    after_crawl_finished_callbacks.each do |hook|
      hook.call(self)
    end
  end

  crawl_next
end

#save_resource(collected_attributes_hash, resource) ⇒ Object



88
89
90
91
92
93
94
# File 'lib/creepin/collection_creeper.rb', line 88

# Persists one loaded resource.
#
# A custom save strategy, when present, is called with the attributes and
# the resource and takes precedence. Otherwise +resource.save+ is called,
# unless #skip_resource_save? is true.
#
# @param collected_attributes_hash [Hash] attributes collected for the resource
# @param resource [Object] object produced by #load_resource
# @return [Object, nil] the strategy's result, the result of
#   +resource.save+, or nil when saving is skipped
def save_resource(collected_attributes_hash, resource)
  return resource_save_strategy.call(collected_attributes_hash, resource) if resource_save_strategy?
  return if skip_resource_save?

  resource.save
end

#skip_resource_save? ⇒ Boolean

Returns:

  • (Boolean)


96
97
98
# File 'lib/creepin/collection_creeper.rb', line 96

# Reports whether this object responds to +skip_resource_save+.
# NOTE(review): only the method's existence is checked, never its return
# value - a +skip_resource_save+ that returns false would still skip the
# save. Confirm that is intended.
#
# @return [Boolean]
def skip_resource_save?
  respond_to?(:skip_resource_save)
end

#transmit ⇒ Object



49
50
51
52
53
54
55
# File 'lib/creepin/collection_creeper.rb', line 49

# Performs one GET request against +base_url+ and records it.
#
# @request_params is reused when already set. Otherwise it is built as
# { :query => ... } from @params, merged over +default_params+ when those
# are available. The response is stored in @response, the requested URL is
# appended to @requested_urls, @total_pages is incremented, and finally
# +crawl_finished+ is called.
#
# @return [Object] whatever +crawl_finished+ returns
def transmit
  unless @request_params
    query = default_params? ? default_params.merge(@params) : @params
    @request_params = { :query => query }
  end

  @response = HTTParty.get(base_url, @request_params)
  @requested_urls.push(full_request_url(base_url, @request_params))
  @total_pages += 1
  crawl_finished
end