Module: Spidey::Strategies::Mongo

Defined in:
lib/spidey/strategies/mongo.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#error_collection ⇒ Object

Returns the value of attribute error_collection.



3
4
5
# File 'lib/spidey/strategies/mongo.rb', line 3

# Reader for the +error_collection+ attribute.
#
# @return [Object] the current value of +@error_collection+ (nil when unset)
def error_collection
  instance_variable_get(:@error_collection)
end

#result_collection ⇒ Object

Returns the value of attribute result_collection.



3
4
5
# File 'lib/spidey/strategies/mongo.rb', line 3

# Reader for the +result_collection+ attribute.
#
# @return [Object] the current value of +@result_collection+ (nil when unset)
def result_collection
  instance_variable_get(:@result_collection)
end

#url_collection ⇒ Object

Returns the value of attribute url_collection.



3
4
5
# File 'lib/spidey/strategies/mongo.rb', line 3

# Reader for the +url_collection+ attribute.
#
# @return [Object] the current value of +@url_collection+ (nil when unset)
def url_collection
  instance_variable_get(:@url_collection)
end

Instance Method Details

#add_error(attrs) ⇒ Object



45
46
47
48
49
50
# File 'lib/spidey/strategies/mongo.rb', line 45

# Stores a failure document in the error collection and logs the failure.
#
# The inserted document is +attrs+ without its +:error+ entry, merged with
# +created_at+ (current time), +error+ (exception class name), +message+
# (exception message) and +spider+ (this object's class name).
#
# @param attrs [Hash] failure context; the exception is read from +:error+
#   and +:url+ is interpolated into the log line. The hash is not modified.
# @return [Object] whatever +Spidey.logger.error+ returns
def add_error(attrs)
  # Work on a copy so the caller's hash keeps its :error entry (and so a
  # frozen hash can be passed in).
  context = attrs.dup
  error = context.delete(:error)
  doc = context.merge(created_at: Time.now, error: error.class.name, message: error.message, spider: self.class.name)
  error_collection.insert doc
  Spidey.logger.error "Error on #{attrs[:url]}. #{error.class}: #{error.message}"
end

#crawl(options = {}) ⇒ Object



12
13
14
15
16
# File 'lib/spidey/strategies/mongo.rb', line 12

# Records when the crawl started, optionally sets a deadline, then defers to
# the superclass implementation.
#
# @param options [Hash] passed through to +super+; when +:crawl_for+ is
#   present, +@until+ is set to the current time plus that value
# @return [Object] whatever the superclass +crawl+ returns
def crawl(options = {})
  @crawl_started_at = Time.now
  duration = options[:crawl_for]
  @until = Time.now + duration if duration
  super(options)
end

#each_url(&_block) ⇒ Object



37
38
39
40
41
42
43
# File 'lib/spidey/strategies/mongo.rb', line 37

# Yields queued URLs one at a time, pulling each from +get_next_url+.
#
# Iteration stops when +get_next_url+ returns a falsy value, or when the
# fetched entry's +last_crawled_at+ is at or after +@crawl_started_at+
# (it was already crawled in this batch). Before yielding, the entry's
# +last_crawled_at+ is set to the current time in the URL collection.
#
# @yield [url, handler, default_data] the entry's +url+ and +handler+
#   values, and its +default_data+ after +symbolize_keys+
# @return [nil]
def each_url(&_block)
  while (entry = get_next_url)
    crawled_at = entry['last_crawled_at']
    break if crawled_at && crawled_at >= @crawl_started_at # crawled already in this batch
    url_collection.update({ '_id' => entry['_id'] }, '$set' => { last_crawled_at: Time.now })
    yield entry['url'], entry['handler'], entry['default_data'].symbolize_keys
  end
end

#handle(url, handler, default_data = {}) ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/spidey/strategies/mongo.rb', line 18

# Queues a URL by upserting it into the URL collection.
#
# The entry is matched on this object's class name (+spider+) and +url+;
# its +handler+ and +default_data+ fields are set to the given values.
#
# @param url [Object] the URL to queue (its +inspect+ is logged, truncated)
# @param handler [Object] stored under +handler+
# @param default_data [Hash] stored under +default_data+
# @return [Object] whatever +url_collection.update+ returns
def handle(url, handler, default_data = {})
  preview = url.inspect[0..200]
  Spidey.logger.info "Queueing #{preview}..."

  selector = { 'spider' => self.class.name, 'url' => url }
  changes = { '$set' => { 'handler' => handler, 'default_data' => default_data } }
  url_collection.update(selector, changes, upsert: true)
end

#initialize(attrs = {}) ⇒ Object



5
6
7
8
9
10
# File 'lib/spidey/strategies/mongo.rb', line 5

# Pulls the three collection handles out of +attrs+, assigns them to the
# corresponding attributes, and passes the remaining attributes to +super+.
#
# @param attrs [Hash] may contain +:url_collection+, +:result_collection+
#   and +:error_collection+; those keys are removed from the hash before it
#   is forwarded
def initialize(attrs = {})
  urls = attrs.delete(:url_collection)
  results = attrs.delete(:result_collection)
  errors = attrs.delete(:error_collection)

  self.url_collection = urls
  self.result_collection = results
  self.error_collection = errors

  super(attrs)
end

#record(data) ⇒ Object



27
28
29
30
31
32
33
34
35
# File 'lib/spidey/strategies/mongo.rb', line 27

# Stores a result document, tagged with this object's class name under
# +spider+.
#
# When the object responds to +result_key+ and it returns a truthy key for
# the document, the document is upserted into the result collection matched
# on that +key+; otherwise it is inserted as a new document.
#
# @param data [Hash] the result fields to store
# @return [Object] whatever the collection's +update+ or +insert+ returns
def record(data)
  doc = data.merge('spider' => self.class.name)
  Spidey.logger.info "Recording #{doc.inspect[0..500]}..."

  key = respond_to?(:result_key) && result_key(doc)
  return result_collection.insert(doc) unless key

  result_collection.update({ 'key' => key }, { '$set' => doc }, upsert: true)
end