Module: Spidey::Strategies::Mongo
- Defined in:
- lib/spidey/strategies/mongo.rb
Instance Attribute Summary
-
#error_collection ⇒ Object
Returns the value of attribute error_collection.
-
#result_collection ⇒ Object
Returns the value of attribute result_collection.
-
#url_collection ⇒ Object
Returns the value of attribute url_collection.
Instance Method Summary
- #add_error(attrs) ⇒ Object
- #crawl(options = {}) ⇒ Object
- #each_url(&_block) ⇒ Object
- #handle(url, handler, default_data = {}) ⇒ Object
- #initialize(attrs = {}) ⇒ Object
- #record(data) ⇒ Object
Instance Attribute Details
#error_collection ⇒ Object
Returns the value of attribute error_collection.
3 4 5 |
# File 'lib/spidey/strategies/mongo.rb', line 3

# Reader for the error_collection attribute.
#
# @return [Object] the value currently held in @error_collection
def error_collection
  @error_collection
end
#result_collection ⇒ Object
Returns the value of attribute result_collection.
3 4 5 |
# File 'lib/spidey/strategies/mongo.rb', line 3

# Reader for the result_collection attribute.
#
# @return [Object] the value currently held in @result_collection
def result_collection
  @result_collection
end
#url_collection ⇒ Object
Returns the value of attribute url_collection.
3 4 5 |
# File 'lib/spidey/strategies/mongo.rb', line 3

# Reader for the url_collection attribute.
#
# @return [Object] the value currently held in @url_collection
def url_collection
  @url_collection
end
Instance Method Details
#add_error(attrs) ⇒ Object
45 46 47 48 49 50 |
# File 'lib/spidey/strategies/mongo.rb', line 45

# Stores a failure in error_collection and writes it to the Spidey log.
#
# The stored document is +attrs+ without its :error entry, plus :created_at,
# :error (the exception's class name), :message (the exception's message) and
# :spider (this object's class name).
#
# @param attrs [Hash] details of the failure; the raised exception is read from
#   :error, and :url is echoed in the log line. The hash is not modified.
# @return [Object] whatever Spidey.logger.error returns
def add_error(attrs)
  error = attrs[:error]
  # Build the document from a filtered copy so the caller's hash keeps its :error entry.
  details = attrs.reject { |key, _value| key == :error }
  doc = details.merge(
    created_at: Time.now,
    error: error.class.name,
    message: error.message,
    spider: self.class.name
  )
  error_collection.insert doc
  Spidey.logger.error "Error on #{attrs[:url]}. #{error.class}: #{error.message}"
end
#crawl(options = {}) ⇒ Object
12 13 14 15 16 |
# File 'lib/spidey/strategies/mongo.rb', line 12

# Records when the crawl started, optionally sets a deadline, then defers to
# the parent implementation of crawl.
#
# @param options [Hash] crawl options; when :crawl_for is present it is added
#   to the start time and stored in @until. The arguments are passed on to
#   +super+ unchanged.
# @return [Object] whatever the parent implementation returns
def crawl(options = {})
  @crawl_started_at = Time.now
  # Anchor the deadline to the recorded start time rather than a second clock read.
  @until = @crawl_started_at + options[:crawl_for] if options[:crawl_for]
  super
end
#each_url(&_block) ⇒ Object
37 38 39 40 41 42 43 |
# File 'lib/spidey/strategies/mongo.rb', line 37

# Yields stored URL records one at a time for as long as get_next_url keeps
# returning them, stopping early at the first record whose 'last_crawled_at'
# is not earlier than @crawl_started_at.
#
# Each record is stamped with a fresh last_crawled_at before it is yielded.
#
# @yieldparam url [Object] the record's 'url' value
# @yieldparam handler [Object] the record's 'handler' value
# @yieldparam default_data [Hash] the record's 'default_data' after
#   symbolize_keys ({} when the record has none)
# @return [nil]
def each_url(&_block)
  while (url = get_next_url)
    # crawled already in this batch
    break if url['last_crawled_at'] && url['last_crawled_at'] >= @crawl_started_at

    url_collection.update({ '_id' => url['_id'] }, '$set' => { last_crawled_at: Time.now })
    # A record stored without default_data would otherwise raise NoMethodError on nil.
    default_data = url['default_data'] || {}
    yield url['url'], url['handler'], default_data.symbolize_keys
  end
end
#handle(url, handler, default_data = {}) ⇒ Object
18 19 20 21 22 23 24 25 |
# File 'lib/spidey/strategies/mongo.rb', line 18

# Logs the URL and upserts it into url_collection together with its handler
# and default data.
#
# @param url [Object] matched on 'url', together with this object's class name as 'spider'
# @param handler [Object] stored under 'handler'
# @param default_data [Hash] stored under 'default_data'
# @return [Object] whatever url_collection.update returns
def handle(url, handler, default_data = {})
  preview = url.inspect[0..200]
  Spidey.logger.info "Queueing #{preview}..."

  selector = { 'spider' => self.class.name, 'url' => url }
  changes = { '$set' => { 'handler' => handler, 'default_data' => default_data } }
  url_collection.update(selector, changes, upsert: true)
end
#initialize(attrs = {}) ⇒ Object
5 6 7 8 9 10 |
# File 'lib/spidey/strategies/mongo.rb', line 5

# Takes the three collections out of +attrs+ and passes the remaining entries
# to the parent initializer.
#
# @param attrs [Hash] may carry :url_collection, :result_collection and
#   :error_collection; every other entry is forwarded to +super+. The hash
#   passed in is not modified.
def initialize(attrs = {})
  # Work on a copy: deleting from the caller's hash would strip its
  # collections, so reusing that hash would yield nil collections.
  attrs = attrs.dup
  self.url_collection = attrs.delete(:url_collection)
  self.result_collection = attrs.delete(:result_collection)
  self.error_collection = attrs.delete(:error_collection)
  super attrs
end
#record(data) ⇒ Object
27 28 29 30 31 32 33 34 35 |
# File 'lib/spidey/strategies/mongo.rb', line 27

# Logs and saves one result document, tagged with this object's class name
# under 'spider'.
#
# When the object responds to result_key and it returns a truthy key for the
# document, the document is upserted under that 'key'; otherwise it is inserted.
#
# @param data [Hash] fields of the result
# @return [Object] whatever the result_collection call returns
def record(data)
  doc = data.merge('spider' => self.class.name)
  Spidey.logger.info "Recording #{doc.inspect[0..500]}..."

  key = respond_to?(:result_key) && result_key(doc)
  return result_collection.insert(doc) unless key

  result_collection.update({ 'key' => key }, { '$set' => doc }, upsert: true)
end