Class: CrawlStation::Producer
- Inherits:
-
Object
- Object
- CrawlStation::Producer
- Includes:
- Celluloid
- Defined in:
- lib/crawl_station/producer.rb
Instance Attribute Summary
-
#cache(item, data = 'parsing') ⇒ Object
Returns the value of attribute cache.
-
#proxies ⇒ Object
Returns the value of attribute proxies.
-
#proxy ⇒ Object
Returns the value of attribute proxy.
-
#schedule ⇒ Object
Returns the value of attribute schedule.
Instance Method Summary
-
#initialize(schedule, cache, proxies = nil) ⇒ Producer
constructor
A new instance of Producer.
- #loop_parser ⇒ Object
- #parse_item(item) ⇒ Object
- #parse_links(data, namespace) ⇒ Object
- #parsed?(data) ⇒ Boolean
- #start ⇒ Object
Constructor Details
#initialize(schedule, cache, proxies = nil) ⇒ Producer
Returns a new instance of Producer.
7 8 9 10 11 |
# File 'lib/crawl_station/producer.rb', line 7
#
# Stores the collaborators the producer works with; performs no other work.
#
# @param schedule [Object] kept in @schedule; the other listings call
#   empty?, pop, push, done and failed on it
# @param cache [Object] kept in @cache; parsed? calls include? on it
# @param proxies [Object, nil] kept in @proxies; defaults to nil
def initialize(schedule, cache, proxies = nil)
  @proxies = proxies
  @cache = cache
  @schedule = schedule
end
Instance Attribute Details
#cache(item, data = 'parsing') ⇒ Object
Returns the value of attribute cache.
5 6 7 |
# Reader listing: returns the value held in @cache (assigned in initialize).
# NOTE(review): the heading for this attribute shows the signature
# `cache(item, data = 'parsing')`, and parse_item calls `cache(item) { ... }`
# with a block, so the real file presumably also defines a `cache` method that
# takes arguments and is not reproduced in this listing -- confirm against
# lib/crawl_station/producer.rb.
# File 'lib/crawl_station/producer.rb', line 5 def cache @cache end |
#proxies ⇒ Object
Returns the value of attribute proxies.
5 6 7 |
# Reader listing: returns the value held in @proxies, which initialize sets
# from its optional third argument (nil when omitted).
# NOTE(review): all four attribute listings point at line 5, so they are
# presumably generated from a single attr_* declaration -- confirm.
# File 'lib/crawl_station/producer.rb', line 5 def proxies @proxies end |
#proxy ⇒ Object
Returns the value of attribute proxy.
5 6 7 |
# Reader listing: returns the value held in @proxy.
# NOTE(review): none of the listings on this page assigns @proxy (initialize
# only sets @schedule, @cache and @proxies), so this presumably returns nil
# unless it is set elsewhere -- confirm against the full file.
# File 'lib/crawl_station/producer.rb', line 5 def proxy @proxy end |
#schedule ⇒ Object
Returns the value of attribute schedule.
5 6 7 |
# Reader listing: returns the value held in @schedule (assigned in
# initialize). The other listings call empty?, pop, push, done and failed on
# that object.
# File 'lib/crawl_station/producer.rb', line 5 def schedule @schedule end |
Instance Method Details
#loop_parser ⇒ Object
18 19 20 21 22 23 24 25 26 27 28 29 30 |
# File 'lib/crawl_station/producer.rb', line 18
#
# Performs one iteration of the work loop driven by #start.
#
# Idles for 0.2s when the schedule is empty or the popped item is already
# parsed; otherwise crawls the item, queues the links found in the result and
# saves what is left through the item's item_class.
#
# @return [Object] always truthy: `true` on the working paths, and on the idle
#   paths whatever `sleep(0.2)` returns (falling back to `true`), so the
#   caller's loop keeps running
def loop_parser
  if @schedule.empty?
    return sleep(0.2) || true
  end

  job = @schedule.pop
  # Entries popped as plain hashes are wrapped so they answer link/namespace/etc.
  job = CS::ParseStruct.new(job) if job.is_a?(Hash)
  if parsed?(job)
    return sleep(0.2) || true
  end

  Logger.debug "start parse #{job.link}"
  crawled = parse_item(job)
  unless crawled.nil? || crawled.empty?
    remaining = parse_links(crawled, job.namespace)
    # Nothing is saved when the result held nothing but follow-up links.
    job.item_class.new.save(job.link, remaining) unless remaining.empty?
  end
  true
end
#parse_item(item) ⇒ Object
32 33 34 35 36 37 38 39 40 |
# File 'lib/crawl_station/producer.rb', line 32
#
# Crawls a single item and reports the outcome to the schedule.
#
# The crawl itself runs inside the block handed to `cache(item)`. On success
# the item is marked done and the crawl result is returned. On failure the
# error is logged (link, message and up to 11 backtrace lines), the item is
# marked failed and nil is returned.
#
# Only StandardError and ScriptError are rescued, so process-level exceptions
# such as Interrupt, SystemExit and NoMemoryError are no longer swallowed.
#
# @param item [Object] must respond to link and parser_class
# @return [Object, nil] whatever `cache` returns, or nil when the crawl failed
def parse_item(item)
  data = cache(item) { item.parser_class.new.crawl(item.link) }
  @schedule.done(item)
  data
rescue StandardError, ScriptError => e
  # Array() guards against exceptions that carry no backtrace.
  trace = Array(e.backtrace).first(11).join("\n")
  Logger.error(format("%s: %s\n%s", item.link, e.message, trace))
  @schedule.failed(item)
  nil
end
#parse_links(data, namespace) ⇒ Object
42 43 44 45 46 47 48 49 50 51 |
# File 'lib/crawl_station/producer.rb', line 42
#
# Removes the 'pages' and 'details' entries from +data+ and pushes every
# not-yet-parsed link found in them onto the schedule.
#
# @param data [Hash] crawl result; mutated in place ('pages' and 'details'
#   are deleted)
# @param namespace [Object] passed through to each ParseStruct that is queued
# @return [Hash] the same +data+ object, without 'pages' and 'details'
def parse_links(data, namespace)
  # Queues one entry unless its link is blank or already known to the cache.
  enqueue = lambda do |entry|
    next if entry['link'].blank? || parsed?(entry)

    @schedule.push ParseStruct.new(parser: entry['parser'], link: entry['link'], namespace: namespace)
  end

  %w[pages details].each do |field|
    # delete returns nil when the field is absent, hence the safe navigation.
    data.delete(field)&.each { |entry| enqueue.call(entry) }
  end
  data
end
#parsed?(data) ⇒ Boolean
53 54 55 |
# File 'lib/crawl_station/producer.rb', line 53
#
# Tells whether +data+ needs no further parsing.
#
# @param data [#[], nil] anything indexable by 'link'
# @return [Boolean] true for nil; otherwise the result of asking @cache
#   whether it includes data['link']
def parsed?(data)
  return true if data.nil?

  @cache.include?(data['link'])
end
#start ⇒ Object
13 14 15 16 |
# File 'lib/crawl_station/producer.rb', line 13
#
# Runs loop_parser repeatedly until it returns a falsy value, then logs that
# this producer is done.
#
# @return [Object] whatever Logger.debug returns
def start
  loop do
    keep_going = loop_parser
    break unless keep_going
  end
  Logger.debug "#{self} done"
end