Class: CrawlStation::Producer

Inherits:
Object
  • Object
show all
Includes:
Celluloid
Defined in:
lib/crawl_station/producer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(schedule, cache, proxies = nil) ⇒ Producer

Returns a new instance of Producer.



7
8
9
10
11
# File 'lib/crawl_station/producer.rb', line 7

# Stores the collaborators the producer works with in instance variables.
#
# @param schedule [Object] work queue; the methods visible here call
#   #empty?, #pop, #push, #done and #failed on it
# @param cache [Object] queried with #include?(link) by #parsed?
# @param proxies [Object, nil] optional; kept in @proxies and not read by any
#   method visible here
def initialize(schedule, cache, proxies = nil)
  @schedule = schedule
  @cache = cache
  @proxies = proxies
end

Instance Attribute Details

#cache(item, data = 'parsing') ⇒ Object

Returns the value of attribute cache.



5
6
7
# File 'lib/crawl_station/producer.rb', line 5

# Reader for @cache, which #initialize assigns from its +cache+ argument.
#
# NOTE(review): #parse_item calls cache(item) with a block, which this
# zero-argument reader cannot accept. Presumably a separate
# cache(item, data = 'parsing') method defined outside this view is the one
# actually invoked there; confirm against lib/crawl_station/producer.rb.
def cache
  @cache
end

#proxies ⇒ Object

Returns the value of attribute proxies.



5
6
7
# File 'lib/crawl_station/producer.rb', line 5

# Reader for @proxies, which #initialize assigns from its optional +proxies+
# argument (nil when the argument is omitted).
def proxies
  @proxies
end

#proxy ⇒ Object

Returns the value of attribute proxy.



5
6
7
# File 'lib/crawl_station/producer.rb', line 5

# Reader for @proxy. No code visible here assigns @proxy, so this returns nil
# unless something outside this view sets it.
def proxy
  @proxy
end

#schedule ⇒ Object

Returns the value of attribute schedule.



5
6
7
# File 'lib/crawl_station/producer.rb', line 5

# Reader for @schedule, the work queue handed to #initialize.
def schedule
  @schedule
end

Instance Method Details

#loop_parser ⇒ Object



18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/crawl_station/producer.rb', line 18

# Runs one polling iteration: takes the next item off the schedule, crawls
# it, queues follow-up links and saves whatever payload is left.
#
# Idle iterations pause for 0.2 seconds so the caller's loop does not spin.
# An iteration is idle when the schedule is empty or when #parsed? reports
# the popped item as already handled.
#
# @return [true] always; #start keeps looping while this is truthy
def loop_parser
  if @schedule.empty?
    # Return the literal true rather than sleep's own result: Kernel#sleep
    # returns an Integer, so `sleep(0.2) || true` leaked a number to callers.
    sleep(0.2)
    return true
  end

  item = @schedule.pop
  # Entries may arrive as plain Hashes; wrap them so the accessors below work.
  item = CS::ParseStruct.new(item) if item.is_a?(Hash)
  if parsed?(item)
    sleep(0.2)
    return true
  end

  Logger.debug "start parse #{item.link}"
  data = parse_item(item)
  # nil means the crawl failed (already logged); empty means nothing to keep.
  return true if data.nil? || data.empty?

  data = parse_links(data, item.namespace)
  return true if data.empty?

  item.item_class.new.save(item.link, data)
  true
end

#parse_item(item) ⇒ Object



32
33
34
35
36
37
38
39
40
# File 'lib/crawl_station/producer.rb', line 32

# Crawls one item and records the outcome on the schedule.
#
# The crawl runs inside the block passed to #cache. On success the item is
# marked done and the crawl result is returned. On an application error the
# failure is logged with the first 11 backtrace lines, the item is marked
# failed, and nil is returned.
#
# @param item [#link, #parser_class] the entry to crawl
# @return [Object, nil] the value produced by #cache, or nil after a failure
def parse_item(item)
  data = cache(item) { item.parser_class.new.crawl(item.link) }
  @schedule.done(item)
  data
rescue StandardError => e
  # StandardError, not Exception: Interrupt, SystemExit, NoMemoryError and
  # friends must propagate instead of being recorded as a failed crawl.
  trace = Array(e.backtrace).first(11).join("\n")
  Logger.error(format("%s: %s\n%s", item.link, e.message, trace))
  @schedule.failed(item)
  nil
end


42
43
44
45
46
47
48
49
50
51
# File 'lib/crawl_station/producer.rb', line 42

# Moves follow-up links out of a crawl payload and onto the schedule.
#
# The 'pages' and 'details' entries are removed from +data+ (the hash is
# mutated in place). Every element with a non-blank 'link' that #parsed?
# does not report as handled is pushed as a new ParseStruct carrying the
# element's 'parser' and 'link' plus the given namespace.
#
# @param data [Hash] crawl payload keyed by strings
# @param namespace [Object] namespace attached to every queued entry
# @return [Hash] the same +data+ hash, minus 'pages' and 'details'
def parse_links(data, namespace)
  %w[pages details].each do |field|
    data.delete(field)&.each do |entry|
      next if entry['link'].blank? || parsed?(entry)

      @schedule.push ParseStruct.new(parser: entry['parser'], link: entry['link'], namespace: namespace)
    end
  end
  data
end

#parsed?(data) ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
# File 'lib/crawl_station/producer.rb', line 53

# Tells whether an entry needs no further work.
#
# A nil entry counts as handled; otherwise the answer is whatever
# @cache.include? returns for the entry's 'link'.
#
# @param data [#[], nil] entry exposing its link under the 'link' key
# @return [Boolean]
def parsed?(data)
  return true if data.nil?

  @cache.include?(data['link'])
end

#start ⇒ Object



13
14
15
16
# File 'lib/crawl_station/producer.rb', line 13

# Drives the producer: calls #loop_parser repeatedly until it returns a
# falsy value, then logs that this producer is done.
#
# Kernel#loop is used on purpose; it also ends the loop quietly if a
# StopIteration is raised from inside.
def start
  loop do
    keep_going = loop_parser
    break unless keep_going
  end
  Logger.debug "#{self} done"
end