Class: Searcher::MultipleCrawler

- Inherits: Object
- Defined in: lib/searcher/spider.rb
Defined Under Namespace
Classes: Crawler
Instance Attribute Summary

- #redirect_limit ⇒ Object
  Returns the value of attribute redirect_limit.
- #user_agent ⇒ Object
  Returns the value of attribute user_agent.
Instance Method Summary

- #init_beanstalk_jobs ⇒ Object
- #initialize(websites, beanstalk_jobs = Global::Beanstalk_jobs, pm_max = 10, user_agent = Global::UserAgent, redirect_limit = 1) ⇒ MultipleCrawler (constructor)
  A new instance of MultipleCrawler.
- #process_jobs ⇒ Object
- #read_results ⇒ Object
- #run ⇒ Object
Constructor Details
#initialize(websites, beanstalk_jobs = Global::Beanstalk_jobs, pm_max = 10, user_agent = Global::UserAgent, redirect_limit = 1) ⇒ MultipleCrawler
Returns a new instance of MultipleCrawler.
# File 'lib/searcher/spider.rb', line 30

def initialize(websites, beanstalk_jobs=Global::Beanstalk_jobs, pm_max=10, user_agent=Global::UserAgent, redirect_limit=1)
  @websites = websites                # the URLs we are about to crawl
  @beanstalk_jobs = beanstalk_jobs    # beanstalk host, port and so on
  @pm_max = pm_max                    # maximum number of worker processes
  @user_agent = user_agent
  @redirect_limit = redirect_limit
  @ipc_reader, @ipc_writer = IO.pipe  # pipe for passing results from children to the parent
end
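For illustration, a minimal construction might look like the sketch below. The URL list is made up, and the shape of beanstalk_jobs is an assumption inferred from the splat into Beanstalk::Pool.new, which in the beanstalk-client gem takes an array of 'host:port' strings:

websites = ['http://example.com/', 'http://example.org/']  # hypothetical targets
beanstalk_jobs = [['127.0.0.1:11300']]  # splats to Beanstalk::Pool.new(['127.0.0.1:11300'])
crawler = Searcher::MultipleCrawler.new(websites, beanstalk_jobs, 4)
crawler.redirect_limit  # => 1, the default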
Instance Attribute Details
#redirect_limit ⇒ Object
Returns the value of attribute redirect_limit.
# File 'lib/searcher/spider.rb', line 40

def redirect_limit
  @redirect_limit
end
#user_agent ⇒ Object
Returns the value of attribute user_agent.
# File 'lib/searcher/spider.rb', line 40

def user_agent
  @user_agent
end
Instance Method Details
#init_beanstalk_jobs ⇒ Object
# File 'lib/searcher/spider.rb', line 42

def init_beanstalk_jobs
  beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
  # Drain any messages left over in the beanstalk queue
  begin
    while job = beanstalk.reserve(0.1)
      job.delete
    end
  rescue Beanstalk::TimedOut
    print "Beanstalk queues cleared!\n"
  end
  @websites.size.times{|i| beanstalk.put(i)} # enqueue every job, one index per website
  beanstalk.close
rescue => e
  puts e
  exit
end
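The drain-then-seed pattern above relies on beanstalk's basic round trip: put enqueues, reserve takes a job (raising Beanstalk::TimedOut when the timeout expires on an empty tube), and delete acknowledges it. A standalone sketch of that round trip, assuming the beanstalk-client gem and a beanstalkd on its default 127.0.0.1:11300:

require 'beanstalk-client'

pool = Beanstalk::Pool.new(['127.0.0.1:11300'])  # default beanstalkd address
pool.put(3)              # producer: enqueue a payload (here, a website index)
job = pool.reserve(0.1)  # consumer: take a job, waiting at most 0.1s
puts job.body            # => "3" (bodies come back as strings, hence index.to_i later)
job.delete               # acknowledge so the job is not handed out again
pool.close

Note that only integer indexes go through the queue; each worker resolves the index against @websites, which every forked child inherits, so payloads stay tiny.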
#process_jobs ⇒ Object
# File 'lib/searcher/spider.rb', line 60

def process_jobs
  pm = Parallel::ForkManager.new(@pm_max)

  #pm.run_on_start do |pid,ident|
  #  print "** #{ident} started, pid: #{pid} and size of results is #{results.size}\n"
  #end
  #
  #pm.run_on_finish { |pid,exit_code,ident|
  #  print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and size of results is #{results.size}\n"
  #}

  @pm_max.times do |i|
    pm.start(i) and next              # parent moves on to spawn the next worker; the child continues below
    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
    @ipc_reader.close                 # each child keeps only the write end of the pipe
    loop do
      begin
        job = beanstalk.reserve(0.1)  # timeout 0.1s
        index = job.body
        job.delete
        website = @websites[index.to_i]
        result = Crawler.new.fetch(website)
        @ipc_writer.puts(result)
      rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
        break
      end
    end
    pm.finish(i)
  end
  @ipc_writer.close                   # parent closes its write end so readers can reach EOF
  begin
    pm.wait_all_children
  rescue SystemExit, Interrupt
    print "Interrupt wait all children!\n"
  end
end
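The skeleton of this method is a ForkManager-plus-pipe fan-out: pm.start returns a truthy pid in the parent (hence the and next) and falls through in the child; children write newline-framed results, and the parent sees EOF once its own write end is closed and every child has exited. A minimal sketch of just that skeleton, without beanstalk; the require path is an assumption for the parallel-forkmanager gem (older versions use require 'forkmanager'):

require 'parallel/forkmanager'

reader, writer = IO.pipe
pm = Parallel::ForkManager.new(4)

4.times do |i|
  pm.start(i) and next   # parent: spawn worker i and move on; child: continue below
  reader.close           # the child only writes
  writer.puts("result from worker #{i}")
  pm.finish(0)           # child exits here with code 0
end

writer.close             # parent drops its write end so gets can reach EOF
pm.wait_all_children
while line = reader.gets
  puts line
end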
#read_results ⇒ Object
# File 'lib/searcher/spider.rb', line 104

def read_results
  results = []
  while result = @ipc_reader.gets   # returns nil once every write end is closed
    results << result
  end
  results
end
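Because process_jobs frames each result with @ipc_writer.puts, one entry here means one line; a result that itself contains a newline would be split into several entries. A tiny, self-contained demonstration of that framing in plain Ruby:

reader, writer = IO.pipe
writer.puts("one\ntwo")  # a single "result" that happens to contain a newline
writer.close
lines = []
while line = reader.gets
  lines << line
end
p lines  # => ["one\n", "two\n"], two entries from one write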
#run ⇒ Object
# File 'lib/searcher/spider.rb', line 113

def run
  init_beanstalk_jobs
  process_jobs
  read_results
end
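#run is the whole lifecycle in one call: seed the queue, fan out the workers, collect the pipe output. Continuing the hypothetical values from the constructor example above:

results = crawler.run         # Array of newline-terminated result strings
results.each { |r| puts r }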