Class: Searcher::MultipleCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/searcher/spider.rb

Defined Under Namespace

Classes: Crawler

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(websites, beanstalk_jobs = Global::Beanstalk_jobs, pm_max = 10, user_agent = Global::UserAgent, redirect_limit = 1) ⇒ MultipleCrawler

Returns a new instance of MultipleCrawler.



30
31
32
33
34
35
36
37
# File 'lib/searcher/spider.rb', line 30

def initialize(websites, beanstalk_jobs=Global::Beanstalk_jobs, pm_max=10, user_agent=Global::UserAgent, redirect_limit=1)
  # URLs queued for crawling.
  @websites       = websites
  # Beanstalk connection settings (host, port, ...).
  @beanstalk_jobs = beanstalk_jobs
  # Upper bound on concurrently forked worker processes.
  @pm_max         = pm_max
  @user_agent     = user_agent
  @redirect_limit = redirect_limit
  # Pipe used by forked workers to report results back to the parent.
  @ipc_reader, @ipc_writer = IO.pipe
end

Instance Attribute Details

#redirect_limitObject

Returns the value of attribute redirect_limit.



40
41
42
# File 'lib/searcher/spider.rb', line 40

# Maximum number of HTTP redirects a crawler will follow.
def redirect_limit() @redirect_limit end

#user_agentObject

Returns the value of attribute user_agent.



40
41
42
# File 'lib/searcher/spider.rb', line 40

# User-Agent header string the crawlers send with each request.
def user_agent() @user_agent end

Instance Method Details

#init_beanstalk_jobsObject



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/searcher/spider.rb', line 42

# Resets the beanstalk tube and seeds it with one job per website.
#
# Drains any jobs left over from a previous run, then enqueues the index
# of every entry in @websites (workers look the URL up by index). On any
# setup failure the error is reported on stderr and the process exits
# with a non-zero status; the original printed to stdout and exited 0,
# which made failures look like success to the calling shell.
def init_beanstalk_jobs
  beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
  # Drain stale jobs left in the queue from an earlier run.
  begin
    while job = beanstalk.reserve(0.1)
      job.delete
    end
  rescue Beanstalk::TimedOut
    # reserve timing out means the queue is empty — expected exit path.
    print "Beanstalk queues cleared!\n"
  end
  @websites.size.times { |i| beanstalk.put(i) } # push every task (payload = index into @websites)
rescue => e
  warn e.message
  exit(1)
ensure
  # Always release the connection, even when the rescue above fires.
  beanstalk.close if beanstalk
end

#process_jobsObject



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/searcher/spider.rb', line 60

# Forks up to @pm_max worker processes; each worker repeatedly reserves a
# job (an index into @websites) from beanstalk, crawls that site, and
# writes the result as one line into the IPC pipe. Blocks until every
# child has exited. Results are collected afterwards via #read_results.
def process_jobs

  pm = Parallel::ForkManager.new(@pm_max)

  #pm.run_on_start do |pid,ident|
  #  print "** #{ident} started, pid: #{pid} and  size of results is #{results.size}\n"
  #end
  #
  #pm.run_on_finish {
  #    |pid,exit_code,ident|
  #  print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and  size of results is #{results.size}\n"
  #}

  @pm_max.times do |i|

    # In the parent pm.start returns truthy, so the parent skips the
    # worker body via `next`; the forked child falls through.
    pm.start(i) and next
    # Child: open a fresh beanstalk connection (the parent's socket is
    # not reused across the fork) and close the unused pipe read end.
    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
    @ipc_reader.close

    loop do
      begin
        job = beanstalk.reserve(0.1) # timeout 0.1s
        index = job.body
        job.delete
        website = @websites[index.to_i]
        result = Crawler.new.fetch(website)
        @ipc_writer.puts(result)
      rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
        # Queue drained (reserve timed out) or interrupted: worker done.
        break
      end
    end
    pm.finish(i)
  end

  # Parent: close its write end so #read_results sees EOF once all
  # children have exited (closing their own write ends).
  @ipc_writer.close

  begin
    pm.wait_all_children
  rescue SystemExit, Interrupt
    print "Interrupt wait all children!\n"
  end

end

#read_resultsObject



104
105
106
107
108
109
110
# File 'lib/searcher/spider.rb', line 104

# Drains the IPC pipe and returns every line the workers wrote, each
# element keeping its trailing newline. Blocks until all write ends of
# the pipe have been closed; returns [] if nothing was written.
def read_results
  @ipc_reader.readlines
end

#runObject



113
114
115
116
117
# File 'lib/searcher/spider.rb', line 113

# Runs one complete crawl: seed the beanstalk queue, fork the crawler
# workers and wait for them, then return the collected results
# (the array produced by #read_results).
def run
  init_beanstalk_jobs
  process_jobs
  read_results
end