Class: Searcher::MultipleCrawler

Inherits:
Object
  • Object
show all
Defined in:
lib/searcher/spider.rb

Defined Under Namespace

Classes: Crawler

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(websites, beanstalk_jobs = Global::Beanstalk_jobs, pm_max = 10, user_agent = Global::UserAgent, redirect_limit = 1) ⇒ MultipleCrawler

Returns a new instance of MultipleCrawler.



30
31
32
33
34
35
36
37
# File 'lib/searcher/spider.rb', line 30

def initialize(websites, beanstalk_jobs=Global::Beanstalk_jobs, pm_max=10, user_agent=Global::UserAgent, redirect_limit=1)
  # URLs queued for crawling.
  @websites       = websites
  # Beanstalk connection settings (host, port, ...).
  @beanstalk_jobs = beanstalk_jobs
  # Upper bound on concurrently forked worker processes.
  @pm_max         = pm_max
  @user_agent     = user_agent
  @redirect_limit = redirect_limit
  # Pipe used by forked workers to report results back to the parent.
  @ipc_reader, @ipc_writer = IO.pipe
end

Instance Attribute Details

#redirect_limitObject

Returns the value of attribute redirect_limit.



40
41
42
# File 'lib/searcher/spider.rb', line 40

# Maximum number of HTTP redirects a crawler will follow.
def redirect_limit() @redirect_limit end

#user_agentObject

Returns the value of attribute user_agent.



40
41
42
# File 'lib/searcher/spider.rb', line 40

# User-Agent header string the crawlers send with each request.
def user_agent() @user_agent end

Instance Method Details

#init_beanstalk_jobsObject



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/searcher/spider.rb', line 42

# Resets the beanstalk tube and seeds it with one job per website.
#
# Drains any jobs left over from a previous run, then enqueues the index
# of every entry in @websites (workers look the URL up by index). On any
# setup failure the error is reported on stderr and the process exits
# with a non-zero status; the original printed to stdout and exited 0,
# which made failures look like success to the calling shell.
def init_beanstalk_jobs
  beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
  # Drain stale jobs left in the queue from an earlier run.
  begin
    while job = beanstalk.reserve(0.1)
      job.delete
    end
  rescue Beanstalk::TimedOut
    # reserve timing out means the queue is empty — expected exit path.
    print "Beanstalk queues cleared!\n"
  end
  @websites.size.times { |i| beanstalk.put(i) } # push every task (payload = index into @websites)
rescue => e
  warn e.message
  exit(1)
ensure
  # Always release the connection, even when the rescue above fires.
  beanstalk.close if beanstalk
end

#process_jobsObject



60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/searcher/spider.rb', line 60

# Forks up to @pm_max worker processes; each worker repeatedly reserves a
# job (an index into @websites) from beanstalk, crawls that site, and
# writes the result as one line into the IPC pipe. Blocks until every
# child has exited. Results are collected afterwards via #read_results.
def process_jobs

  pm = Parallel::ForkManager.new(@pm_max)

  #pm.run_on_start do |pid,ident|
  #  print "** #{ident} started, pid: #{pid} and  size of results is #{results.size}\n"
  #end
  #
  #pm.run_on_finish {
  #    |pid,exit_code,ident|
  #  print "** #{ident} just got out of the pool with PID #{pid} and exit code: #{exit_code} and  size of results is #{results.size}\n"
  #}

  @pm_max.times do |i|

    # In the parent pm.start returns truthy, so the parent skips the
    # worker body via `next`; the forked child falls through.
    pm.start(i) and next
    # Child: open a fresh beanstalk connection (the parent's socket is
    # not reused across the fork) and close the unused pipe read end.
    beanstalk = Beanstalk::Pool.new(*@beanstalk_jobs)
    @ipc_reader.close

    loop do
      begin
        job = beanstalk.reserve(0.1) # timeout 0.1s
        index = job.body
        job.delete
        website = @websites[index.to_i]
        result = Crawler.new.fetch(website)
        @ipc_writer.puts(result)
      rescue Beanstalk::DeadlineSoonError, Beanstalk::TimedOut, SystemExit, Interrupt
        # Queue drained (reserve timed out) or interrupted: worker done.
        break
      end
    end
    pm.finish(i)
  end

  # Parent: close its write end so #read_results sees EOF once all
  # children have exited (closing their own write ends).
  @ipc_writer.close

  begin
    pm.wait_all_children
  rescue SystemExit, Interrupt
    print "Interrupt wait all children!\n"
  end

end

#read_resultsObject



104
105
106
107
108
109
110
# File 'lib/searcher/spider.rb', line 104

# Drains the IPC pipe and returns every line the workers wrote, each
# element keeping its trailing newline. Blocks until all write ends of
# the pipe have been closed; returns [] if nothing was written.
def read_results
  @ipc_reader.readlines
end

#runObject



113
114
115
116
117
# File 'lib/searcher/spider.rb', line 113

# Runs one complete crawl: seed the beanstalk queue, fork the crawler
# workers and wait for them, then return the collected results
# (the array produced by #read_results).
def run
  init_beanstalk_jobs
  process_jobs
  read_results
end