Class: Pioneer::Base

Inherits:
Object
  • Object
show all
Defined in:
lib/pioneer/base.rb

Direct Known Subclasses

Crawler

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ Base

Returns a new instance of Base.

Raises:



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/pioneer/base.rb', line 16

# Builds a crawler instance and stores its configuration in instance variables.
#
# Recognised keys in +opts+ (all optional):
#   :name          - stored in @name, defaults to "crawler"
#   :concurrency   - stored in @concurrency, defaults to 10
#   :sleep         - stored in @sleep, defaults to 0
#   :log_enabled   - stored in @log_enabled, defaults to true; pass false to disable
#   :log_level     - stored in @log_level, defaults to Logger::DEBUG
#   :random_header - stored in @random_header, defaults to false
#   :header        - stored in @header, defaults to nil
#   :redirects     - stored in @redirects, defaults to nil
#   :headers       - stored in @headers as given
#   :request_opts  - stored in @request_opts as given
#
# @param opts [Hash] crawler options
# @raise [UndefinedLocations] when the instance has no `locations` method
# @raise [UndefinedProcessing] when the instance has no `processing` method
def initialize(opts = {})
  raise UndefinedLocations, "you should specify `locations` method in your `self.class`" unless methods.include?(:locations)
  raise UndefinedProcessing, "you should specify `processing` method in your `self.class`" unless methods.include?(:processing)
  # raise LocationsNotEnumerator, "you should specify `locations` to return Enumerator" unless self.locations.methods.include? :each
  @name          = opts[:name]          || "crawler"
  @concurrency   = opts[:concurrency]   || 10
  @sleep         = opts[:sleep]         || 0 # sleep is the inverse of RPS (1/RPS) - frequency of requests.
  # `opts[:log_enabled] || true` is always truthy, so an explicit `false` could
  # never switch logging off. Only a missing/nil value falls back to the default.
  log_enabled    = opts[:log_enabled]
  @log_enabled   = log_enabled.nil? ? true : log_enabled
  @log_level     = opts[:log_level]     || Logger::DEBUG
  @random_header = opts[:random_header] || false
  @header        = opts[:header]        || nil
  @redirects     = opts[:redirects]     || nil
  @headers       = opts[:headers]      #|| nil
  @request_opts  = opts[:request_opts] #|| nil
end

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(method_name, *args, &block) ⇒ Object

we should override only our methods: locations, processing, if_XXX



125
126
127
128
129
130
131
132
133
# File 'lib/pioneer/base.rb', line 125

# Turns setter-style calls for `locations...=`, `processing...=` and `if_...=`
# into per-instance overrides by forwarding to #override_method with the
# trailing "=" removed. Any other unknown method goes to the default handler.
#
# The name must match as a whole: it has to start with one of the supported
# prefixes and end with "=", so unrelated names that merely contain
# "locations" or "processing" are not captured.
#
# @param method_name [Symbol] name of the missing method
# @param args [Array] call arguments; only the first one is used as the override
# @param block [Proc, nil] forwarded to super for unsupported names
# @return [Object] whatever #override_method returns for supported names
# @raise [NoMethodError] for names outside the supported setters (via super)
def method_missing(method_name, *args, &block)
  if overridable_setter?(method_name)
    override_method(method_name.to_s.chomp("=").to_sym, args.first)
  else
    super(method_name, *args, &block)
  end
end

# Keeps `respond_to?` consistent with the setters handled by #method_missing.
#
# @param method_name [Symbol] name being queried
# @param include_private [Boolean] forwarded to super
# @return [Boolean]
def respond_to_missing?(method_name, include_private = false)
  overridable_setter?(method_name) || super
end

# True when +method_name+ is one of the override setters handled dynamically.
def overridable_setter?(method_name)
  (method_name.to_s =~ /\A(?:locations.*|processing.*|if_.+)=\z/) ? true : false
end
private :overridable_setter?

Instance Attribute Details

#concurrencyObject (readonly)

Returns the value of attribute concurrency.



14
15
16
# File 'lib/pioneer/base.rb', line 14

# Reader for the concurrency setting.
#
# @return [Object] the value held in @concurrency (nil when it was never set)
def concurrency
  instance_variable_get(:@concurrency)
end

#log_levelObject (readonly)

Returns the value of attribute log_level.



14
15
16
# File 'lib/pioneer/base.rb', line 14

# Reader for the log level.
#
# @return [Object] the value held in @log_level (nil when it was never set)
def log_level
  instance_variable_get(:@log_level)
end

#nameObject (readonly)

Returns the value of attribute name.



14
15
16
# File 'lib/pioneer/base.rb', line 14

# Reader for the crawler name.
#
# @return [Object] the value held in @name (nil when it was never set)
def name
  instance_variable_get(:@name)
end

#redirectObject (readonly)

Returns the value of attribute redirect.



14
15
16
# File 'lib/pioneer/base.rb', line 14

# Reader for the redirect setting.
#
# The constructor stores the option in @redirects (plural), so reading only
# @redirect returned nil for every configured crawler. @redirect still wins
# when it is set; otherwise the value stored in @redirects is returned.
#
# @return [Object, nil] @redirect if set, else @redirects, else nil
def redirect
  @redirect || @redirects
end

#request_optsObject (readonly)

EmHttpRequest options



97
98
99
# File 'lib/pioneer/base.rb', line 97

# Reader for the request options.
#
# @return [Object] the value held in @request_opts (nil when it was never set)
def request_opts
  instance_variable_get(:@request_opts)
end

#sleepObject (readonly)

Sleep if the last request was made recently (less than the timeout period ago)



61
62
63
# File 'lib/pioneer/base.rb', line 61

# Reader for the sleep setting.
#
# NOTE(review): the rendered description talks about pausing between requests,
# but the body shown here only reads @sleep - confirm against lib/pioneer/base.rb.
#
# @return [Object] the value held in @sleep (nil when it was never set)
def sleep
  instance_variable_get(:@sleep)
end

Instance Method Details

#headersObject

Headers callback



118
119
120
# File 'lib/pioneer/base.rb', line 118

# Reader for the headers setting.
#
# @return [Object] the value held in @headers (nil when it was never set)
def headers
  instance_variable_get(:@headers)
end

#http_optsObject

Set headers, such as redirects, cookies etc



86
87
88
89
90
91
92
# File 'lib/pioneer/base.rb', line 86

# Assembles the options hash from the instance configuration.
#
# :head is taken from #random_header when @random_header is truthy and is then
# replaced by @header when that is set; :redirects is included only when
# @redirects is set. Keys whose source is unset are left out.
#
# @return [Hash] a new hash on every call
def http_opts
  {}.tap do |settings|
    settings[:head]      = random_header if @random_header
    settings[:head]      = @header       if @header
    settings[:redirects] = @redirects    if @redirects
  end
end

#loggerObject

Default Pioneer logger



75
76
77
78
79
80
81
# File 'lib/pioneer/base.rb', line 75

# Lazily creates the logger and caches it in @logger.
#
# The first call builds a Logger writing to STDOUT and sets its level from
# #log_level; later calls return the cached instance.
#
# @return [Logger]
def logger
  return @logger if @logger

  stdout_logger = Logger.new(STDOUT)
  stdout_logger.level = log_level
  @logger = stdout_logger
end

#override_method(method_name, arg) ⇒ Object

Overrides methods as singletons so they are available only for the current instance of the crawler



138
139
140
141
142
143
144
145
146
147
148
# File 'lib/pioneer/base.rb', line 138

# Defines +method_name+ as a singleton method on this instance only.
#
# When +arg+ is a Proc the new method takes one argument and passes it to
# +arg+; otherwise the new method takes no arguments and returns +arg+.
#
# @param method_name [Symbol] name of the singleton method to define
# @param arg [Proc, Object] callable to wrap, or a constant value to return
def override_method(method_name, arg)
  implementation =
    case arg
    when Proc then ->(req) { arg.call(req) }
    else -> { arg }
    end
  define_singleton_method(method_name, &implementation)
end

#random_headerObject

Generate random header for request



111
112
113
# File 'lib/pioneer/base.rb', line 111

# Supplies the value used for the request header when random headers are enabled.
#
# Delegates to HttpHeader.random, which is defined outside this block.
# NOTE(review): presumably returns a header hash - confirm against HttpHeader.
def random_header
  HttpHeader.random
end

#startObject

Main method: starts crawling through locations. If we catch Pioneer::HttpRetryRequest, we retry the request. If we catch Pioneer::HttpSkipRequest, we skip the request and add nothing to the result.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/pioneer/base.rb', line 37

# Runs the crawl and collects the outcome of every request.
#
# Inside an EM.synchrony block, iterates over #locations with
# EM::Synchrony::FiberIterator using #concurrency. For each url it calls
# #sleep and then performs a Request, appending the outcome to the result.
# Pioneer::HttpRetryRequest bumps the attempt counter and retries the same
# url; Pioneer::HttpSkipRequest leaves the url out of the result.
# EM.stop is called once the iteration has finished.
#
# @return [Array] outcomes of Request#perform, in the order they were appended
def start
  collected = []
  EM.synchrony do
    iterator = EM::Synchrony::FiberIterator.new(locations, concurrency)
    iterator.map do |url|
      attempt = 0
      begin
        sleep
        collected << Request.new(url, self, attempt).perform
      rescue Pioneer::HttpRetryRequest
        # Re-run the begin block for the same url with the next attempt number.
        attempt += 1
        retry
      rescue Pioneer::HttpSkipRequest
        nil # skipped urls add nothing to the result
      end
    end
    EM.stop
  end
  collected
end