Class: Kimurai::Base

Inherits:
Object
  • Object
show all
Includes:
BaseHelper
Defined in:
lib/kimurai/base.rb,
lib/kimurai/base/saver.rb,
lib/kimurai/base/storage.rb

Direct Known Subclasses

ApplicationSpider

Defined Under Namespace

Classes: InvalidUrlError, Saver, Storage

Constant Summary collapse

DMERGE_EXCLUDE =

Config options that are not deep-merged: the `headers` hash is excluded.

[:headers].freeze
# Logger formatter proc. Builds one log line from the severity initial, the
# timestamp, the process id, a thread marker ('M' for the main thread, 'C' for
# any other thread) followed by the current thread's object_id, the full
# severity, the progname and the message.
#
# The line is passed through Rbcat.colorize only when
# Kimurai.configuration.colorize_logger is not explicitly false and
# Kimurai.env is 'development'; otherwise it is returned unchanged.
LoggerFormatter =
proc do |severity, datetime, progname, msg|
  current_thread_id = Thread.current.object_id
  thread_type = Thread.main == Thread.current ? 'M' : 'C'
  output = format("%s, [%s#%d] [%s: %s] %5s -- %s: %s\n", severity[0..0], datetime, $PROCESS_ID, thread_type,
                  current_thread_id, severity, progname, msg)

  # `!= false` is deliberate: an unset (nil) option still enables colorizing.
  if Kimurai.configuration.colorize_logger != false && Kimurai.env == 'development'
    # %i[...] is the symbol-array literal, i.e. [:jsonhash, :logger].
    Rbcat.colorize(output, predefined: %i[jsonhash logger])
  else
    output
  end
end

Class Attribute Summary collapse

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Methods included from BaseHelper

#extract

Constructor Details

#initialize(engine = self.class.engine, config: {}) ⇒ Base

Returns a new instance of Base.



189
190
191
192
193
194
195
196
197
198
199
200
201
# File 'lib/kimurai/base.rb', line 189

# Builds a spider instance.
#
# @param engine [Object] engine to use; a nil value falls back to the class-level engine
# @param config [Hash] per-instance options merged over the class-level config
#   (keys listed in DMERGE_EXCLUDE are passed to deep_merge_excl as exclusions)
def initialize(engine = self.class.engine, config: {})
  @engine = engine || self.class.engine
  @config = self.class.config.deep_merge_excl(config, DMERGE_EXCLUDE)

  # One pipeline instance per configured name, each wired back to this spider.
  @pipelines = self.class.pipelines.each_with_object({}) do |pipeline_name, registry|
    pipeline_class = Pipeline.descendants.find { |candidate| candidate.name == pipeline_name }
    pipeline = pipeline_class.new
    pipeline.spider = self
    registry[pipeline_name] = pipeline
  end

  @logger = self.class.logger
  @savers = {}
end

Class Attribute Details

.run_info ⇒ Object (readonly)

Returns the value of attribute run_info.



30
31
32
# File 'lib/kimurai/base.rb', line 30

# Reader for the class-level @run_info.
def run_info
  @run_info
end

.savers ⇒ Object (readonly)

Returns the value of attribute savers.



30
31
32
# File 'lib/kimurai/base.rb', line 30

# Reader for the class-level @savers.
def savers
  @savers
end

.storage ⇒ Object (readonly)

Returns the value of attribute storage.



30
31
32
# File 'lib/kimurai/base.rb', line 30

# Reader for the class-level @storage.
def storage
  @storage
end

Instance Attribute Details

#logger ⇒ Object (readonly)

Returns the value of attribute logger.



186
187
188
# File 'lib/kimurai/base.rb', line 186

# Reader for this instance's @logger.
def logger
  @logger
end

#with_info ⇒ Object

Returns the value of attribute with_info.



187
188
189
# File 'lib/kimurai/base.rb', line 187

# Reader for this instance's @with_info flag.
def with_info
  @with_info
end

Class Method Details

.add_event(scope, event) ⇒ Object



59
60
61
62
63
# File 'lib/kimurai/base.rb', line 59

# Bumps the counter stored at @run_info[:events][scope][event] by one while
# holding @update_mutex. Returns nil without doing anything when @run_info is
# not set.
#
# @param scope [Object] key into @run_info[:events]
# @param event [Object] counter key inside that scope
# @return [Object, nil] the incremented counter value, or nil when there is no run info
def self.add_event(scope, event)
  return unless @run_info

  @update_mutex.synchronize do
    scoped_counters = @run_info[:events][scope]
    scoped_counters[event] += 1
  end
end

.completed? ⇒ Boolean

Returns:

  • (Boolean)


37
38
39
# File 'lib/kimurai/base.rb', line 37

# Whether the current run info has status :completed.
#
# @return [Boolean, nil] nil when @run_info is not set
def self.completed?
  info = @run_info
  info && info[:status] == :completed
end

.config ⇒ Object



91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/kimurai/base.rb', line 91

# Effective class-level config.
#
# When the superclass is Object this is just @config; otherwise the
# superclass's config is merged with this class's @config (an empty hash when
# unset). If a delay is set, it is additionally merged in as
# { before_request: { delay: ... } }.
#
# @return [Hash, nil] the merged config
def self.config
  parent = superclass
  merged = parent.equal?(::Object) ? @config : parent.config.deep_merge_excl(@config || {}, DMERGE_EXCLUDE)

  request_delay = delay
  return merged unless request_delay

  merged.deep_merge_excl({ before_request: { delay: request_delay } }, DMERGE_EXCLUDE)
end

.crawl!(exception_on_fail: true) ⇒ Object



116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/kimurai/base.rb', line 116

# Runs the spider once, tracking statistics in @run_info.
#
# Refuses to start while a previous run is still marked as running: an error
# is logged and false is returned. Otherwise the class-level storage, savers
# and mutex are reset, open_spider is called when defined, and a new spider
# instance either requests every entry of start_urls with the :parse handler
# or, when start_urls is not set, calls #parse directly. A Hash entry in
# start_urls is forwarded to #request_to as keyword arguments; any other entry
# is used as the url.
#
# @param exception_on_fail [Boolean] re-raise the captured exception when true;
#   when false the failure is returned instead
# @return [Hash, Array, false] the run info hash on success, [run_info, exception]
#   for a failure that is not re-raised, false when the spider is already running
def self.crawl!(exception_on_fail: true)
  # Explicit branch instead of `logger.error(...) and return false`: the early
  # return must not depend on the logger call returning a truthy value.
  if running?
    logger.error "Spider: already running: #{name}"
    return false
  end

  @storage = Storage.new
  @savers = {}
  @update_mutex = Mutex.new

  @run_info = {
    spider_name: name, status: :running, error: nil, environment: Kimurai.env,
    start_time: Time.new, stop_time: nil, running_time: nil,
    visits: { requests: 0, responses: 0 }, items: { sent: 0, processed: 0 },
    events: { requests_errors: Hash.new(0), drop_items_errors: Hash.new(0), custom: Hash.new(0) }
  }

  logger.info "Spider: started: #{name}"
  open_spider if respond_to? :open_spider

  spider = new
  spider.with_info = true
  if start_urls
    start_urls.each do |start_url|
      if start_url.instance_of?(Hash)
        # Splat the hash so its entries arrive as keywords (url:, data:, ...)
        # rather than binding to request_to's optional positional argument.
        spider.request_to(:parse, **start_url)
      else
        spider.request_to(:parse, url: start_url)
      end
    end
  else
    spider.parse
  end
rescue StandardError, SignalException, SystemExit => e
  @run_info.merge!(status: :failed, error: e.inspect)
  exception_on_fail ? raise(e) : [@run_info, e]
else
  @run_info.merge!(status: :completed)
ensure
  # Teardown only happens when a spider instance was actually created.
  if spider
    spider.browser.destroy_driver! if spider.instance_variable_get('@browser')

    stop_time  = Time.now
    total_time = (stop_time - @run_info[:start_time]).round(3)
    @run_info.merge!(stop_time: stop_time, running_time: total_time)

    close_spider if respond_to? :close_spider

    message = "Spider: stopped: #{@run_info.merge(running_time: @run_info[:running_time]&.duration)}"
    failed? ? logger.fatal(message) : logger.info(message)

    # Multiple assignment from a single nil resets all four variables to nil.
    @run_info, @storage, @savers, @update_mutex = nil
  end
end

.delay ⇒ Object



87
88
89
# File 'lib/kimurai/base.rb', line 87

# Class-level delay, memoized in @delay. When unset it is taken from the
# superclass if the superclass responds to :delay, otherwise it is nil.
def self.delay
  return @delay if @delay

  parent = superclass
  @delay = parent.respond_to?(:delay) ? parent.delay : nil
end

.engine ⇒ Object



75
76
77
# File 'lib/kimurai/base.rb', line 75

# Class-level engine, memoized in @engine; falls back to the superclass's
# engine when this class has none set.
def self.engine
  return @engine if @engine

  @engine = superclass.engine
end

.failed? ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
# File 'lib/kimurai/base.rb', line 41

# Whether the current run info has status :failed.
#
# @return [Boolean, nil] nil when @run_info is not set
def self.failed?
  info = @run_info
  info && info[:status] == :failed
end

.items ⇒ Object



49
50
51
# File 'lib/kimurai/base.rb', line 49

# The :items entry of the current run info.
#
# @return [Object, nil] nil when @run_info is not set
def self.items
  info = @run_info
  info && info[:items]
end

.logger ⇒ Object



108
109
110
111
112
113
114
# File 'lib/kimurai/base.rb', line 108

# Class-level logger, memoized in @logger.
#
# Uses Kimurai.configuration.logger when one is configured. Otherwise builds a
# Logger writing to $stdout with LoggerFormatter, this class's name as the
# progname, and a level taken from ENV['LOG_LEVEL'], then
# Kimurai.configuration.log_level, then 'DEBUG'.
def self.logger
  return @logger if @logger

  configured = Kimurai.configuration.logger
  @logger = configured || begin
    level_name = (ENV['LOG_LEVEL'] || Kimurai.configuration.log_level || 'DEBUG').to_s.upcase
    level = "Logger::#{level_name}".constantize
    Logger.new($stdout, formatter: LoggerFormatter, level: level, progname: name)
  end
end

.name ⇒ Object



71
72
73
# File 'lib/kimurai/base.rb', line 71

# Spider name: the explicitly set @name, or else the underscored form of
# the class's to_s.
def self.name
  return @name if @name

  to_s.underscore
end

.parse!(handler, *args, **request) ⇒ Object



170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/kimurai/base.rb', line 170

# Runs a single handler on a fresh spider instance, outside of .crawl!.
#
# With positional args the handler is called directly with them; with only
# keyword options the call goes through #request_to using those options;
# with neither, the handler is called without arguments. The browser driver
# is destroyed afterwards if the spider created a browser.
#
# @param handler [Symbol] name of the public spider method to invoke
# @param args [Array] positional arguments passed straight to the handler
# @param request [Hash] keyword options forwarded to #request_to (e.g. url:, data:)
# @return [Object] whatever the invoked handler / #request_to returns
def self.parse!(handler, *args, **request)
  spider = new

  if args.present?
    spider.public_send(handler, *args)
  elsif request.present?
    # Splat the options: passed as a bare Hash they would bind to request_to's
    # optional positional parameter and the required url: keyword would be missing.
    spider.request_to(handler, **request)
  else
    spider.public_send(handler)
  end
ensure
  spider.browser.destroy_driver! if spider.instance_variable_get('@browser')
end

.pipelines ⇒ Object



79
80
81
# File 'lib/kimurai/base.rb', line 79

# Class-level pipelines, memoized in @pipelines; falls back to the
# superclass's pipelines when this class has none set.
def self.pipelines
  return @pipelines if @pipelines

  @pipelines = superclass.pipelines
end

.running? ⇒ Boolean

Returns:

  • (Boolean)


33
34
35
# File 'lib/kimurai/base.rb', line 33

# Whether the current run info has status :running.
#
# @return [Boolean, nil] nil when @run_info is not set
def self.running?
  info = @run_info
  info && info[:status] == :running
end

.start_urls ⇒ Object



83
84
85
# File 'lib/kimurai/base.rb', line 83

# Reader for the class-level @start_urls (nil when none were set).
def self.start_urls
  @start_urls
end

.update(type, subtype) ⇒ Object



53
54
55
56
57
# File 'lib/kimurai/base.rb', line 53

# Bumps the counter stored at @run_info[type][subtype] by one while holding
# @update_mutex. Returns nil without doing anything when @run_info is not set.
#
# @param type [Object] top-level key in @run_info (e.g. :visits, :items)
# @param subtype [Object] counter key inside that entry
# @return [Object, nil] the incremented counter value, or nil when there is no run info
def self.update(type, subtype)
  return unless @run_info

  @update_mutex.synchronize do
    counters = @run_info[type]
    counters[subtype] += 1
  end
end

.visits ⇒ Object



45
46
47
# File 'lib/kimurai/base.rb', line 45

# The :visits entry of the current run info.
#
# @return [Object, nil] nil when @run_info is not set
def self.visits
  info = @run_info
  info && info[:visits]
end

Instance Method Details

#add_event(scope = :custom, event) ⇒ Object



252
253
254
255
256
# File 'lib/kimurai/base.rb', line 252

# Records an event. The class-level counter is only updated when with_info is
# set; events in the :custom scope are additionally logged at info level.
#
# @param scope [Symbol] event scope, :custom when omitted
# @param event [Object] event identifier
def add_event(scope = :custom, event)
  self.class.add_event(scope, event) if with_info
  return unless scope == :custom

  logger.info "Spider: new event (scope: #{scope}): #{event}"
end

#browser ⇒ Object



203
204
205
# File 'lib/kimurai/base.rb', line 203

# Browser for this spider, built on first use by BrowserBuilder from the
# instance's engine and config, then memoized in @browser.
def browser
  return @browser if @browser

  @browser = BrowserBuilder.build(@engine, @config, spider: self)
end

#console(response = nil, url: nil, data: {}) ⇒ Object



221
222
223
# File 'lib/kimurai/base.rb', line 221

# Debugging hook: drops into a Pry session at this point. The parameters are
# not used by the body itself; they are locals of the binding the session
# opens on, so they can be inspected from the console.
#
# @param response [Object, nil]
# @param url [String, nil]
# @param data [Hash]
def console(response = nil, url: nil, data: {})
  binding.pry
end

#request_to(handler, delay = nil, url:, data: {}, response_type: :html) ⇒ Object

Raises:

  • (InvalidUrlError) — if the requested url does not parse as an HTTP(S) URI



207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/kimurai/base.rb', line 207

# Visits +url+ with the browser and hands the response to +handler+.
#
# When the :skip_duplicate_requests config option is set and the url has
# already been requested, the request is skipped: a :duplicate_requests event
# is recorded (only when with_info is set), a warning is logged and nil is
# returned. nil is also returned when the browser visit returns a falsy value.
#
# @param handler [Symbol] public method called with the response plus url: and data:
# @param delay [Object, nil] when given, passed to the browser visit as delay:
# @param url [String] address to request
# @param data [Hash] extra data passed through to the handler
# @param response_type [Symbol] forwarded to browser.current_response
# @return [Object, nil] the handler's return value, or nil when the request was skipped
# @raise [InvalidUrlError] if url does not parse as a URI::HTTP
def request_to(handler, delay = nil, url:, data: {}, response_type: :html)
  raise InvalidUrlError, "Requested url is invalid: #{url}" unless URI.parse(url).is_a?(URI::HTTP)

  if @config[:skip_duplicate_requests] && !unique_request?(url)
    add_event(:duplicate_requests) if with_info
    # Return unconditionally: skipping must not depend on what logger.warn returns.
    logger.warn "Spider: request_to: not unique url: #{url}, skipped"
    return
  end

  visited = delay ? browser.visit(url, delay: delay) : browser.visit(url)
  return unless visited

  public_send(handler, browser.current_response(response_type), url: url, data: data)
end

#save_to(path, item, format:, position: true, append: false) ⇒ Object



237
238
239
240
241
242
243
244
245
246
247
248
# File 'lib/kimurai/base.rb', line 237

# Saves +item+ through the Saver registered for +path+, creating the saver on
# first use. When with_info is set the saver is looked up in (or added to) the
# class-level savers so it is shared by path; otherwise a saver private to
# this instance is created.
#
# @param path [Object] destination key, also passed to Saver.new
# @param item [Object] item handed to Saver#save
# @param format [Object] forwarded to Saver.new
# @param position [Boolean] forwarded to Saver.new
# @param append [Boolean] forwarded to Saver.new
# @return [Object] whatever Saver#save returns
def save_to(path, item, format:, position: true, append: false)
  unless @savers[path]
    saver_options = { format: format, position: position, append: append }
    @savers[path] =
      if with_info
        self.class.savers[path] ||= Saver.new(path, **saver_options)
      else
        Saver.new(path, **saver_options)
      end
  end

  @savers[path].save(item)
end

#storage ⇒ Object



227
228
229
230
231
# File 'lib/kimurai/base.rb', line 227

# Storage used by this instance, memoized in @storage. When with_info is set
# (as .crawl! does) the class-level storage is used; otherwise the instance
# gets its own new Storage.
def storage
  return @storage if @storage

  @storage = with_info ? self.class.storage : Storage.new
end

#unique?(scope, value) ⇒ Boolean

Returns:

  • (Boolean)


233
234
235
# File 'lib/kimurai/base.rb', line 233

# Delegates to the storage's unique? check for +value+ within +scope+.
#
# @param scope [Object] passed through to storage.unique?
# @param value [Object] passed through to storage.unique?
# @return [Boolean] per the page's signature; the actual value is whatever storage.unique? returns
def unique?(scope, value)
  storage.unique?(scope, value)
end