Class: DhEasy::Core::Mock::FakeDb

Inherits:
Object
  • Object
show all
Defined in:
lib/dh_easy/core/mock/fake_db.rb

Overview

Fake in-memory database that emulates the black-box behavior of `DataHen` database objects.

Constant Summary collapse

PAGE_KEYS =

Page id keys, analog to primary keys.

['gid'].freeze
OUTPUT_KEYS =

Output id keys, analog to primary keys.

['_id', '_collection'].freeze
JOB_KEYS =

Job id keys, analog to primary keys.

['job_id'].freeze
JOB_STATUSES =

Available job statuses.

{
  active: 'active',
  done: 'done',
  cancelled: 'cancelled',
  paused: 'paused'
}
DEFAULT_COLLECTION =

Default collection for saved outputs

'default'
DEFAULT_FETCH_TYPE =

Default page’s fetch type

'standard'
DEFAULT_UUID_ALGORITHM =

Default uuid algorithm

:md5
VALID_UUID_ALGORITHMS =

Valid uuid algorithms

[:md5, :sha1, :sha256]

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(opts = {}) ⇒ FakeDb

Initialize fake database.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options.

Options Hash (opts):

  • :job_id (Integer, nil)

    Job id default value.

  • :scraper_name (String, nil)

    Scraper name default value.

  • :page_gid (String, nil)

    Page gid default value.

  • :allow_page_gid_override (Boolean, nil) — default: false

    Specifies whether the page gid can be overridden on page or output insert.

  • :allow_job_id_override (Boolean, nil) — default: false

    Specifies whether the job id can be overridden on page or output insert.

  • :uuid_algorithm (Enumerator, nil) — default: :md5

    Specify the algorithm to be used to generate UUID values.



382
383
384
385
386
387
388
389
# File 'lib/dh_easy/core/mock/fake_db.rb', line 382

# Set up the fake database from the given options.
#
# Recognised keys: :job_id, :scraper_name, :page_gid, :uuid_algorithm,
# :allow_page_gid_override and :allow_job_id_override.
def initialize(opts = {})
  # Values are applied through the setters (not raw ivars), in this order,
  # so that whatever each setter does on assignment still happens.
  self.job_id = opts[:job_id]
  self.scraper_name = opts[:scraper_name]
  self.page_gid = opts[:page_gid]
  self.uuid_algorithm = opts[:uuid_algorithm]
  # Both flags are stored as strict booleans; a missing option means false.
  @allow_page_gid_override = opts[:allow_page_gid_override] ? true : false
  @allow_job_id_override = opts[:allow_job_id_override] ? true : false
end

Class Method Details

.build_fake_job(opts = {}) ⇒ Hash

Build a fake job by using FakeDb engine.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Options Hash (opts):

  • :scraper_name (String) — default: nil

    Scraper name.

  • :job_id (Integer) — default: nil

    Job id.

  • :status (String) — default: 'done'

    Job status.

Returns:

  • (Hash)


245
246
247
248
249
250
251
252
# File 'lib/dh_easy/core/mock/fake_db.rb', line 245

# Build a fake job hash and pass it through build_job.
#
# opts[:job_id] and opts[:scraper_name] are copied as given; opts[:status]
# falls back to 'done' when it is nil or false.
def self.build_fake_job(opts = {})
  status = opts[:status] || 'done'
  build_job(
    {
      'job_id' => opts[:job_id],
      'scraper_name' => opts[:scraper_name],
      'status' => status
    },
    opts
  )
end

.build_fake_page(opts = {}) ⇒ Hash

Build a fake page by using FakeDb engine.

Parameters:

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Options Hash (opts):

  • :url (String) — default: 'https://example.com'

    Page url.

Returns:

  • (Hash)


90
91
92
93
94
95
# File 'lib/dh_easy/core/mock/fake_db.rb', line 90

# Build a fake page hash and pass it through build_page.
#
# The page url is opts[:url], or 'https://example.com' when that is nil/false.
def self.build_fake_page(opts = {})
  url = opts[:url] || 'https://example.com'
  build_page({ 'url' => url }, opts)
end

.build_job(job, opts = {}) ⇒ Hash

Build a job with defaults by using FakeDb engine.

Parameters:

  • job (Hash)

    Job initial values.

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Returns:

  • (Hash)


231
232
233
234
235
# File 'lib/dh_easy/core/mock/fake_db.rb', line 231

# Insert +job+ into the jobs collection of a throwaway FakeDb built from
# +opts+ and return the last job stored in that collection.
def self.build_job(job, opts = {})
  scratch_db = DhEasy::Core::Mock::FakeDb.new(opts)
  scratch_db.jobs << job
  scratch_db.jobs.last
end

.build_page(page, opts = {}) ⇒ Hash

Build a page with defaults by using FakeDb engine.

Parameters:

  • page (Hash)

    Page initial values.

  • opts (Hash) (defaults to: {})

    ({}) Configuration options (see #initialize).

Returns:

  • (Hash)


74
75
76
77
78
79
80
81
82
# File 'lib/dh_easy/core/mock/fake_db.rb', line 74

# Insert +page+ into the pages collection of a throwaway FakeDb and return
# the first page stored in that collection.
#
# Page gid and job id overriding are switched on by default here; explicit
# values in +opts+ take precedence.
def self.build_page(page, opts = {})
  override_defaults = { allow_page_gid_override: true, allow_job_id_override: true }
  scratch_db = DhEasy::Core::Mock::FakeDb.new(override_defaults.merge(opts))
  scratch_db.pages << page
  scratch_db.pages.first
end

.clean_uri(raw_url) ⇒ String

Clean a URL: remove the fragment, lowercase the scheme and host, and sort the query string.

Parameters:

  • raw_url (String)

    URL to clean.

Returns:

  • (String)


130
131
132
# File 'lib/dh_easy/core/mock/fake_db.rb', line 130

# String form of the URI object produced by clean_uri_obj for +raw_url+.
def self.clean_uri(raw_url)
  cleaned = clean_uri_obj(raw_url)
  cleaned.to_s
end

.clean_uri_obj(raw_url) ⇒ URI::HTTPS

Clean a URL: remove the fragment, lowercase the scheme and host, and sort the query string.

Parameters:

  • raw_url (String)

    URL to clean.

Returns:

  • (URI::HTTPS)


103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/dh_easy/core/mock/fake_db.rb', line 103

def self.clean_uri_obj raw_url
  url = URI.parse(raw_url)
  return url if raw_url =~ /^\s*about:blank\s*$/i
  url.hostname = url.hostname.downcase unless url.hostname.nil?
  url.fragment = nil

  # Sort query string keys
  unless url.query.nil?
    query_string = CGI.parse(url.query)
    keys = query_string.keys.sort
    data = []
    keys.each do |key|
      query_string[key].each do |value|
        data << "#{URI.encode key}=#{URI.encode value}"
      end
    end
    url.query = data.join('&')
  end
  url
end

.fake_uuid(seed = nil, algorithm = nil) ⇒ String

Generate a fake UUID.

Parameters:

  • seed (nil) (defaults to: nil)

    Object to use as seed for uuid.

  • algorithm (Enumerator) (defaults to: nil)

    (nil) Algorithm to use: md5 (default), sha1, sha256.

Returns:

  • (String)


44
45
46
47
48
49
50
51
52
53
54
55
# File 'lib/dh_easy/core/mock/fake_db.rb', line 44

# Hex digest of +seed+ (converted with to_s) used as a fake UUID.
#
# A nil seed is replaced by the current time plus a random number. A nil
# algorithm falls back to DEFAULT_UUID_ALGORITHM. :sha256 and :sha1 select
# the matching digest; any other value uses MD5.
def self.fake_uuid(seed = nil, algorithm = nil)
  seed ||= (Time.new.to_f + rand)
  algorithm ||= DEFAULT_UUID_ALGORITHM
  digest_class = { sha256: Digest::SHA256, sha1: Digest::SHA1 }.fetch(algorithm, Digest::MD5)
  digest_class.hexdigest(seed.to_s)
end

.new_collection(keys, opts = {}) ⇒ DhEasy::Core::SmartCollection

Generate a smart collection with keys and initial values.

Parameters:

  • keys (Array)

    Analog to primary keys, combination will be uniq.

  • opts (Hash) (defaults to: {})

    Configuration options (see DhEasy::Core::SmartCollection#initialize).

Returns:



34
35
36
# File 'lib/dh_easy/core/mock/fake_db.rb', line 34

# Create a DhEasy::Core::SmartCollection from +keys+ and +opts+.
def self.new_collection(keys, opts = {})
  DhEasy::Core::SmartCollection.new(keys, opts)
end

.output_uuid(data, uuid_algorithm = nil) ⇒ String

Generate a fake UUID based on the output fields that do not have a `_` prefix.

Parameters:

  • data (Hash)

    Output data.

  • uuid_algorithm (Enumerator) (defaults to: nil)

    (nil) Algorithm to use: md5 (default), sha1, sha256.

Returns:

  • (String)


63
64
65
66
# File 'lib/dh_easy/core/mock/fake_db.rb', line 63

# Fake UUID seeded from the entries of +data+ whose key (as a string) has a
# line starting with a character other than '_'.
#
# NOTE: the seed is Hash#hash of the filtered entries, and Ruby does not
# guarantee that value to be identical across interpreter runs.
def self.output_uuid(data, uuid_algorithm = nil)
  seeded_fields = data.select { |key, _value| /^[^_]/.match(key.to_s) }
  fake_uuid(seeded_fields.hash, uuid_algorithm)
end

.time_stamp(time = nil) ⇒ String

Return a timestamp

Parameters:

  • time (Time) (defaults to: nil)

    (nil) Time from which to get time stamp.

Returns:

  • (String)


259
260
261
262
# File 'lib/dh_easy/core/mock/fake_db.rb', line 259

# Format a time as a UTC timestamp such as "2020-01-01T12:30:10.5Z".
#
# The fractional part has microsecond precision with trailing zeros removed;
# when the fraction is zero it is omitted entirely (seconds are kept intact).
#
# @param time [Time, nil] time to format; defaults to the current time.
# @return [String] formatted timestamp.
def self.time_stamp time = nil
  time = Time.new if time.nil?
  # getutc returns a new Time, so the caller's object is not switched to UTC.
  stamp = time.getutc.strftime('%FT%T.%6N')
  # Strip only the fraction's trailing zeros (and the dot when nothing is
  # left); the match cannot reach into the seconds because it must end at
  # the end of the string and cannot cross the dot from the left.
  "#{stamp.sub(/\.?0+\z/, '')}Z"
end

Instance Method Details

#allow_job_id_override?Boolean

Indicates whether job id overriding by the user is allowed on page or output insert.

Returns:

  • (Boolean)

    ‘true` when allowed, else `false`.



366
367
368
# File 'lib/dh_easy/core/mock/fake_db.rb', line 366

# Current value of the job id override flag; an unset (nil) flag is stored
# as false before being returned.
def allow_job_id_override?
  @allow_job_id_override = false unless @allow_job_id_override
  @allow_job_id_override
end

#allow_page_gid_override?Boolean

Indicates whether page gid overriding by the user is allowed on page or output insert.

Returns:

  • (Boolean)

    ‘true` when allowed, else `false`.



348
349
350
# File 'lib/dh_easy/core/mock/fake_db.rb', line 348

# Current value of the page gid override flag; an unset (nil) flag is stored
# as false before being returned.
def allow_page_gid_override?
  @allow_page_gid_override = false unless @allow_page_gid_override
  @allow_page_gid_override
end

#disable_job_id_overrideObject

Disable job id override on page or output insert.



358
359
360
# File 'lib/dh_easy/core/mock/fake_db.rb', line 358

# Turn job id overriding off by setting the @allow_job_id_override flag to
# false (the flag read by allow_job_id_override?).
def disable_job_id_override
  @allow_job_id_override = false
end

#disable_page_gid_overrideObject

Disable page gid override on page or output insert.



340
341
342
# File 'lib/dh_easy/core/mock/fake_db.rb', line 340

# Turn page gid overriding off by setting the @allow_page_gid_override flag
# to false (the flag read by allow_page_gid_override?).
def disable_page_gid_override
  @allow_page_gid_override = false
end

#enable_job_id_overrideObject

Enable job id override on page or output insert.



353
354
355
# File 'lib/dh_easy/core/mock/fake_db.rb', line 353

# Turn job id overriding on by setting the @allow_job_id_override flag to
# true (the flag read by allow_job_id_override?).
def enable_job_id_override
  @allow_job_id_override = true
end

#enable_page_gid_overrideObject

Enable page gid override on page or output insert.



335
336
337
# File 'lib/dh_easy/core/mock/fake_db.rb', line 335

# Turn page gid overriding on by setting the @allow_page_gid_override flag
# to true (the flag read by allow_page_gid_override?).
def enable_page_gid_override
  @allow_page_gid_override = true
end

#ensure_job(target_job_id = nil) ⇒ Hash

Get current job or create new one from values.

Parameters:

  • target_job_id (Integer) (defaults to: nil)

    (nil) Job id whose existence should be ensured.

Returns:

  • (Hash)


269
270
271
272
273
274
275
276
277
278
279
280
# File 'lib/dh_easy/core/mock/fake_db.rb', line 269

# Return the stored job whose 'job_id' equals +target_job_id+, creating and
# storing it first when no such job exists.
#
# A nil +target_job_id+ means the current job_id. A newly created job gets
# the current scraper_name, and status 'active' only when it is the current
# job.
def ensure_job(target_job_id = nil)
  target_job_id = job_id if target_job_id.nil?
  existing = jobs.find { |candidate| candidate['job_id'] == target_job_id }
  return existing unless existing.nil?

  attributes = { 'job_id' => target_job_id, 'scraper_name' => scraper_name }
  attributes['status'] = 'active' if target_job_id == job_id
  jobs << attributes
  jobs.last
end

#fake_uuid(seed = nil) ⇒ String

Generate a fake UUID using the configured uuid algorithm.

Parameters:

  • seed (nil) (defaults to: nil)

    Object to use as seed for uuid.

Returns:

  • (String)


396
397
398
# File 'lib/dh_easy/core/mock/fake_db.rb', line 396

# Delegate to the class-level fake_uuid, passing this instance's
# uuid_algorithm along with +seed+.
def fake_uuid(seed = nil)
  algorithm = self.uuid_algorithm
  self.class.fake_uuid(seed, algorithm)
end

#generate_job_idInteger

Generate a fake job_id.

Returns:

  • (Integer)


410
411
412
# File 'lib/dh_easy/core/mock/fake_db.rb', line 410

# Next job id: 1 when no jobs are stored, otherwise the highest stored
# 'job_id' plus one.
def generate_job_id
  return 1 if jobs.count < 1

  newest = jobs.max_by { |job| job['job_id'] }
  newest['job_id'] + 1
end

#generate_output_id(data) ⇒ String

Generate a fake UUID for outputs.

Parameters:

  • data (Hash)

    Output data.

Returns:

  • (String)


634
635
636
637
# File 'lib/dh_easy/core/mock/fake_db.rb', line 634

# Return a freshly generated fake UUID for an output.
#
# +data+ is accepted but not used: the id is random to match Datahen
# behavior.
def generate_output_id(data)
  fake_uuid
end

#generate_page_gid(page_data) ⇒ String

Generate a fake UUID based on page data:

* url
* method
* headers
* fetch_type
* cookie
* no_redirect
* body
* ua_type
* no_url_encode
* http2
* driver (name)
* display (width and height)
* screenshot

Parameters:

  • page_data (Hash)

    Page data.

Returns:

  • (String)


457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
# File 'lib/dh_easy/core/mock/fake_db.rb', line 457

# Build a page gid of the form "<hostname>-<checksum>" from +page_data+.
#
# The checksum is a fake UUID of the '|'-joined "name:value" entries
# collected below, so both the order and the exact formatting of those
# entries determine the resulting gid.
#
# Returns an empty string when page_data['url'] is nil or blank.
def generate_page_gid page_data
  # ensure page url
  return "" if page_data['url'].nil? || page_data['url'].to_s.strip === ''

  # calculate extra fields, keep field order to match datahen
  data = []
  data << "method:#{page_data['method'].to_s.downcase}"
  no_url_encode = (!page_data['no_url_encode'].nil? && !!page_data['no_url_encode'])
  # The cleaned URI is always computed: its hostname prefixes the gid even
  # when the raw url is the one used in the seed.
  uri = self.class.clean_uri_obj(page_data['url'])
  url = (no_url_encode ? page_data['url'].to_s.lstrip : uri.to_s)
  data << "url:#{url}"
  headers = self.class.format_headers page_data['headers']
  data << "headers:#{headers}"
  data << "body:#{page_data['body'].to_s}"
  no_redirect = (!page_data['no_redirect'].nil? && !!page_data['no_redirect'])
  data << "no_redirect:#{no_redirect.to_s}"
  # A blank ua_type is treated as 'desktop'.
  ua_type = (page_data['ua_type'].to_s === '') ? 'desktop' : page_data['ua_type']
  data << "ua_type:#{ua_type}"

  # complex fields (each one is only added to the seed when it is set)
  data << "fetch_type:#{page_data['fetch_type']}" unless self.class.is_default_fetch_type? page_data['fetch_type']
  # keep this cookie logic to match datahen
  data << "cookie:#{page_data['cookie'].split(/;\s*/).sort.join(';')}" if page_data['cookie'].to_s.strip != ''
  data << "http2:true" if page_data.has_key?('http2') && !page_data['http2'].nil? && !!page_data['http2']
  data << "driverName:#{page_data['driver']['name']}" unless self.class.is_driver_empty? page_data['driver']
  unless self.class.is_display_empty? page_data['display']
    data << "display:#{page_data['display']['width']}x#{page_data['display']['height']}"
  end
  unless self.class.is_screenshot_empty? page_data['screenshot']
    # The screenshot settings enter the seed as a checksum of their JSON form.
    checksum = self.fake_uuid JSON.generate(page_data['screenshot'])
    data << "screenshot:#{checksum}"
  end

  # generate GID
  seed = data.join('|')
  checksum = self.fake_uuid seed
  "#{uri.hostname}-#{checksum}"
end

#generate_scraper_nameString

Generate a fake scraper name.

Returns:

  • (String)


403
404
405
# File 'lib/dh_easy/core/mock/fake_db.rb', line 403

# Fake scraper name: a slug taken from Faker's unique Internet generator.
def generate_scraper_name
  unique_generator = Faker::Internet.unique
  unique_generator.slug
end

#job_idInteger?

Fake job id.

Returns:

  • (Integer, nil)


297
298
299
# File 'lib/dh_easy/core/mock/fake_db.rb', line 297

# Current fake job id; when none is set, one is generated with
# generate_job_id and remembered.
def job_id
  return @job_id if @job_id

  @job_id = generate_job_id
end

#job_id=(value) ⇒ Object

Set fake job id value.



302
303
304
305
306
# File 'lib/dh_easy/core/mock/fake_db.rb', line 302

# Store +value+ as the fake job id, then call ensure_job so a job for the
# current id exists. The method body ends by evaluating job_id.
def job_id=(value)
  @job_id = value
  ensure_job
  job_id
end

#jobsDhEasy::Core::SmartCollection

Stored job collection



430
431
432
433
434
435
436
437
438
439
440
441
442
# File 'lib/dh_easy/core/mock/fake_db.rb', line 430

# Stored job collection, built on first use and memoized in @jobs.
#
# The collection is keyed by JOB_KEYS and uses job_defaults as defaults.
# Two events are bound: :before_defaults passes the raw item through
# DhEasy::Core.deep_stringify_keys, and :before_insert fills in a missing
# 'job_id' with generate_job_id.
def jobs
  if @jobs.nil?
    store = self.class.new_collection(JOB_KEYS, defaults: job_defaults)
    store.bind_event(:before_defaults) do |_store, raw_item|
      DhEasy::Core.deep_stringify_keys(raw_item)
    end
    store.bind_event(:before_insert) do |_store, item, _match|
      item['job_id'] ||= generate_job_id
      item
    end
    @jobs ||= store
  end
  @jobs
end

#outputsDhEasy::Core::SmartCollection

Stored output collection



655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
# File 'lib/dh_easy/core/mock/fake_db.rb', line 655

# Stored output collection, built on first use and memoized in @outputs.
#
# The collection is keyed by OUTPUT_KEYS and uses output_defaults as
# defaults. Bound events:
# * :before_defaults - stringify keys, then drop '_job_id' / '_gid_id'
#   unless the matching override flag is enabled.
# * :before_insert - fill in a missing '_id' with generate_output_id.
# * :after_insert - call ensure_job with the item's '_job_id'.
def outputs
  if @outputs.nil?
    store = self.class.new_collection(OUTPUT_KEYS, defaults: output_defaults)
    store.bind_event(:before_defaults) do |_store, raw_item|
      normalized = DhEasy::Core.deep_stringify_keys(raw_item)
      normalized.delete('_job_id') unless allow_job_id_override?
      # NOTE(review): '_gid_id' looks like it may have been meant as '_gid' -
      # confirm against the keys used by output_defaults.
      normalized.delete('_gid_id') unless allow_page_gid_override?
      normalized
    end
    store.bind_event(:before_insert) do |_store, item, _match|
      item['_id'] ||= generate_output_id(item)
      item
    end
    store.bind_event(:after_insert) do |_store, item|
      ensure_job(item['_job_id'])
    end
    @outputs ||= store
  end
  @outputs
end

#page_gidInteger?

Current fake page gid.

Returns:

  • (String, nil)


310
311
312
# File 'lib/dh_easy/core/mock/fake_db.rb', line 310

# Current fake page gid; when none is set, a fake UUID is generated and
# remembered.
def page_gid
  return @page_gid if @page_gid

  @page_gid = self.fake_uuid
end

#page_gid=(value) ⇒ Object

Set current fake page gid value.



315
316
317
# File 'lib/dh_easy/core/mock/fake_db.rb', line 315

# Store +value+ as the current fake page gid. A nil value makes the
# page_gid reader generate a new fake UUID on its next call.
def page_gid= value
  @page_gid = value
end

#pagesDhEasy::Core::SmartCollection

Note:

Page gid will be replaced on insert by an auto generated uuid unless page gid overriding is enabled (see #allow_page_gid_override?)

Stored page collection.



584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
# File 'lib/dh_easy/core/mock/fake_db.rb', line 584

# Stored page collection, built on first use and memoized in @pages.
#
# The collection is keyed by PAGE_KEYS and uses page_defaults as defaults.
# Bound events:
# * :before_defaults - stringify keys, merge hash-valued 'driver', 'display'
#   and 'screenshot' over their defaults, and drop 'job_id' unless job id
#   overriding is enabled.
# * :before_insert - blank out empty complex fields, set 'hostname', set
#   'gid' (always regenerated unless page gid overriding is enabled) and
#   fill in missing timestamps.
# * :after_insert - call ensure_job with the item's 'job_id'.
#
# @return [DhEasy::Core::SmartCollection]
def pages
  # Guard on @pages: checking any other variable would rebuild the
  # collection and rebind every event on each call.
  return @pages unless @pages.nil?

  defaults = self.page_defaults
  collection = self.class.new_collection PAGE_KEYS,
    defaults: defaults
  collection.bind_event(:before_defaults) do |_collection, raw_item|
    item = DhEasy::Core.deep_stringify_keys raw_item
    if !item['driver'].nil? && item['driver'].is_a?(Hash)
      item['driver'] = defaults['driver'].merge item['driver']
    end
    if !item['display'].nil? && item['display'].is_a?(Hash)
      item['display'] = defaults['display'].merge item['display']
    end
    if !item['screenshot'].nil? && item['screenshot'].is_a?(Hash)
      item['screenshot'] = defaults['screenshot'].merge item['screenshot']
    end
    item.delete 'job_id' unless allow_job_id_override?
    item
  end
  collection.bind_event(:before_insert) do |_collection, item, _match|
    item['driver'] = nil if self.class.is_driver_empty? item['driver']
    item['display'] = nil if self.class.is_display_empty? item['display']
    item['screenshot'] = nil if self.class.is_screenshot_empty? item['screenshot']
    item['headers'] = nil if self.class.is_hash_empty? item['headers']
    item['vars'] = nil if self.class.is_hash_empty? item['vars']
    uri = self.class.clean_uri_obj(item['url'])
    item['hostname'] = (item['url'] =~ /^about:blank$/i) ? '127.0.0.1' : uri.hostname
    if item['gid'].nil? || !allow_page_gid_override?
      item['gid'] = generate_page_gid item
    end

    # 30 days = 60 * 60 * 24 * 30 = 2592000
    item['freshness'] ||= self.class.time_stamp(Time.now - 2592000)
    item['to_fetch'] ||= self.class.time_stamp
    item['created_at'] ||= self.class.time_stamp
    item
  end
  collection.bind_event(:after_insert) do |_collection, item|
    ensure_job item['job_id']
  end
  @pages ||= collection
end

#query(collection, filter, offset = 0, limit = nil) ⇒ Object

Note:

Warning: It uses table scan to filter and should be used on test suites only.

Search items from a collection.

Parameters:

  • collection (Symbol)

    Allowed values: `:outputs`, `:pages`, `:jobs`.

  • filter (Hash)

    Filters to query.

  • offset (Integer) (defaults to: 0)

    (0) Search results offset.

  • limit (Integer, nil) (defaults to: nil)

    (nil) Limit search results count. Set to ‘nil` for unlimited.

Raises:

  • ArgumentError On unknown collection.



702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
# File 'lib/dh_easy/core/mock/fake_db.rb', line 702

# Scan a collection and return the items accepted by match? for +filter+.
#
# +collection+ is :outputs, :pages or :jobs; anything else raises
# ArgumentError. The first +offset+ matching items are skipped and at most
# +limit+ items are returned (nil means no limit). A limit that is not
# greater than zero yields an empty array without scanning.
def query(collection, filter, offset = 0, limit = nil)
  return [] if !limit.nil? && !(limit > 0)

  # Get collection items
  source = case collection
           when :outputs then outputs
           when :pages then pages
           when :jobs then jobs
           else raise ArgumentError.new "Unknown collection #{collection}."
           end

  # Search items
  seen = 0
  found = []
  source.each do |candidate|
    next unless match? candidate, filter

    seen += 1
    # Skip until offset
    next if seen <= offset
    # Break on limit reach
    break if !limit.nil? && found.count >= limit

    found << candidate
  end
  found
end

#refetch(job_id, gid) ⇒ Object

Refetch a page.

Parameters:

  • job_id (Integer)

    Page’s job_id to refetch.

  • gid (String)

    Page’s gid to refetch.

Raises:



737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
# File 'lib/dh_easy/core/mock/fake_db.rb', line 737

# Reset the page matching +job_id+ and +gid+ to its "to fetch" state.
#
# Raises Exception when no such page is found. Otherwise the fetching,
# parsing, response and failure fields listed below are reset, and
# 'freshness' and 'to_fetch' are set to the current time stamp. Returns nil.
def refetch(job_id, gid)
  target = pages.find_match('gid' => gid, 'job_id' => job_id)
  if target.nil?
    raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"")
  end

  resets = {
    'status' => 'to_fetch',
    'freshness' => self.class.time_stamp,
    'to_fetch' => self.class.time_stamp,
    'fetched_from' => nil,
    'fetching_at' => '2001-01-01T00:00:00Z',
    'fetched_at' => nil,
    'fetching_try_count' => 0,
    'effective_url' => nil,
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'response_checksum' => nil,
    'response_status' => nil,
    'response_status_code' => nil,
    'response_headers' => nil,
    'response_cookie' => nil,
    'response_proto' => nil,
    'content_type' => nil,
    'content_size' => 0,
    'failed_response_status_code' => nil,
    'failed_response_headers' => nil,
    'failed_response_cookie' => nil,
    'failed_effective_url' => nil,
    'failed_at' => nil,
    'failed_content_type' => nil
  }
  # Assign field by field, in the order listed above.
  resets.each { |field, value| target[field] = value }
  # The value of the last assignment (nil) is the method's result.
  nil
end

#reparse(job_id, gid) ⇒ Object

Reparse a page.

Parameters:

  • job_id (Integer)

    Page’s job_id to reparse.

  • gid (String)

    Page’s gid to reparse.

Raises:



774
775
776
777
778
779
780
781
782
783
784
# File 'lib/dh_easy/core/mock/fake_db.rb', line 774

# Reset the page matching +job_id+ and +gid+ to its "to parse" state.
#
# Raises Exception when no such page is found. Otherwise the parsing
# fields listed below are reset. Returns 0.
def reparse(job_id, gid)
  target = pages.find_match('gid' => gid, 'job_id' => job_id)
  if target.nil?
    raise Exception.new("Page not found with job_id \"#{job_id}\" gid \"#{gid}\"")
  end

  resets = {
    'status' => 'to_parse',
    'parsing_at' => nil,
    'parsing_failed_at' => nil,
    'parsing_updated_at' => '2001-01-01T00:00:00Z',
    'parsed_at' => nil,
    'parsing_try_count' => 0,
    'parsing_fail_count' => 0
  }
  # Assign field by field, in the order listed above.
  resets.each { |field, value| target[field] = value }
  # The value of the last assignment (0) is the method's result.
  resets['parsing_fail_count']
end

#scraper_nameString?

Fake scraper_name.

Returns:

  • (String, nil)


284
285
286
# File 'lib/dh_easy/core/mock/fake_db.rb', line 284

# Current fake scraper name; falls back to (and remembers) 'my_scraper'
# when none is set.
def scraper_name
  return @scraper_name if @scraper_name

  @scraper_name = 'my_scraper'
end

#scraper_name=(value) ⇒ Object

Set fake scraper_name value.



289
290
291
292
293
# File 'lib/dh_easy/core/mock/fake_db.rb', line 289

# Store +value+ as the fake scraper name and copy the resulting
# scraper_name onto the current job.
def scraper_name=(value)
  # The job is looked up (or created) before the new name is stored.
  current_job = ensure_job
  @scraper_name = value
  current_job['scraper_name'] = scraper_name
end

#uuid_algorithmEnumerator?

Current UUID algorithm.

Returns:

  • (Enumerator, nil)


321
322
323
# File 'lib/dh_easy/core/mock/fake_db.rb', line 321

# Current UUID algorithm; falls back to (and remembers)
# DEFAULT_UUID_ALGORITHM when none is set.
def uuid_algorithm
  return @uuid_algorithm if @uuid_algorithm

  @uuid_algorithm = DEFAULT_UUID_ALGORITHM
end

#uuid_algorithm=(value) ⇒ Object

Set current UUID algorithm value.

Raises:

  • (ArgumentError)

    Whenever an invalid algorithm is provided



327
328
329
330
331
332
# File 'lib/dh_easy/core/mock/fake_db.rb', line 327

# Store +value+ as the UUID algorithm.
#
# nil is accepted as-is; any other value must be listed in
# VALID_UUID_ALGORITHMS, otherwise ArgumentError is raised and the stored
# algorithm is left untouched.
def uuid_algorithm=(value)
  if !value.nil? && !VALID_UUID_ALGORITHMS.include?(value)
    raise ArgumentError.new("Invalid UUID algorithm, valid values are :md5, :sha1, :sha256")
  end
  @uuid_algorithm = value
end