Class: Html2rss::AutoSource::Scraper::Schema

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/html2rss/auto_source/scraper/schema.rb,
lib/html2rss/auto_source/scraper/schema/thing.rb,
lib/html2rss/auto_source/scraper/schema/item_list.rb,
lib/html2rss/auto_source/scraper/schema/list_item.rb,
lib/html2rss/auto_source/scraper/schema/category_extractor.rb

Overview

#initialize(parsed_body, url:, **opts) ⇒ Schema

Returns a new instance of Schema.



84
85
86
87
88
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 84

# Builds a scraper, keeping the given arguments in instance variables.
#
# @param parsed_body [Object] stored as-is in @parsed_body
# @param url [Object] stored as-is in @url
# @param opts [Hash] any further keyword arguments, stored in @opts
def initialize(parsed_body, url:, **opts)
  @opts = opts
  @url = url
  @parsed_body = parsed_body
end

Class Method Details

.articles?(parsed_body) ⇒ Boolean

Returns:

  • (Boolean)


25
26
27
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 25

# Tells whether at least one node selected by TAG_SELECTOR passes
# supported_schema_type?.
#
# @param parsed_body [#css] document queried with TAG_SELECTOR
# @return [Boolean]
def articles?(parsed_body)
  script_tags = parsed_body.css(TAG_SELECTOR)
  script_tags.any? { |tag| supported_schema_type?(tag) }
end

.from(object) ⇒ Array<Hash>

Returns a flat array of all supported schema objects by recursively traversing the given object.

:reek:DuplicateMethodCall

Parameters:

  • object (Hash, Array, Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)

    the schema_objects, or an empty array



42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 42

# Recursively collects every supported schema object found in +object+.
#
# - A Nokogiri element is first converted with parse_script_tag.
# - A Hash accepted by supported_schema_object? is returned as-is; its nested
#   values are not searched any further.
# - Any other Hash is searched through its values, an Array through its items.
# - Everything else contributes nothing.
#
# @param object [Hash, Array, Nokogiri::XML::Element]
# @return [Array<Hash>] the schema objects, or an empty array
def from(object)
  case object
  in Nokogiri::XML::Element
    from(parse_script_tag(object))
  in Hash
    return [object] if supported_schema_object?(object)

    object.values.flat_map { |value| from(value) }
  in Array
    object.flat_map { |element| from(element) }
  else
    []
  end
end

.options_key ⇒ Object



22
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 22

# Returns the symbol :schema.
# NOTE(review): presumably the key under which this scraper's options are
# looked up — confirm against callers.
#
# @return [Symbol]
def self.options_key
  :schema
end

.scraper_for_schema_object(schema_object) ⇒ Scraper::Schema::Thing, ...

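Returns the scraper class for the object's @type (its instances respond to #call), or nil when the @type is unsupported.

Returns:

  • (Scraper::Schema::Thing, Scraper::Schema::ItemList, nil)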



61
62
63
64
65
66
67
68
69
70
71
72
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 61

# Picks the scraper class matching the object's :@type entry.
#
# Thing is checked before ItemList. When neither lists the type, a debug
# message is logged and nil is returned.
#
# @param schema_object [Hash] read via schema_object[:@type]
# @return [Class, nil] Thing, ItemList, or nil for an unsupported type
def scraper_for_schema_object(schema_object)
  schema_type = schema_object[:@type]

  return Thing if Thing::SUPPORTED_TYPES.member?(schema_type)
  return ItemList if ItemList::SUPPORTED_TYPES.member?(schema_type)

  Log.debug("#{name}: unsupported schema object @type=#{schema_type.inspect}")
  nil
end

.supported_schema_object?(object) ⇒ Boolean

Returns:

  • (Boolean)


55
56
57
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 55

# Tells whether scraper_for_schema_object returns something truthy for
# +object+.
#
# @param object [Hash] passed through to scraper_for_schema_object
# @return [Boolean]
def supported_schema_object?(object)
  !!scraper_for_schema_object(object)
end

.supported_schema_type?(script) ⇒ Boolean

Returns:

  • (Boolean)


29
30
31
32
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 29

# Tells whether the script's text contains a `"@type": "<type>"` pair for any
# type listed in Thing::SUPPORTED_TYPES or ItemList::SUPPORTED_TYPES.
#
# This is a textual check on the raw script content; the content is not
# parsed here.
#
# @param script [#text] node whose text is inspected
# @return [Boolean]
def supported_schema_type?(script)
  # Read the text once: it does not change between candidate types.
  text = script.text
  supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES

  supported_types.any? { |type| text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
end

Instance Method Details

#each {|Hash| ... } ⇒ Array<Hash>

Returns the scraped article_hashes.

Yields:

  • (Hash)

    Each scraped article_hash

Returns:

  • (Array<Hash>)

    the scraped article_hashes



93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 93

# Runs the matching scraper for every schema object and yields each scraped
# article hash to the block.
#
# Schema objects without a scraper class, and scrapers whose #call returns a
# falsy value, are skipped. An Array result is yielded element by element;
# any other result is yielded as a single value.
#
# @yield [Hash] each scraped article_hash
# @return [Enumerator] when no block is given
# @return [Array] otherwise, whatever filter_map collects from the per-object
#   block values
def each(&)
  return enum_for(:each) unless block_given?

  schema_objects.filter_map do |schema_object|
    scraper = self.class.scraper_for_schema_object(schema_object)
    next unless scraper

    scraped = scraper.new(schema_object, url:).call
    next unless scraped

    case scraped
    when Array
      scraped.each { |article_hash| yield(article_hash) } # rubocop:disable Style/ExplicitBlockArgument
    else
      yield(scraped)
    end
  end
end