Class: Html2rss::AutoSource::Scraper::Schema

Inherits:
Object
  • Object
show all
Includes:
Enumerable
Defined in:
lib/html2rss/auto_source/scraper/schema.rb,
lib/html2rss/auto_source/scraper/schema/thing.rb,
lib/html2rss/auto_source/scraper/schema/item_list.rb,
lib/html2rss/auto_source/scraper/schema/list_item.rb
more...

Overview

Defined Under Namespace

Classes: ItemList, ListItem, Thing

Constant Summary collapse

TAG_SELECTOR =
'script[type="application/ld+json"]'

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(parsed_body, url:) ⇒ Schema

Returns a new instance of Schema.

[View source]

81
82
83
84
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 81

# Stores the inputs for later scraping; no work is done at construction time.
#
# @param parsed_body [Object] the parsed document (other methods of this class call `#css` on it)
# @param url [Object] the page URL, later forwarded to the schema scrapers as `url:`
def initialize(parsed_body, url:)
  @url = url
  @parsed_body = parsed_body
end

Class Method Details

.articles?(parsed_body) ⇒ Boolean

Returns:

  • (Boolean)
[View source]

23
24
25
26
27
28
29
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 23

# Checks whether any JSON-LD script tag in the document mentions a supported
# schema `@type`.
#
# The check is a textual match on the raw script content (no JSON parsing),
# so it is a cheap pre-filter.
#
# @param parsed_body [#css] document responding to `#css(selector)`
# @return [Boolean] true when at least one script tag's text contains
#   `"@type": "<type>"` for a type supported by Thing or ItemList
def articles?(parsed_body)
  # Loop-invariant: build the union of supported types once, not per script tag.
  supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES

  parsed_body.css(TAG_SELECTOR).any? do |script|
    # Read the tag's text once instead of once per candidate type.
    json_text = script.text

    supported_types.any? do |type|
      json_text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
    end
  end
end

.from(object) ⇒ Array<Hash>

Returns a flat array of all supported schema objects by recursively traversing the given `object`.

:reek:DuplicateMethodCall

Parameters:

  • object (Hash, Array, Nokogiri::XML::Element)

Returns:

  • (Array<Hash>)

    the schema_objects, or an empty array

[View source]

39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 39

# Recursively collects every supported schema object reachable from `object`
# into one flat array.
#
# - Arrays are traversed element by element.
# - A Hash that is itself a supported schema object is returned as-is (its
#   children are not traversed); any other Hash has its values traversed.
# - A Nokogiri element is first converted with `parse_script_tag` and the
#   result is traversed.
# - Anything else contributes nothing.
#
# @param object [Hash, Array, Nokogiri::XML::Element]
# @return [Array<Hash>] the supported schema objects, or an empty array
def from(object)
  case object
  when Array
    object.flat_map { from(_1) }
  when Hash
    return [object] if supported_schema_object?(object)

    object.values.flat_map { from(_1) }
  when Nokogiri::XML::Element
    from(parse_script_tag(object))
  else
    []
  end
end

.scraper_for_schema_object(schema_object) ⇒ Scraper::Schema::Thing, ...

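Returns the scraper class (`Thing` or `ItemList`) that supports the schema object's `@type`; instances of the returned class respond to `#call`. Returns `nil` (after logging a warning) when the `@type` is unsupported.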

Returns:

[View source]

58
59
60
61
62
63
64
65
66
67
68
69
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 58

# Picks the scraper class responsible for a schema object, based on its
# `:@type` entry.
#
# @param schema_object [Hash] schema object, looked up with the `:@type` key
# @return [Class, nil] `Thing` or `ItemList` when the type is listed in the
#   respective `SUPPORTED_TYPES`; otherwise logs a warning and returns nil
def scraper_for_schema_object(schema_object)
  schema_type = schema_object[:@type]

  # Thing is checked first, so it wins if a type were listed in both sets.
  return Thing if Thing::SUPPORTED_TYPES.member?(schema_type)
  return ItemList if ItemList::SUPPORTED_TYPES.member?(schema_type)

  Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_type}")
  nil
end

.supported_schema_object?(object) ⇒ Boolean

Returns:

  • (Boolean)
[View source]

52
53
54
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 52

# Tells whether a scraper exists for the given schema object.
#
# Delegates to `scraper_for_schema_object` and coerces its result to a
# strict boolean.
#
# @param object [Hash] schema object candidate
# @return [Boolean] true when `scraper_for_schema_object` returns a truthy value
def supported_schema_object?(object)
  !!scraper_for_schema_object(object)
end

Instance Method Details

#each {|Hash| ... } ⇒ Array<Hash>

Returns the scraped article_hashes.

Yields:

  • (Hash)

    Each scraped article_hash

Returns:

  • (Array<Hash>)

    the scraped article_hashes

[View source]

89
90
91
92
93
94
95
96
97
98
99
100
101
102
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 89

# Yields every article hash scraped from the schema objects.
#
# For each schema object the matching scraper class is instantiated with the
# object and `url:` and then called. Schema objects without a scraper, and
# scrapers returning a falsy result, are skipped. An Array result is yielded
# element by element; any other result is yielded as a single item.
#
# @yield [Hash] each scraped article_hash
# @return [Enumerator] when no block is given
# @return [Array] otherwise, the `filter_map` result: per schema object either
#   the scraper's Array result or the block's return value (falsy entries
#   dropped). NOTE(review): this is not a flat Array<Hash>; callers should
#   rely on the yielded values rather than the return value.
def each(&)
  return to_enum(:each) unless block_given?

  schema_objects.filter_map do |candidate|
    scraper = self.class.scraper_for_schema_object(candidate)
    next unless scraper

    scraped = scraper.new(candidate, url:).call
    next unless scraped

    case scraped
    when Array
      scraped.each { |article| yield(article) } # rubocop:disable Style/ExplicitBlockArgument
    else
      yield(scraped)
    end
  end
end