Class: Html2rss::AutoSource::Scraper::Schema
- Inherits:
- Object
  - Object
  - Html2rss::AutoSource::Scraper::Schema
- Includes:
- Enumerable
- Defined in:
- lib/html2rss/auto_source/scraper/schema.rb,
lib/html2rss/auto_source/scraper/schema/thing.rb,
lib/html2rss/auto_source/scraper/schema/item_list.rb,
lib/html2rss/auto_source/scraper/schema/list_item.rb,
lib/html2rss/auto_source/scraper/schema/category_extractor.rb
Overview
Constructor Details
#initialize(parsed_body, url:, **opts) ⇒ Schema
Returns a new instance of Schema.
84 85 86 87 88 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 84 def initialize(parsed_body, url:, **opts) @parsed_body = parsed_body @url = url @opts = opts end |
Class Method Details
.articles?(parsed_body) ⇒ Boolean
25 26 27 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 25 def articles?(parsed_body) parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) } end |
.from(object) ⇒ Array<Hash>
Returns a flat array of all supported schema objects by recursively traversing the given object.
:reek:DuplicateMethodCall
42 43 44 45 46 47 48 49 50 51 52 53 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 42 def from(object) case object when Nokogiri::XML::Element from(parse_script_tag(object)) when Hash supported_schema_object?(object) ? [object] : object.values.flat_map { |item| from(item) } when Array object.flat_map { |item| from(item) } else [] end end |
.options_key ⇒ Object
22 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 22 def self.options_key = :schema |
.scraper_for_schema_object(schema_object) ⇒ Scraper::Schema::Thing, ...
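Returns the scraper class (Thing or ItemList) matching the schema object's @type; instances of that class respond to #call. Returns nil, after logging a debug message, when the @type is unsupported.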
61 62 63 64 65 66 67 68 69 70 71 72 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 61 def scraper_for_schema_object(schema_object) type = schema_object[:@type] if Thing::SUPPORTED_TYPES.member?(type) Thing elsif ItemList::SUPPORTED_TYPES.member?(type) ItemList else Log.debug("#{name}: unsupported schema object @type=#{type.inspect}") nil end end |
.supported_schema_object?(object) ⇒ Boolean
55 56 57 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 55 def supported_schema_object?(object) scraper_for_schema_object(object) ? true : false end |
.supported_schema_type?(script) ⇒ Boolean
29 30 31 32 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 29 def supported_schema_type?(script) supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) } end |
Instance Method Details
#each {|Hash| ... } ⇒ Array<Hash>
Returns the scraped article_hashes. When called without a block, returns an Enumerator (via enum_for(:each)) instead.
93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/html2rss/auto_source/scraper/schema.rb', line 93 def each(&) return enum_for(:each) unless block_given? schema_objects.filter_map do |schema_object| next unless (klass = self.class.scraper_for_schema_object(schema_object)) next unless (results = klass.new(schema_object, url:).call) if results.is_a?(Array) results.each { |result| yield(result) } # rubocop:disable Style/ExplicitBlockArgument else yield(results) end end end |