Class: Html2rss::AutoSource::Scraper::Schema::Thing
- Inherits:
-
Object
- Object
- Html2rss::AutoSource::Scraper::Schema::Thing
- Defined in:
- lib/html2rss/auto_source/scraper/schema/thing.rb
Overview
A Thing serves as the base class for Schema.org schema objects.
Constant Summary collapse
- SUPPORTED_TYPES =
%w[ AdvertiserContentArticle AnalysisNewsArticle APIReference Article AskPublicNewsArticle BackgroundNewsArticle BlogPosting DiscussionForumPosting LiveBlogPosting NewsArticle OpinionNewsArticle Report ReportageNewsArticle ReviewNewsArticle SatiricalArticle ScholarlyArticle SocialMediaPosting TechArticle ].to_set.freeze
- DEFAULT_ATTRIBUTES =
%i[id title description url image published_at categories].freeze
Instance Attribute Summary collapse
-
#base_url ⇒ Object
readonly
Returns the value of attribute base_url.
-
#schema_object ⇒ Object
readonly
Returns the value of attribute schema_object.
Instance Method Summary collapse
-
#call ⇒ Hash
The scraped article hash with DEFAULT_ATTRIBUTES.
- #categories ⇒ Object
- #description ⇒ Object
- #id ⇒ Object
- #image ⇒ Object
- #image_urls ⇒ Object
-
#initialize(schema_object, url:) ⇒ Thing
constructor
A new instance of Thing.
- #normalized_base_url(url) ⇒ Object
- #normalized_id(value, reference_url:) ⇒ Object
- #normalized_id_url(text, reference_url:) ⇒ Object
- #normalized_id_value(url) ⇒ Object
- #published_at ⇒ Object
- #title ⇒ Object
-
#url ⇒ Html2rss::Url?
The URL of the schema object.
Constructor Details
#initialize(schema_object, url:) ⇒ Thing
Returns a new instance of Thing.
37 38 39 40 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 37
#
# Wraps a schema object together with the URL used to resolve its links.
#
# @param schema_object [Object] the schema data; the other methods read it
#   through `[]` and `values_at`
# @param url [Object] the page URL; stored as `base_url` after being passed
#   through `normalized_base_url` (which may yield nil)
def initialize(schema_object, url:)
  @base_url = normalized_base_url(url)
  @schema_object = schema_object
end
Instance Attribute Details
#base_url ⇒ Object (readonly)
Returns the value of attribute base_url.
91 92 93 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 91
#
# Read-only accessor for the base URL set in the constructor.
#
# @return [Object] the value of @base_url
def base_url = @base_url
#schema_object ⇒ Object (readonly)
Returns the value of attribute schema_object.
91 92 93 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 91
#
# Read-only accessor for the schema object given to the constructor.
#
# @return [Object] the value of @schema_object
def schema_object = @schema_object
Instance Method Details
#call ⇒ Hash
Returns the scraped article hash with DEFAULT_ATTRIBUTES.
43 44 45 46 47 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 43
#
# Collects every attribute listed in DEFAULT_ATTRIBUTES by invoking the
# public method of the same name on this object.
#
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES as keys,
#   in the order they appear in that constant
def call
  DEFAULT_ATTRIBUTES.each_with_object({}) do |name, article|
    article[name] = public_send(name)
  end
end
#categories ⇒ Object
85 86 87 88 89 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 85
#
# Returns whatever CategoryExtractor yields for the schema object.
# The result is memoized with `defined?`, so a nil or false result is
# cached as well and the extractor runs at most once.
#
# @return [Object] the result of CategoryExtractor.call(schema_object)
def categories
  @categories = CategoryExtractor.call(schema_object) unless defined?(@categories)
  @categories
end
#description ⇒ Object
61 62 63 64 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 61
#
# Picks the longest textual candidate among the description-like properties
# of the schema object.
#
# `:articleBody` is the Schema.org property name for an article's body and
# matches the camelCase keys used elsewhere in this class (`:datePublished`,
# `:thumbnailUrl`). `:schema_object_body` is still consulted so that any
# input relying on the previous key keeps working.
#
# @return [Object, nil] the candidate whose `to_s` is longest; on a tie the
#   earliest listed key wins; nil when none of the keys is present
def description
  schema_object
    .values_at(:description, :schema_object_body, :articleBody, :abstract)
    .max_by { |text| text.to_s.size }
end
#id ⇒ Object
49 50 51 52 53 54 55 56 57 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 49
#
# Derives an identifier for the schema object.
#
# The `@id` property is tried first, normalized against `url` (or
# `base_url` when `url` is nil). If that yields nothing, the path of `url`
# is used instead. Only a non-empty result is memoized; an empty result
# returns nil and is recomputed on the next call.
#
# @return [String, nil] the identifier, or nil when none could be derived
def id
  return @id if defined?(@id)

  candidate = normalized_id(schema_object[:@id], reference_url: url || base_url)
  candidate ||= url&.path.to_s
  return if candidate.empty?

  @id = candidate
end
#image ⇒ Object
77 78 79 80 81 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 77
#
# Resolves the first entry of `image_urls` via Url.from_relative, using
# `base_url` as the reference (or the image URL itself when there is no
# base URL).
#
# @return [Object, nil] the result of Url.from_relative, or nil when
#   `image_urls` is empty
def image
  first_image = image_urls.first
  return unless first_image

  Url.from_relative(first_image, base_url || first_image)
end
#image_urls ⇒ Object
93 94 95 96 97 98 99 100 101 102 103 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 93
#
# Extracts raw image URL strings from the `:image` and `:thumbnailUrl`
# properties of the schema object.
#
# Each property may hold a String, an ImageObject Hash, or an Array of
# either; arrays are flattened so every entry is inspected. Blank strings
# are skipped so that `image` never resolves an empty reference. Hashes
# whose `:@type` is not 'ImageObject', and any other value types, are
# ignored.
#
# @return [Array] the URL values found, `:image` entries first
def image_urls
  schema_object.values_at(:image, :thumbnailUrl).flatten.filter_map do |candidate|
    case candidate
    when String
      candidate unless candidate.strip.empty?
    when Hash
      next unless candidate[:@type] == 'ImageObject'

      candidate[:url] || candidate[:contentUrl]
    end
  end
end
#normalized_base_url(url) ⇒ Object
133 134 135 136 137 138 139 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 133
#
# Converts the given URL into a base URL via Url.from_absolute.
#
# @param url [Object] the URL to normalize; nil or blank input is accepted
# @return [Object, nil] the result of Url.from_absolute, or nil when the
#   input is blank or an ArgumentError is raised
def normalized_base_url(url)
  blank = url.to_s.strip.empty?
  blank ? nil : Url.from_absolute(url)
rescue ArgumentError
  nil
end
#normalized_id(value, reference_url:) ⇒ Object
105 106 107 108 109 110 111 112 113 114 115 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 105
#
# Normalizes a raw `@id` value.
#
# When the value resolves to a URL on the same host as `reference_url`, it
# is reduced to its path/query form by `normalized_id_value`. In every
# other case (no reference URL, different host, or an ArgumentError raised
# along the way) the stringified value is returned unchanged.
#
# @param value [Object] the raw identifier; converted with `to_s`
# @param reference_url [Object, nil] URL whose host is compared against
# @return [String, nil] the normalized id, or nil when the value is empty
def normalized_id(value, reference_url:)
  text = value.to_s
  return if text.empty?

  candidate = normalized_id_url(text, reference_url: reference_url)
  same_host = reference_url && candidate.host == reference_url.host
  same_host ? normalized_id_value(candidate) : text
rescue ArgumentError
  text
end
#normalized_id_url(text, reference_url:) ⇒ Object
117 118 119 120 121 122 123 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 117
#
# Turns an identifier string into a URL object.
#
# Text starting with '/' is resolved with Url.from_relative against
# `reference_url` (or against the text itself when there is no reference);
# anything else is handed to Url.from_absolute.
#
# @param text [String] the identifier text
# @param reference_url [Object, nil] reference for root-relative text
# @return [Object] the result of the Url factory method used
def normalized_id_url(text, reference_url:)
  return Url.from_absolute(text) unless text.start_with?('/')

  Url.from_relative(text, reference_url || text)
end
#normalized_id_value(url) ⇒ Object
125 126 127 128 129 130 131 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 125
#
# Reduces a URL to the string used as an identifier.
#
# - path is empty or '/' and a query is present: "<path>?<query>"
# - otherwise, a non-empty path: the path alone (any query is dropped)
# - otherwise: whatever `url.query` returns (nil or an empty string)
#
# @param url [#path, #query] the URL to reduce
# @return [String, nil] the identifier value
def normalized_id_value(url)
  path = url.path.to_s
  root_or_blank = path.empty? || path == '/'

  if root_or_blank && !url.query.to_s.empty?
    "#{path}?#{url.query}"
  elsif path.empty?
    url.query
  else
    path
  end
end
#published_at ⇒ Object
83 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 83
#
# Reads the `:datePublished` property of the schema object as-is; no
# parsing or conversion is applied here.
#
# @return [Object, nil] the raw value stored under :datePublished
def published_at
  schema_object[:datePublished]
end
#title ⇒ Object
59 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 59
#
# Reads the `:title` property of the schema object as-is.
#
# @return [Object, nil] the raw value stored under :title
def title
  schema_object[:title]
end
#url ⇒ Html2rss::Url?
Returns the URL of the schema object.
67 68 69 70 71 72 73 74 75 |
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 67
#
# Resolves the `:url` property of the schema object via Url.from_relative,
# using `base_url` as the reference (or the raw URL itself when there is
# no base URL). When the property is missing or empty a debug message is
# logged and nil is returned.
#
# @return [Html2rss::Url, nil] the URL of the schema object
def url
  raw_url = schema_object[:url]
  return Url.from_relative(raw_url, base_url || raw_url) unless raw_url.to_s.empty?

  Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
  nil
end