Class: Html2rss::AutoSource::Scraper::Schema::Thing

Inherits:
Object
  • Object
show all
Defined in:
lib/html2rss/auto_source/scraper/schema/thing.rb

Overview

A Thing serves as the base class for Schema.org schema objects.

Direct Known Subclasses

ItemList, ListItem

Constant Summary collapse

# Frozen Set of type-name strings.
# NOTE(review): judging by the constant name these are the Schema.org types
# this scraper handles — confirm against the callers.
SUPPORTED_TYPES = Set.new(
  %w[
    AdvertiserContentArticle
    AnalysisNewsArticle
    APIReference
    Article
    AskPublicNewsArticle
    BackgroundNewsArticle
    BlogPosting
    DiscussionForumPosting
    LiveBlogPosting
    NewsArticle
    OpinionNewsArticle
    Report
    ReportageNewsArticle
    ReviewNewsArticle
    SatiricalArticle
    ScholarlyArticle
    SocialMediaPosting
    TechArticle
  ]
).freeze

# Attribute names, in order, that #call sends to the instance to build its hash.
DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(schema_object, url:) ⇒ Thing

Returns a new instance of Thing.



37
38
39
40
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 37

# Stores the schema object as given and the page URL after normalization.
#
# @param schema_object [Hash] schema data; read with Symbol keys by the other methods of this class
# @param url [Object] page URL, passed through #normalized_base_url before being stored
def initialize(schema_object, url:)
  @schema_object, @base_url = schema_object, normalized_base_url(url)
end

Instance Attribute Details

#base_url ⇒ Object (readonly)

Returns the value of attribute base_url.



91
92
93
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 91

# @return [Object, nil] the value stored in @base_url by the constructor
def base_url = @base_url

#schema_object ⇒ Object (readonly)

Returns the value of attribute schema_object.



91
92
93
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 91

# @return [Object, nil] the value stored in @schema_object by the constructor
def schema_object = @schema_object

Instance Method Details

#call ⇒ Hash

Returns the scraped article hash with DEFAULT_ATTRIBUTES.

Returns:

  • (Hash)

    the scraped article hash with DEFAULT_ATTRIBUTES



43
44
45
46
47
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 43

# Builds the article hash by sending every name in DEFAULT_ATTRIBUTES to self.
#
# @return [Hash{Symbol => Object}] attribute name => value returned by the public method of that name
def call
  DEFAULT_ATTRIBUTES.each_with_object({}) do |name, article|
    article[name] = public_send(name)
  end
end

#categories ⇒ Object



85
86
87
88
89
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 85

# Memoized result of CategoryExtractor.call for the schema object.
# The `defined?` check means a nil/false result is cached as well.
#
# @return [Object] whatever CategoryExtractor.call returned on the first invocation
def categories
  @categories = CategoryExtractor.call(schema_object) unless defined?(@categories)
  @categories
end

#description ⇒ Object



61
62
63
64
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 61

# Picks the longest text among the schema object's descriptive fields.
#
# `articleBody` is the Schema.org property name for an article's body text.
# The `:schema_object_body` key is kept so existing inputs still resolve the
# same way. Length is measured on `to_s`, so missing (nil) fields count as 0;
# on ties the earliest field in the list wins.
#
# @return [Object, nil] the longest candidate value, or nil when none is present
def description
  schema_object.values_at(:description, :articleBody, :schema_object_body, :abstract)
               .max_by { |text| text.to_s.size }
end

#id ⇒ Object



49
50
51
52
53
54
55
56
57
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 49

# Memoized identifier for the schema object.
#
# Uses #normalized_id on the `:@id` field (with #url, or #base_url when there
# is no url, as the reference) and falls back to the path of #url.
# The result is cached even when it is nil, and #url is evaluated only once
# per computation, so its "no url" debug log is not repeated.
#
# @return [String, nil] the identifier, or nil when none could be derived
def id
  return @id if defined?(@id)

  item_url = url
  candidate = normalized_id(schema_object[:@id], reference_url: item_url || base_url) || item_url&.path.to_s

  # An empty candidate means "no id"; store nil so the lookup is not repeated.
  @id = candidate.empty? ? nil : candidate
end

#image ⇒ Object



77
78
79
80
81
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 77

# Resolves the first entry of #image_urls via Url.from_relative.
# When there is no base URL, the image URL itself is passed as the base.
#
# @return [Object, nil] the result of Url.from_relative, or nil when #image_urls is empty
def image
  first_image = image_urls.first
  return unless first_image

  Url.from_relative(first_image, base_url || first_image)
end

#image_urls ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 93

# Collects candidate image URLs from the `:image` and `:thumbnailUrl` fields.
#
# Each field may hold a String, a Hash whose `:@type` is 'ImageObject', or an
# Array of those. Arrays are flattened so list-valued fields are not silently
# dropped. Any other value, and ImageObjects with neither `:url` nor
# `:contentUrl`, are skipped.
#
# @return [Array] the collected URL values, `:image` entries first
def image_urls
  schema_object.values_at(:image, :thumbnailUrl).flatten.filter_map do |candidate|
    case candidate
    when String
      candidate
    when Hash
      candidate[:url] || candidate[:contentUrl] if candidate[:@type] == 'ImageObject'
    end
  end
end

#normalized_base_url(url) ⇒ Object



133
134
135
136
137
138
139
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 133

# Converts the given URL with Url.from_absolute.
#
# @param url [Object] candidate URL; blank values (after `to_s.strip`) are treated as absent
# @return [Object, nil] the result of Url.from_absolute, or nil when the input
#   is blank or Url.from_absolute raises ArgumentError
def normalized_base_url(url)
  blank = url.to_s.strip.empty?
  Url.from_absolute(url) unless blank
rescue ArgumentError
  nil
end

#normalized_id(value, reference_url:) ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 105

# Reduces an id to the value from #normalized_id_value when it parses to a URL
# on the same host as the reference URL. Otherwise the id text is returned unchanged.
#
# @param value [Object] raw id; converted with `to_s`
# @param reference_url [Object, nil] object responding to `host`, used for the host comparison
# @return [String, nil] nil for an empty id; the original text when there is no
#   reference URL, the hosts differ, or an ArgumentError is raised while parsing
def normalized_id(value, reference_url:)
  raw_id = value.to_s
  return if raw_id.empty?

  candidate = normalized_id_url(raw_id, reference_url: reference_url)
  same_host = reference_url && candidate.host == reference_url.host

  same_host ? normalized_id_value(candidate) : raw_id
rescue ArgumentError
  raw_id
end

#normalized_id_url(text, reference_url:) ⇒ Object



117
118
119
120
121
122
123
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 117

# Parses id text into a URL object.
# Text starting with '/' goes through Url.from_relative, using the text itself
# as the base when no reference URL is given. Everything else goes through
# Url.from_absolute.
#
# @param text [String] the id text
# @param reference_url [Object, nil] base for relative ids
# @return [Object] the result of Url.from_relative or Url.from_absolute
def normalized_id_url(text, reference_url:)
  return Url.from_absolute(text) unless text.start_with?('/')

  Url.from_relative(text, reference_url || text)
end

#normalized_id_value(url) ⇒ Object



125
126
127
128
129
130
131
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 125

# Derives the id string from a URL's path and query.
#
# * path is empty or '/' and the query is non-empty -> "<path>?<query>"
# * any other non-empty path                        -> the path alone
# * empty path and no usable query                  -> the query as-is (nil or '')
#
# @param url [Object] object responding to `path` and `query`
# @return [String, nil]
def normalized_id_value(url)
  path = url.path.to_s
  root_like = path.empty? || path == '/'

  if root_like && !url.query.to_s.empty?
    "#{path}?#{url.query}"
  elsif path.empty?
    url.query
  else
    path
  end
end

#published_at ⇒ Object



83
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 83

# @return [Object, nil] the value stored under `:datePublished` in the schema object
def published_at
  schema_object[:datePublished]
end

#title ⇒ Object



59
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 59

# @return [Object, nil] the value stored under `:title` in the schema object
def title
  schema_object[:title]
end

#url ⇒ Html2rss::Url?

Returns the URL of the schema object.

Returns:

  • (Html2rss::Url, nil)

    the URL of the schema object, or nil when the schema object has no url



67
68
69
70
71
72
73
74
75
# File 'lib/html2rss/auto_source/scraper/schema/thing.rb', line 67

# Resolves the schema object's `:url` field via Url.from_relative.
# When there is no base URL, the raw value itself is passed as the base.
# A missing or empty field is reported through Log.debug.
#
# @return [Object, nil] the result of Url.from_relative, or nil when no url is present
def url
  raw_url = schema_object[:url]
  return Url.from_relative(raw_url, base_url || raw_url) unless raw_url.to_s.empty?

  Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
  nil
end