Class: RelatonOmg::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_omg/scraper.rb

Constant Summary collapse

URL_PATTERN =
"https://www.omg.org/spec/".freeze

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(acronym, version = nil, spec = nil) ⇒ Scraper

Returns a new instance of Scraper.



7
8
9
10
11
# File 'lib/relaton_omg/scraper.rb', line 7

# Stores the parts of an OMG reference for later use by the scraper.
#
# @param acronym [String] specification acronym (becomes part of the page URL in #get_doc)
# @param version [String, nil] optional version part of the reference
# @param spec [String, nil] optional trailing part of the reference
def initialize(acronym, version = nil, spec = nil)
  @acronym, @version, @spec = acronym, version, spec
end

Class Method Details

.scrape_page(ref) ⇒ Object



13
14
15
16
17
18
19
20
21
22
# File 'lib/relaton_omg/scraper.rb', line 13

# Parses an "OMG <acronym> [<version>] [<spec>]" reference, fetches the
# matching page and builds a bibliographic item from it.
#
# @param ref [String] reference text, e.g. "OMG AMI4CCM 1.0 beta 1"
# @return [OmgBibliographicItem, nil] nil when the reference does not match
#   the expected pattern, when #get_doc yields no document, or when the
#   page provides no links
def self.scrape_page(ref)
  pattern = %r{^OMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?$}
  parts = pattern.match(ref)
  # The acronym group is mandatory, so a successful match always carries it.
  return unless parts

  scraper = new(parts[:acronym], parts[:version], parts[:spec])
  return if scraper.get_doc.nil?
  return if scraper.fetch_link.empty?

  OmgBibliographicItem.new(**scraper.item)
end

Instance Method Details

#doc_version ⇒ Object



78
79
80
# File 'lib/relaton_omg/scraper.rb', line 78

# Text of the "Version:" entry on the fetched page, memoized once found.
#
# Callers such as #fetch_docid guard with `if doc_version`, so a page
# without a version entry yields nil here instead of raising
# NoMethodError on the missing node.
#
# @return [String, nil] version text, or nil when the page has no such entry
def doc_version
  @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
end

#fetch_abstract ⇒ Object



69
70
71
72
# File 'lib/relaton_omg/scraper.rb', line 69

# Reads the first paragraph of the "document-metadata" section as the abstract.
#
# @return [Array<Hash>] single-element list with :content, :language and :script
def fetch_abstract
  paragraph = @doc.at('//section[@id="document-metadata"]/div/div/p')
  [{ content: paragraph.text, language: "en", script: "Latn" }]
end

#fetch_date ⇒ Object



82
83
84
# File 'lib/relaton_omg/scraper.rb', line 82

# Wraps the publication date in the list-of-hashes shape used by #item.
#
# @return [Array<Hash>] one hash with type "published" and the date as a string
def fetch_date
  published_on = pub_date.to_s
  [{ type: "published", on: published_on }]
end

#fetch_docid ⇒ Object



62
63
64
65
66
67
# File 'lib/relaton_omg/scraper.rb', line 62

# Builds the primary document identifier: "OMG", the acronym, then the
# page version and the spec part when each is present, joined by spaces.
#
# @return [Array<RelatonBib::DocumentIdentifier>] single-element list
def fetch_docid
  # Version and spec are optional; falsy entries are simply left out.
  optional = [doc_version, @spec].select { |part| part }
  text = ["OMG", @acronym, *optional].join(" ")
  [RelatonBib::DocumentIdentifier.new(id: text, type: "OMG", primary: true)]
end

#fetch_id ⇒ Object



51
52
53
# File 'lib/relaton_omg/scraper.rb', line 51

# Concatenates acronym, page version and spec (no separators) into the item id.
#
# @return [String] nil parts contribute an empty string
def fetch_id
  [@acronym, doc_version, @spec].map(&:to_s).join
end

#fetch_keyword ⇒ Object



126
127
128
# File 'lib/relaton_omg/scraper.rb', line 126

# Collects the text of every entry listed under "Categories:" on the page.
#
# @return [Array<String>] one string per matched node, in document order
def fetch_keyword
  nodes = @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  nodes.map { |node| node.text }
end

#fetch_license ⇒ Object



130
131
132
133
134
# File 'lib/relaton_omg/scraper.rb', line 130

# Extracts the values shown next to the "IPR Mode" entry of the page.
#
# From each matched node only the first run of word characters, whitespace
# and hyphens is kept, with surrounding whitespace removed.
#
# @return [Array<String>] one string per matched node ("" when a node's text
#   contains no such run)
def fetch_license
  spans = @doc.xpath('//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span')
  spans.map do |span|
    span.text[/[\w\s-]+/].to_s.strip
  end
end


96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/relaton_omg/scraper.rb', line 96

# Collects the links offered by the fetched page and memoizes the result.
#
# With a spec part, only the anchor whose href equals "<url>/<spec>/PDF" is
# looked up. Otherwise the "This Document:" anchor ("src") and the
# "download-document" anchor ("pdf") are collected when present.
#
# The result is cached in @links; the guard previously tested @link, which
# is never assigned, so every call re-queried the document. The cache is
# assigned only after the list is complete so a failure midway cannot leave
# a partial result behind.
#
# @return [Array<Hash>] hashes with :type ("src" or "pdf") and :content
#   (the href); empty when nothing matched
def fetch_link
  return @links if @links

  links = []
  if @spec
    a = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
    links << { type: "src", content: a[:href] } if a
  else
    a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
    links << { type: "src", content: a[:href] } if a
    pdf = @doc.at('//a[@class="download-document"]')
    links << { type: "pdf", content: pdf[:href] } if pdf
  end
  @links = links
end

#fetch_relation ⇒ Object



112
113
114
115
116
117
118
119
120
121
122
123
124
# File 'lib/relaton_omg/scraper.rb', line 112

# Turns the rows of the page's "History" table into "obsoletes" relations.
#
# Rows whose first cell equals the current document version are skipped.
# For every other row the acronym is taken from the fifth "/"-separated
# segment of the link in the third cell.
#
# @return [Array<Hash>] hashes with type "obsoletes" and a :bibitem holding
#   an OmgBibliographicItem built from "OMG <acronym> <version>"
def fetch_relation
  rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  rows.each_with_object([]) do |row, relations|
    ver = row.at("td").text
    next if ver == doc_version

    acronym = row.at("td[3]/a")[:href].split("/")[4]
    fref = RelatonBib::FormattedRef.new(content: "OMG #{acronym} #{ver}")
    relations << { type: "obsoletes", bibitem: OmgBibliographicItem.new(formattedref: fref) }
  end
end

#fetch_status ⇒ Object



90
91
92
93
94
# File 'lib/relaton_omg/scraper.rb', line 90

# Reads the "Document Status:" entry and uses its first word as the stage.
#
# @return [RelatonBib::DocumentStatus] status whose stage is the first run
#   of word characters in the entry ("" when there is none)
def fetch_status
  node = @doc.at('//dt[.="Document Status:"]/following-sibling::dd')
  first_word = node.text.strip[/\w+/].to_s
  RelatonBib::DocumentStatus.new(stage: first_word)
end

#fetch_title ⇒ Object



55
56
57
58
59
60
# File 'lib/relaton_omg/scraper.rb', line 55

# Builds the main title from the page's "Title:" entry, with ": <spec>"
# appended when a spec part was given.
#
# @return [Array<RelatonBib::TypedTitleString>] single "main" title
def fetch_title
  text = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
  text = "#{text}: #{@spec}" if @spec
  formatted = RelatonBib::FormattedString.new(content: text, language: "en", script: "Latn")
  [RelatonBib::TypedTitleString.new(type: "main", title: formatted)]
end

#fetch_version ⇒ Object



74
75
76
# File 'lib/relaton_omg/scraper.rb', line 74

# Pairs the publication date with the page version.
#
# @return [Array<RelatonBib::BibliographicItem::Version>] single-element list
def fetch_version
  entry = RelatonBib::BibliographicItem::Version.new(pub_date, doc_version)
  [entry]
end

#get_doc ⇒ Object



24
25
26
27
28
29
30
31
32
# File 'lib/relaton_omg/scraper.rb', line 24

# Builds the specification URL, downloads the page and parses it.
#
# The URL is URL_PATTERN + acronym + "/", followed by the version with its
# spaces turned into slashes when a version was given. Both @url and @doc
# are kept for the fetch_* methods.
#
# Each failure type is rescued separately because only OpenURI::HTTPError
# carries an `io` with a status; calling `e.io` on Net::OpenTimeout raised
# NoMethodError instead of the intended request error.
#
# @return [Object, nil] the parsed document, or nil when the URL is invalid
#   or the server answers 404
# @raise [RelatonBib::RequestError] on any other HTTP error or when the
#   connection cannot be opened within the timeout
def get_doc
  @url = "#{URL_PATTERN}#{@acronym}/"
  @url += @version.tr(' ', '/') if @version
  @doc = Nokogiri::HTML(OpenURI.open_uri(@url, open_timeout: 10))
rescue URI::InvalidURIError
  nil
rescue OpenURI::HTTPError => e
  return if e.io.status[0] == "404"

  raise RelatonBib::RequestError, "Unable to access #{@url} (#{e.io.status.join(' ')})"
rescue Net::OpenTimeout => e
  raise RelatonBib::RequestError, "Unable to access #{@url} (#{e.message})"
end

#item ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/relaton_omg/scraper.rb', line 34

# Assembles every scraped attribute into the keyword hash that
# OmgBibliographicItem.new receives in .scrape_page.
#
# @return [Hash{Symbol => Object}] attributes keyed by name; :fetched is
#   today's date as a string
def item
  attrs = {}
  attrs[:id] = fetch_id
  attrs[:fetched] = Date.today.to_s
  attrs[:docid] = fetch_docid
  attrs[:title] = fetch_title
  attrs[:abstract] = fetch_abstract
  attrs[:version] = fetch_version
  attrs[:date] = fetch_date
  attrs[:docstatus] = fetch_status
  attrs[:link] = fetch_link
  attrs[:relation] = fetch_relation
  attrs[:keyword] = fetch_keyword
  attrs[:license] = fetch_license
  attrs
end

#pub_date ⇒ Object



86
87
88
# File 'lib/relaton_omg/scraper.rb', line 86

# Parses the page's "Publication Date:" entry.
#
# @return [Date] date parsed from the entry text after trimming whitespace
def pub_date
  raw = @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text
  Date.parse(raw.strip)
end