Class: RelatonOmg::Scraper

Inherits: Object

Inheritance chain: Object → RelatonOmg::Scraper

Defined in: lib/relaton_omg/scraper.rb

Constant Summary collapse

# Frozen URL prefix; get_doc appends the acronym (and version) to it.
URL_PATTERN = "https://www.omg.org/spec/".freeze

Class Method Summary collapse

.scrape_page(ref) ⇒ Object

Instance Method Summary collapse

Constructor Details

#initialize(acronym, version = nil, spec = nil) ⇒ `Scraper`

Returns a new instance of Scraper.

# File 'lib/relaton_omg/scraper.rb', line 7

# Remembers the three parts of a reference for later use by the fetchers.
#
# @param acronym [Object] stored in @acronym (scrape_page passes a regexp capture)
# @param version [Object, nil] stored in @version
# @param spec [Object, nil] stored in @spec
def initialize(acronym, version = nil, spec = nil)
  @acronym, @version, @spec = acronym, version, spec
end

Class Method Details

.scrape_page(ref) ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 13

# Parses an "OMG <acronym> [<version>] [<spec>]" reference, loads the
# matching page and builds a bibliographic item from it.
#
# @param ref [String] reference text to parse
# @return [OmgBibliographicItem, nil] nil when the reference does not match,
#   when get_doc returns nil, or when fetch_link finds no links
def self.scrape_page(ref)
  # NOTE(review): ^ and $ anchor per line, not per string — confirm that
  # multi-line references never reach this method.
  parsed = %r{^OMG (?<acronym>[^\s]+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?$}.match(ref)
  return unless parsed

  scraper = new(parsed[:acronym], parsed[:version], parsed[:spec])
  return if scraper.get_doc.nil?
  return if scraper.fetch_link.empty?

  OmgBibliographicItem.new(**scraper.item)
end

Instance Method Details

#doc_version ⇒ `Object`



78
79
80

# File 'lib/relaton_omg/scraper.rb', line 78

# Text of the span under the page's "Version:" entry, memoized in
# @doc_version once a value has been found.
#
# @return [String, nil] nil when the page has no such node; fetch_docid
#   already guards with `if doc_version`, so a missing node must not raise
def doc_version
  @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
end

#fetch_abstract ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 69

# Wraps the text of the first paragraph in the "document-metadata" section
# in a one-element list of abstract hashes.
#
# @return [Array<Hash>] one hash with :content, :language ("en") and :script ("Latn")
def fetch_abstract
  paragraph = @doc.at('//section[@id="document-metadata"]/div/div/p')
  [{ content: paragraph.text, language: "en", script: "Latn" }]
end

#fetch_date ⇒ `Object`



82
83
84

# File 'lib/relaton_omg/scraper.rb', line 82

# Publication date as a one-element list of date hashes.
#
# @return [Array<Hash>] one hash with type "published" and :on set to pub_date.to_s
def fetch_date
  published_on = pub_date.to_s
  [{ type: "published", on: published_on }]
end

#fetch_docid ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 62

# Builds the primary "OMG" document identifier: "OMG", the acronym, then the
# document version and the spec component when each is present.
#
# @return [Array<RelatonBib::DocumentIdentifier>] a single primary identifier
def fetch_docid
  # "OMG" and the acronym are always joined; only the trailing parts are optional.
  optional_parts = [doc_version, @spec].select { |part| part }
  docid = ["OMG", @acronym, *optional_parts].join(" ")
  [RelatonBib::DocumentIdentifier.new(id: docid, type: "OMG", primary: true)]
end

#fetch_id ⇒ `Object`



51
52
53

# File 'lib/relaton_omg/scraper.rb', line 51

# Concatenates the acronym, document version and spec with no separator.
#
# @return [String] the combined id; nil parts contribute an empty string
def fetch_id
  [@acronym, doc_version, @spec].map(&:to_s).join
end

#fetch_keyword ⇒ `Object`



126
127
128

# File 'lib/relaton_omg/scraper.rb', line 126

# Collects the text of each emphasised link listed under "Categories:".
#
# @return [Array<String>] one entry per matching node, in document order
def fetch_keyword
  category_nodes = @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  category_nodes.map(&:text)
end

#fetch_license ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 130

# Reads the values listed next to the "IPR Mode" entry of the page.
#
# @return [Array<String>] for each matching span, the first run of word
#   characters, whitespace and hyphens in its text, stripped ("" if none)
def fetch_license
  ipr_nodes = @doc.xpath('//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span')
  ipr_nodes.map { |node| node.text[/[\w\s-]+/].to_s.strip }
end

#fetch_link ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 96

# Collects the document links found on the loaded page and memoizes them.
#
# With a spec component, only the anchor whose href is "<url>/<spec>/PDF" is
# used (as a "src" link). Otherwise the "This Document:" anchor becomes the
# "src" link and the "download-document" anchor the "pdf" link.
#
# @return [Array<Hash>] hashes with :type and :content; empty when no anchor matched
def fetch_link
  # The guard must read the variable that stores the result (@links): the
  # former check on @link never hit, so every call re-queried the page.
  return @links if @links

  found = []
  if @spec
    spec_anchor = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
    found << { type: "src", content: spec_anchor[:href] } if spec_anchor
  else
    src_anchor = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
    found << { type: "src", content: src_anchor[:href] } if src_anchor
    pdf_anchor = @doc.at('//a[@class="download-document"]')
    found << { type: "pdf", content: pdf_anchor[:href] } if pdf_anchor
  end
  @links = found
end

#fetch_relation ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 112

# Turns the rows of the page's "History" table into "obsoletes" relations,
# skipping the row whose first cell equals the current document version.
#
# @return [Array<Hash>] hashes with type "obsoletes" and a :bibitem that
#   carries only a formatted reference "OMG <acronym> <version>"
def fetch_relation
  history_rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  history_rows.each_with_object([]) do |row, relations|
    row_version = row.at("td").text
    next if row_version == doc_version

    # The acronym is the fifth "/"-separated piece of the link in the third cell.
    row_acronym = row.at("td[3]/a")[:href].split("/")[4]
    formatted = RelatonBib::FormattedRef.new content: "OMG #{row_acronym} #{row_version}"
    relations << { type: "obsoletes", bibitem: OmgBibliographicItem.new(formattedref: formatted) }
  end
end

#fetch_status ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 90

# Builds the document status from the page's "Document Status:" entry.
#
# @return [RelatonBib::DocumentStatus] its stage is the first run of word
#   characters in the entry's text ("" when there is none)
def fetch_status
  status_text = @doc.at('//dt[.="Document Status:"]/following-sibling::dd').text
  RelatonBib::DocumentStatus.new(stage: status_text.strip[/\w+/].to_s)
end

#fetch_title ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 55

# Builds the main title from the page's "Title:" entry, with ": <spec>"
# appended when a spec component is set.
#
# @return [Array<RelatonBib::TypedTitleString>] a single title of type "main"
def fetch_title
  page_title = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
  full_title = @spec ? page_title + ": #{@spec}" : page_title
  formatted = RelatonBib::FormattedString.new(content: full_title, language: "en", script: "Latn")
  [RelatonBib::TypedTitleString.new(type: "main", title: formatted)]
end

#fetch_version ⇒ `Object`



74
75
76

# File 'lib/relaton_omg/scraper.rb', line 74

# Wraps the publication date and the document version in a Version object.
#
# @return [Array<RelatonBib::BibliographicItem::Version>] one element, built
#   from pub_date and doc_version in that order
def fetch_version
  published = pub_date
  version = doc_version
  [RelatonBib::BibliographicItem::Version.new(published, version)]
end

#get_doc ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 24

# Builds the page URL from the acronym and optional version (spaces in the
# version become "/"), downloads it and parses it into @doc.
#
# @return [Object, nil] the parsed document; nil when the URL is invalid or
#   the server answers 404
# @raise [RelatonBib::RequestError] on any other HTTP error and on a
#   connection-open timeout
def get_doc
  @url = "#{URL_PATTERN}#{@acronym}/"
  @url += @version.tr(' ', '/') if @version
  @doc = Nokogiri::HTML(OpenURI.open_uri(@url, open_timeout: 10))
rescue URI::InvalidURIError
  nil
rescue OpenURI::HTTPError => e
  return if e.io.status[0] == "404"

  raise RelatonBib::RequestError, "Unable acces #{@url} (#{e.io.status.join(' ')})"
rescue Net::OpenTimeout => e
  # Net::OpenTimeout has no #io, so it cannot share the HTTPError branch:
  # doing so raised NoMethodError instead of the intended RequestError.
  # The message prefix is kept verbatim so both failures read the same.
  raise RelatonBib::RequestError, "Unable acces #{@url} (#{e.message})"
end

#item ⇒ `Object`

# File 'lib/relaton_omg/scraper.rb', line 34

# Gathers every scraped attribute into one hash; scrape_page passes it as
# keyword arguments to OmgBibliographicItem.new.
#
# @return [Hash] keys, in order: :id, :fetched, :docid, :title, :abstract,
#   :version, :date, :docstatus, :link, :relation, :keyword, :license
def item
  # The groups are built and merged in the original key order, so the fetch
  # methods still run in the same sequence.
  identity = { id: fetch_id, fetched: Date.today.to_s, docid: fetch_docid }
  description = {
    title: fetch_title,
    abstract: fetch_abstract,
    version: fetch_version,
    date: fetch_date,
    docstatus: fetch_status,
  }
  references = {
    link: fetch_link,
    relation: fetch_relation,
    keyword: fetch_keyword,
    license: fetch_license,
  }
  identity.merge(description).merge(references)
end

#pub_date ⇒ `Object`



86
87
88

# File 'lib/relaton_omg/scraper.rb', line 86

# Parses the page's "Publication Date:" entry.
#
# @return [Date] result of Date.parse on the stripped entry text
def pub_date
  raw_date = @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text
  Date.parse(raw_date.strip)
end

Class: RelatonOmg::Scraper

Constant Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(acronym, version = nil, spec = nil) ⇒ Scraper

Class Method Details

.scrape_page(ref) ⇒ Object

Instance Method Details

#doc_version ⇒ Object

#fetch_abstract ⇒ Object

#fetch_date ⇒ Object

#fetch_docid ⇒ Object

#fetch_id ⇒ Object

#fetch_keyword ⇒ Object

#fetch_license ⇒ Object

#fetch_link ⇒ Object

#fetch_relation ⇒ Object

#fetch_status ⇒ Object

#fetch_title ⇒ Object

#fetch_version ⇒ Object

#get_doc ⇒ Object

#item ⇒ Object

#pub_date ⇒ Object

#initialize(acronym, version = nil, spec = nil) ⇒ `Scraper`

.scrape_page(ref) ⇒ `Object`

#doc_version ⇒ `Object`

#fetch_abstract ⇒ `Object`

#fetch_date ⇒ `Object`

#fetch_docid ⇒ `Object`

#fetch_id ⇒ `Object`

#fetch_keyword ⇒ `Object`

#fetch_license ⇒ `Object`

#fetch_link ⇒ `Object`

#fetch_relation ⇒ `Object`

#fetch_status ⇒ `Object`

#fetch_title ⇒ `Object`

#fetch_version ⇒ `Object`

#get_doc ⇒ `Object`

#item ⇒ `Object`

#pub_date ⇒ `Object`