Class: RelatonOmg::Scraper
- Inherits:
- Object
- Inheritance hierarchy: Object → RelatonOmg::Scraper
- Defined in:
- lib/relaton_omg/scraper.rb
Constant Summary collapse
- URL_PATTERN =
"https://www.omg.org/spec/".freeze
Class Method Summary collapse
Instance Method Summary collapse
- #doc_version ⇒ Object
- #fetch_abstract ⇒ Object
- #fetch_date ⇒ Object
- #fetch_docid ⇒ Object
- #fetch_id ⇒ Object
- #fetch_keyword ⇒ Object
- #fetch_license ⇒ Object
- #fetch_link ⇒ Object
- #fetch_relation ⇒ Object
- #fetch_status ⇒ Object
- #fetch_title ⇒ Object
- #fetch_version ⇒ Object
- #get_doc ⇒ Object
- #initialize(acronym, version = nil, spec = nil) ⇒ Scraper (constructor): A new instance of Scraper.
- #item ⇒ Object
- #pub_date ⇒ Object
Constructor Details
#initialize(acronym, version = nil, spec = nil) ⇒ Scraper
Returns a new instance of Scraper.
7 8 9 10 11 |
# File 'lib/relaton_omg/scraper.rb', line 7
#
# Remembers the three parts of an OMG reference so the fetch methods can use
# them later.
#
# @param acronym [Object] specification acronym (a regex capture when called
#   from scrape_page)
# @param version [Object, nil] optional version part of the reference
# @param spec [Object, nil] optional sub-specification part of the reference
def initialize(acronym, version = nil, spec = nil)
  @acronym, @version, @spec = acronym, version, spec
end
Class Method Details
.scrape_page(ref) ⇒ Object
13 14 15 16 17 18 19 20 21 22 |
# File 'lib/relaton_omg/scraper.rb', line 13
#
# Parses an OMG reference of the form "OMG <acronym>[ <version>][ <spec>]",
# downloads the matching page and builds a bibliographic item from it.
#
# The pattern is anchored with \A and \z so that the WHOLE string has to be a
# reference; with ^ and $ a multi-line string that merely contains a reference
# on one of its lines would be accepted.
#
# @param ref [String] reference text, e.g. "OMG AMI4CCM 1.0"
# @return [OmgBibliographicItem, nil] nil when the reference does not match,
#   when get_doc returns nil, or when no link could be found on the page
def self.scrape_page(ref)
  parts = %r{\AOMG (?<acronym>\S+)(?:[\s/](?<version>[\d.]+(?:\sbeta(?:\s\d)?)?))?(?:[\s/](?<spec>\w+))?\z}.match(ref)
  return unless parts

  scraper = new(parts[:acronym], parts[:version], parts[:spec])
  return if scraper.get_doc.nil? || scraper.fetch_link.empty?

  OmgBibliographicItem.new(**scraper.item)
end
Instance Method Details
#doc_version ⇒ Object
78 79 80 |
# File 'lib/relaton_omg/scraper.rb', line 78
#
# Reads the text of the "Version:" entry from the parsed page and memoizes it.
#
# fetch_docid already treats the version as optional (`if doc_version`), so a
# page without that entry yields nil here instead of raising NoMethodError.
#
# @return [String, nil] version text, or nil when the page has no such entry
def doc_version
  @doc_version ||= @doc.at('//dt[.="Version:"]/following-sibling::dd/p/span')&.text
end
#fetch_abstract ⇒ Object
69 70 71 72 |
# File 'lib/relaton_omg/scraper.rb', line 69
#
# Builds the abstract list from the first paragraph of the page's
# "document-metadata" section.
#
# @return [Array<Hash>] a single hash with :content, :language and :script,
#   or an empty array when the page has no such paragraph (previously this
#   case raised NoMethodError on nil)
def fetch_abstract
  paragraph = @doc.at('//section[@id="document-metadata"]/div/div/p')
  return [] unless paragraph

  [{ content: paragraph.text, language: "en", script: "Latn" }]
end
#fetch_date ⇒ Object
82 83 84 |
# File 'lib/relaton_omg/scraper.rb', line 82
#
# Wraps the publication date in the date-list structure used by #item.
#
# @return [Array<Hash>] one hash with type "published" and the date as a string
def fetch_date
  published_on = pub_date.to_s
  [{ type: "published", on: published_on }]
end
#fetch_docid ⇒ Object
62 63 64 65 66 67 |
# File 'lib/relaton_omg/scraper.rb', line 62
#
# Builds the primary document identifier: "OMG", the acronym, then the
# document version and the spec name when each of them is present, joined by
# single spaces.
#
# @return [Array<RelatonBib::DocumentIdentifier>] one identifier of type "OMG"
def fetch_docid
  optional_parts = [doc_version, @spec].select(&:itself)
  label = ["OMG", @acronym, *optional_parts].join(" ")
  [RelatonBib::DocumentIdentifier.new(id: label, type: "OMG", primary: true)]
end
#fetch_id ⇒ Object
51 52 53 |
# File 'lib/relaton_omg/scraper.rb', line 51
#
# Concatenates acronym, document version and spec name (no separators) into
# the item id; missing parts contribute an empty string.
#
# @return [String]
def fetch_id
  [@acronym, doc_version, @spec].map(&:to_s).join
end
#fetch_keyword ⇒ Object
126 127 128 |
# File 'lib/relaton_omg/scraper.rb', line 126
#
# Collects the text of every entry listed under "Categories:" on the page.
#
# @return [Array<String>] category names, empty when none are listed
def fetch_keyword
  category_nodes = @doc.xpath('//dt[.="Categories:"]/following-sibling::dd/ul/li/a/em')
  category_nodes.map { |node| node.text }
end
#fetch_license ⇒ Object
130 131 132 133 134 |
# File 'lib/relaton_omg/scraper.rb', line 130
#
# Extracts the licence names from the "IPR Mode" entry of the page. For each
# matching node only the first run of word characters, whitespace and hyphens
# is kept, with surrounding whitespace stripped.
#
# @return [Array<String>] one string per matching node ("" when a node's text
#   has no such run)
def fetch_license
  query = '//dt/span/a[contains(., "IPR Mode")]/../../following-sibling::dd/span'
  @doc.xpath(query).map do |span|
    span.text[/[\w\s-]+/].to_s.strip
  end
end
#fetch_link ⇒ Object
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/relaton_omg/scraper.rb', line 96
#
# Collects the links of the document and memoizes them.
#
# When a spec name was given, only the anchor pointing at
# "<url>/<spec>/PDF" is used (type "src"). Otherwise the "This Document:"
# anchor (type "src") and the "download-document" anchor (type "pdf") are
# collected, each only if present.
#
# The memo guard previously tested @link while the result was stored in
# @links, so the cache never hit and every call re-queried the document.
# Both now use @links.
#
# @return [Array<Hash>] hashes with :type and :content; empty when no anchor
#   was found
def fetch_link
  return @links if @links

  links = []
  if @spec
    a = @doc.at("//a[@href='#{@url}/#{@spec}/PDF']")
    links << { type: "src", content: a[:href] } if a
  else
    a = @doc.at('//dt[.="This Document:"]/following-sibling::dd/a')
    links << { type: "src", content: a[:href] } if a
    pdf = @doc.at('//a[@class="download-document"]')
    links << { type: "pdf", content: pdf[:href] } if pdf
  end
  @links = links
end
#fetch_relation ⇒ Object
112 113 114 115 116 117 118 119 120 121 122 123 124 |
# File 'lib/relaton_omg/scraper.rb', line 112
#
# Turns the rows of the page's "History" table into "obsoletes" relations.
# Rows whose first cell equals the current document version are skipped; for
# every other row the acronym is taken from the fifth "/"-separated segment of
# the link in the third cell.
#
# @return [Array<Hash>] hashes with type "obsoletes" and a :bibitem built from
#   the formatted reference "OMG <acronym> <version>"
def fetch_relation
  rows = @doc.xpath('//h2[.="History"]/following-sibling::section/div/table/tbody/tr')
  rows.each_with_object([]) do |row, relations|
    row_version = row.at("td").text
    next if row_version == doc_version

    acronym = row.at("td[3]/a")[:href].split("/")[4]
    fref = RelatonBib::FormattedRef.new content: "OMG #{acronym} #{row_version}"
    relations << { type: "obsoletes", bibitem: OmgBibliographicItem.new(formattedref: fref) }
  end
end
#fetch_status ⇒ Object
90 91 92 93 94 |
# File 'lib/relaton_omg/scraper.rb', line 90
#
# Builds the document status from the "Document Status:" entry of the page.
# The stage is the first run of word characters in the entry's stripped text
# ("" if there is none).
#
# @return [RelatonBib::DocumentStatus]
def fetch_status
  status_text = @doc.at('//dt[.="Document Status:"]/following-sibling::dd').text
  RelatonBib::DocumentStatus.new(stage: status_text.strip[/\w+/].to_s)
end
#fetch_title ⇒ Object
55 56 57 58 59 60 |
# File 'lib/relaton_omg/scraper.rb', line 55
#
# Builds the main title from the "Title:" entry of the page, appending
# ": <spec>" when a spec name was given.
#
# @return [Array<RelatonBib::TypedTitleString>] a single title of type "main"
def fetch_title
  base = @doc.at('//dt[.="Title:"]/following-sibling::dd').text
  content = @spec ? "#{base}: #{@spec}" : base
  title = RelatonBib::FormattedString.new(content: content, language: "en", script: "Latn")
  [RelatonBib::TypedTitleString.new(type: "main", title: title)]
end
#fetch_version ⇒ Object
74 75 76 |
# File 'lib/relaton_omg/scraper.rb', line 74
#
# Wraps the publication date and the document version in a version object.
#
# @return [Array<RelatonBib::BibliographicItem::Version>] a single version
def fetch_version
  version = RelatonBib::BibliographicItem::Version.new(pub_date, doc_version)
  [version]
end
#get_doc ⇒ Object
24 25 26 27 28 29 30 31 32 |
# File 'lib/relaton_omg/scraper.rb', line 24
#
# Builds the page URL from URL_PATTERN, the acronym and (if given) the version
# with its spaces turned into "/", downloads the page and parses it with
# Nokogiri. The URL is kept in @url and the parsed page in @doc.
#
# Each failure is rescued separately: the previous single handler called
# `e.io` on every exception, but Net::OpenTimeout has no `io`, so a timeout
# surfaced as NoMethodError instead of RelatonBib::RequestError. The block
# form of open_uri is used so the downloaded IO is closed after parsing.
#
# @return [Nokogiri::HTML::Document, nil] the parsed page; nil when the URL is
#   invalid or the server answers 404
# @raise [RelatonBib::RequestError] on any other HTTP error or on an open
#   timeout
def get_doc
  @url = "#{URL_PATTERN}#{@acronym}/"
  @url += @version.tr(" ", "/") if @version
  @doc = OpenURI.open_uri(@url, open_timeout: 10) { |io| Nokogiri::HTML(io) }
rescue URI::InvalidURIError
  nil
rescue OpenURI::HTTPError => e
  return if e.io.status[0] == "404"

  raise RelatonBib::RequestError, "Unable acces #{@url} (#{e.io.status.join(' ')})"
rescue Net::OpenTimeout => e
  # A timeout carries no HTTP status, so report the exception message instead.
  raise RelatonBib::RequestError, "Unable to access #{@url} (#{e.message})"
end
#item ⇒ Object
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# File 'lib/relaton_omg/scraper.rb', line 34
#
# Assembles the attribute hash for the bibliographic item by calling each
# fetch method in turn. Keys are inserted in the same order as the fetchers
# are invoked; :fetched holds today's date as a string.
#
# @return [Hash] attributes keyed by :id, :fetched, :docid, :title, :abstract,
#   :version, :date, :docstatus, :link, :relation, :keyword and :license
def item
  record = { id: fetch_id, fetched: Date.today.to_s }
  {
    docid: :fetch_docid,
    title: :fetch_title,
    abstract: :fetch_abstract,
    version: :fetch_version,
    date: :fetch_date,
    docstatus: :fetch_status,
    link: :fetch_link,
    relation: :fetch_relation,
    keyword: :fetch_keyword,
    license: :fetch_license,
  }.each { |field, fetcher| record[field] = send(fetcher) }
  record
end
#pub_date ⇒ Object
86 87 88 |
# File 'lib/relaton_omg/scraper.rb', line 86
#
# Parses the "Publication Date:" entry of the page into a Date.
#
# @return [Date]
def pub_date
  date_text = @doc.at('//dt[.="Publication Date:"]/following-sibling::dd').text
  Date.parse(date_text.strip)
end