Module: RelatonIso::Scrapper

Defined in:
lib/relaton_iso/scrapper.rb

Overview

Scrapper: fetches document pages from the ISO website and parses them into bibliographic items.

Constant Summary collapse

DOMAIN =

rubocop:disable Metrics/ModuleLength

"https://www.iso.org"
# Maps a document-type abbreviation to its spelled-out, hyphenated type name.
# The commented-out entries are abbreviations that are currently not mapped.
# The hash itself is frozen.
TYPES =
{
  "TS" => "technical-specification",
  "TR" => "technical-report",
  "PAS" => "publicly-available-specification",
  # "AWI" => "approvedWorkItem",
  # "CD" => "committeeDraft",
  # "FDIS" => "finalDraftInternationalStandard",
  # "NP" => "newProposal",
  # "DIS" => "draftInternationalStandard",
  # "WD" => "workingDraft",
  # "R" => "recommendation",
  "Guide" => "guide",
}.freeze
# Maps two-digit stage codes to stage abbreviations. Code "60" maps to a
# nested hash rather than a string; that hash is keyed by a second two-digit
# code (presumably the substage -- confirm against the code that reads it).
# Only the outer hash is frozen.
STGABBR =
{
  "00" => "NWIP",
  "10" => "AWI",
  "20" => "WD",
  "30" => "CD",
  "40" => "DIS",
  "50" => "FDIS",
  "60" => { "00" => "PRF", "60" => "FINAL" },
}.freeze
# Known publishing organizations, keyed by abbreviation. Each entry holds the
# organization's full name under :name and its website host (no scheme) under
# :url. Names carry no surrounding whitespace so they can be emitted verbatim.
PUBLISHERS =
{
  "IEC" => { name: "International Electrotechnical Commission",
             url: "www.iec.ch" },
  "ISO" => { name: "International Organization for Standardization",
             url: "www.iso.org" },
  "IEEE" => { name: "Institute of Electrical and Electronics Engineers",
              url: "www.ieee.org" },
  "SAE" => { name: "SAE International", url: "www.sae.org" },
  "CIE" => { name: "International Commission on Illumination",
             url: "cie.co.at" },
  "ASME" => { name: "American Society of Mechanical Engineers",
              url: "www.asme.org" },
}.freeze

Class Method Summary collapse

Class Method Details

.fetch_relaton_docids(doc, pubid) ⇒ Array<RelatonBib::DocumentIdentifier>

Create document ids.

Parameters:

  • doc (Nokogiri::HTML::Document)

    document to parse

  • pubid (Pubid::Iso::Identifier)

    publication identifier

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


96
97
98
99
100
101
102
103
# File 'lib/relaton_iso/scrapper.rb', line 96

# Build the document identifiers for a publication.
#
# If +pubid+ has no stage yet, one is parsed from the stage code of +doc+ and
# stored on +pubid+, so the argument may be mutated.
#
# @param doc [Nokogiri::HTML::Document] document to parse
# @param pubid [Pubid::Iso::Identifier] publication identifier
# @return [Array<RelatonBib::DocumentIdentifier>] the primary "ISO" id, the
#   "iso-reference" id and the "URN" id, in that order
def fetch_relaton_docids(doc, pubid)
  # Only look up the stage in the document when the identifier lacks one.
  pubid.stage = Pubid::Iso::Identifier.parse_stage(stage_code(doc)) unless pubid.stage

  primary_id = RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true)
  reference_id = RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference")
  urn_id = RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN")

  [primary_id, reference_id, urn_id]
end

.isoref(pubid) ⇒ String

Create ISO reference identifier with English language.

Parameters:

  • pubid (Pubid::Iso::Identifier)

    publication identifier

Returns:

  • (String)

    English reference identifier



112
113
114
115
# File 'lib/relaton_iso/scrapper.rb', line 112

# Render the English-language ISO reference form of an identifier.
#
# A new identifier is created from the parameters of +pubid+, minus
# :typed_stage, and formatted as :ref_num_short.
#
# NOTE(review): the parameters are splatted after +language: "en"+, so a
# :language entry among them would take precedence -- confirm whether
# get_params can contain one.
#
# @param pubid [Pubid::Iso::Identifier] publication identifier
# @return [String] English reference identifier
def isoref(pubid)
  attributes = pubid.get_params.reject { |name, _value| name == :typed_stage }
  english_id = Pubid::Iso::Identifier.create(language: "en", **attributes)
  english_id.to_s(format: :ref_num_short)
end

.parse_page(hit, lang = nil) ⇒ RelatonIsoBib::IsoBibliographicItem

Parse page.

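Parameters:

  • hit

    search hit to parse; its hit[:path], hit[:title] and pubid are read

  • lang (defaults to: nil)

    language passed on to fetch_titles_abstract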

Returns:

  • (RelatonIsoBib::IsoBibliographicItem)


51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/relaton_iso/scrapper.rb', line 51

# Fetch the page a search hit points to and build a bibliographic item from it.
#
# The edition is read from the trailing digits of the page's "Edition" text;
# when the hit's pubid has a base identifier without an edition, that value is
# stored on it as well.
#
# @param hit [Object] search hit; +hit.hit[:path]+, +hit.hit[:title]+ and
#   +hit.pubid+ are read
# @param lang [String, nil] forwarded to +fetch_titles_abstract+
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  # Previous way of building the path, kept for reference:
  # path = "/contents/data/standard#{hit_data['splitPath']}/"\
  # "#{hit_data['csnumber']}.html"

  page_path = hit.hit[:path].sub("/sites/isoorg", "")
  doc, url = get_page "#{page_path}.html"

  # Edition: trailing digits of the last text node next to the "Edition" label.
  edition_text = doc.at("//div[div[.='Edition']]/text()[last()]")&.text
  edition = edition_text[/\d+$/] if edition_text
  if hit.pubid.base
    hit.pubid.base.edition ||= edition
  end

  titles, abstract, languages = fetch_titles_abstract(doc, lang)

  # Helpers are invoked in the same order as the item's attributes are listed.
  attributes = {
    fetched: Date.today.to_s,
    docid: fetch_relaton_docids(doc, hit.pubid),
    docnumber: fetch_docnumber(hit.pubid),
    edition: edition,
    language: languages.map { |entry| entry[:lang] },
    script: languages.map { |entry| script(entry[:lang]) }.uniq,
    title: titles,
    doctype: fetch_type(hit.hit[:title]),
    docstatus: fetch_status(doc),
    ics: fetch_ics(doc),
    date: fetch_dates(doc, hit.hit[:title]),
    contributor: fetch_contributors(hit.hit[:title]),
    editorialgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(doc),
    link: fetch_link(doc, url),
    relation: fetch_relations(doc),
    place: ["Geneva"],
    structuredidentifier: fetch_structuredidentifier(hit.pubid),
  }
  RelatonIsoBib::IsoBibliographicItem.new(**attributes)
end