Module: RelatonGb::Scrapper

Included in:
GbScrapper, SecScrapper, TScrapper
Defined in:
lib/relaton_gb/scrapper.rb

Overview

Common scraping methods.

Constant Summary collapse

# Chinese status labels mapped to the stage names handed to
# RelatonBib::DocumentStatus by get_status.
STAGES = {
  "即将实施" => "published", # "to be implemented soon"
  "现行" => "activated", # "currently in force"
  "废止" => "obsoleted", # "abolished"
  "被代替" => "replaced", # "superseded"
}.freeze

Instance Method Summary collapse

Instance Method Details

#fetch_structuredidentifier(docref) ⇒ RelatonIsoBib::StructuredIdentifier

Parameters:

  • docref (String)

Returns:

  • (RelatonIsoBib::StructuredIdentifier)


49
50
51
52
53
54
55
# File 'lib/relaton_gb/scrapper.rb', line 49

# Splits a document reference into project and part numbers and wraps them
# in a structured identifier.
#
# The project number is everything from the start of a line up to the last
# digit that precedes the first dot or dash (hyphen, en dash or em dash).
# The part number is the digit run directly after a dot, or "" when there
# is none.
#
# @param docref [String] document reference, e.g. "GB/T 5606.1-2004"
# @return [RelatonIsoBib::StructuredIdentifier]
# @raise [NoMethodError] when docref does not match the pattern (no nil
#   check is made on the match result)
def fetch_structuredidentifier(docref)
  project, part = docref.match(/^([^–—.-]*\d+)\.?((?<=\.)\d+|)/).captures
  RelatonIsoBib::StructuredIdentifier.new(
    id: docref,
    type: "Chinese Standard",
    prefix: nil,
    project_number: project,
    part_number: part,
  )
end

#get_contributors(doc, docref) ⇒ Array<Hash>

Parameters:

  • doc (Nokogiri::HTML::Document)
  • docref (String)

Returns:

  • (Array<Hash>)


60
61
62
63
64
65
66
67
68
69
# File 'lib/relaton_gb/scrapper.rb', line 60

# Builds the publisher contributor entry for a standard.
#
# The agency abbreviation is the leading run of non-whitespace characters
# of +docref+. Unless it starts with "GB", a trailing "/T" or "/Z" is
# removed before the agency lookup.
#
# @param doc [Nokogiri::HTML::Document] passed through to get_gbtype
# @param docref [String] document reference, e.g. "GB/T 20223-2006"
# @return [Array<Hash>] a single { entity:, role: } hash, or [] when no
#   agency name could be resolved for either language
def get_contributors(doc, docref)
  name = docref.match(/^[^\s]+/).to_s
  # Use the non-destructive sub: when nothing matches, nil.to_s yields a
  # frozen "" on Ruby >= 2.7 and the former in-place sub! raised FrozenError.
  name = name.sub(%r{/[TZ]$}, "") unless name.start_with?("GB")
  gbtype = get_gbtype(doc, docref)
  orgs = %w[en zh].map { |lang| org(lang, name, gbtype) }.compact
  return [] if orgs.empty?

  entity = RelatonBib::Organization.new(name: orgs)
  [{ entity: entity, role: [type: "publisher"] }]
end

#get_docid(docref) ⇒ Array<RelatonBib::DocumentIdentifier>

Parameters:

  • docref (String)

Returns:

  • (Array<RelatonBib::DocumentIdentifier>)


43
44
45
# File 'lib/relaton_gb/scrapper.rb', line 43

# Wraps the document reference in a one-element list holding a primary
# identifier of type "Chinese Standard".
#
# @param docref [String] document reference
# @return [Array<RelatonBib::DocumentIdentifier>]
def get_docid(docref)
  identifier = RelatonBib::DocumentIdentifier.new(
    primary: true,
    type: "Chinese Standard",
    id: docref,
  )
  [identifier]
end

#get_status(doc, status = nil) ⇒ RelatonBib::DocumentStatus

Parameters:

  • doc (Nokogiri::HTML::Document)
  • status (String, NilClass) (defaults to: nil)

Returns:

  • (RelatonBib::DocumentStatus)


101
102
103
104
105
106
# File 'lib/relaton_gb/scrapper.rb', line 101

# Resolves the document status.
#
# When no status label is supplied it is read from the <span> inside the
# table cell containing '标准状态' ("standard status") and stripped of
# surrounding whitespace. The label is then translated through STAGES.
#
# @param doc [Nokogiri::HTML::Document]
# @param status [String, NilClass] status label; scraped from doc when nil
# @return [RelatonBib::DocumentStatus, nil] nil when the label is missing
#   or has no entry in STAGES
def get_status(doc, status = nil)
  unless status
    cell = doc.at("//td[contains(., '标准状态')]/span")
    status = cell&.text&.strip
  end
  stage = STAGES[status]
  RelatonBib::DocumentStatus.new(stage: stage) if stage
end

#get_titles(doc) ⇒ Array<RelatonBib::TypedTitleString>

Parameters:

  • doc (Nokogiri::HTML::Document)

Returns:

  • (Array<RelatonBib::TypedTitleString>)


85
86
87
88
89
90
91
92
# File 'lib/relaton_gb/scrapper.rb', line 85

# Collects the Chinese title and, when present, the English title.
#
# The Chinese title is the text of the <b> inside the cell labelled
# '中文标准名称' ("Chinese standard name"). The English title is the first
# run of word/whitespace characters in the text of the cell labelled
# '英文标准名称' ("English standard name"), so it stops at the first
# punctuation character.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<RelatonBib::TypedTitleString>] Chinese titles, followed by
#   English titles unless the English text is empty
# @raise [NoMethodError] if either lookup yields nil (no nil check is made
#   before calling #text)
def get_titles(doc)
  zh_text = doc.at("//td[contains(text(), '中文标准名称')]/b").text
  zh_titles = RelatonBib::TypedTitleString.from_string(zh_text, "zh", "Hans")
  en_cell = doc.at("//td[contains(text(), '英文标准名称')]")
  en_text = en_cell.text[/[\w\s]+/].to_s
  if en_text.empty?
    zh_titles
  else
    zh_titles + RelatonBib::TypedTitleString.from_string(en_text, "en", "Latn")
  end
end

#get_type ⇒ Object



94
95
96
# File 'lib/relaton_gb/scrapper.rb', line 94

# Document type shared by every scraped record.
#
# @return [DocumentType] always built with type "standard"
def get_type
  DocumentType.new(type: "standard")
end

#org(lang, name, gbtype) ⇒ Hash

Parameters:

  • lang (String)
  • name (String)
  • gbtype (Hash)

Returns:

  • (Hash)


75
76
77
78
79
80
81
# File 'lib/relaton_gb/scrapper.rb', line 75

# Looks up the issuing agency name for one language.
#
# @param lang [String] language code handed to GbAgencies::Agencies
# @param name [String] agency abbreviation taken from the document reference
# @param gbtype [Hash] read for its :scope and :mandate entries
# @return [Hash, nil] { language:, content: } or nil when the lookup
#   returns nothing
def org(lang, name, gbtype)
  agency_name = GbAgencies::Agencies
    .new(lang, {}, "")
    .standard_agency1(gbtype[:scope], name, gbtype[:mandate])
  { language: lang, content: agency_name } if agency_name
end

#scrapped_data(doc, src, hit) ⇒ Hash

Parameters:

Returns:

  • (Hash)


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/relaton_gb/scrapper.rb', line 21

# Assembles the attribute hash for one scraped standard.
#
# Every value except the fixed ones is delegated to a helper that reads
# either the parsed page (+doc+) or the search hit (+hit.docref+,
# +hit.status+). Fixed values: :fetched is today's date as a string,
# :link is a single "src" entry pointing at +src+, :language is ["zh"]
# and :script is ["Hans"].
#
# @param doc [Nokogiri::HTML::Document] parsed page of the standard
# @param src [String] presumably the URL the page was fetched from; it is
#   stored verbatim as the "src" link content
# @param hit [#docref, #status] search hit the page belongs to
# @return [Hash]
def scrapped_data(doc, src, hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  {
    fetched: Date.today.to_s,
    committee: get_committee(doc, hit.docref),
    docid: get_docid(hit.docref),
    title: get_titles(doc),
    contributor: get_contributors(doc, hit.docref),
    doctype: get_type,
    # hit.status takes precedence; get_status scrapes the page only when it is nil
    docstatus: get_status(doc, hit.status),
    gbtype: get_gbtype(doc, hit.docref),
    ccs: get_ccs(doc),
    ics: get_ics(doc),
    link: [{ type: "src", content: src }],
    date: get_dates(doc),
    language: ["zh"],
    script: ["Hans"],
    structuredidentifier: fetch_structuredidentifier(hit.docref),
  }
end