Module: Isobib::Scrapper
Defined in:
lib/isobib/scrapper.rb
Overview
Scrapper: fetches search hits from www.iso.org and parses each standard's page into a bibliographic item.
Constant Summary
- DOMAIN =
'https://www.iso.org'
- TYPES =
{
  'TS'    => 'technicalSpecification',
  'TR'    => 'technicalReport',
  'PAS'   => 'publiclyAvailableSpecification',
  'AWI'   => 'approvedWorkItem',
  'CD'    => 'committeeDraft',
  'FDIS'  => 'finalDraftInternationalStandard',
  'NP'    => 'newProposal',
  'DIS'   => 'draftInternationalStandard',
  'WD'    => 'workingDraft',
  'R'     => 'recommendation',
  'Guide' => 'guide'
}.freeze
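TYPES maps the abbreviation found in an ISO document title to its long-form document type. A minimal lookup sketch (the keys shown are taken from the constant above; the lookups themselves are illustrative):

Isobib::Scrapper::TYPES['TS']    # => "technicalSpecification"
Isobib::Scrapper::TYPES['FDIS']  # => "finalDraftInternationalStandard"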
Class Method Summary
- .get(text) ⇒ Array<Hash>
  Fetch search hits for text and parse each hit's page.
- .parse_page(hit_data) ⇒ Hash
  Parse page.
Class Method Details
.get(text) ⇒ Array<Hash>
# File 'lib/isobib/scrapper.rb', line 42

def get(text)
  begin
    iso_workers = WorkersPool.new 4
    iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
    algolia_workers = start_algolia_search(text, iso_workers)
    iso_docs = iso_workers.result
    algolia_workers.end
    algolia_workers.result
    iso_docs
  rescue
    warn "Could not connect to http://www.iso.org"
    []
  end
end
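A minimal usage sketch, assuming the gem is loaded with require 'isobib' and www.iso.org is reachable; the query string is an arbitrary example, and the returned collection follows the documented Array<Hash> signature:

require 'isobib'

# Search ISO for documents matching the text and collect the parsed hits.
# An empty array is returned when www.iso.org cannot be reached.
hits = Isobib::Scrapper.get('19115')
hits.each { |hit| puts hit.inspect }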
.parse_page(hit_data) ⇒ Hash
Parse page.
# File 'lib/isobib/scrapper.rb', line 61

def parse_page(hit_data)
  return unless hit_data['path'].match(/\d+$/)
  doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

  # Fetch edition.
  edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")
               &.children&.last&.text&.match(/\d+/)&.to_s

  titles, abstract = fetch_titles_abstract(doc)

  IsoBibItem::IsoBibliographicItem.new(
    docid: fetch_docid(doc),
    edition: edition,
    language: langs(doc).map { |l| l[:lang] },
    script: langs(doc).map { |l| script(l[:lang]) }.uniq,
    titles: titles,
    type: fetch_type(hit_data['title']),
    docstatus: fetch_status(doc, hit_data['status']),
    ics: fetch_ics(doc),
    dates: fetch_dates(doc),
    contributors: fetch_contributors(hit_data['title']),
    workgroup: fetch_workgroup(doc),
    abstract: abstract,
    copyright: fetch_copyright(hit_data['title'], doc),
    link: fetch_link(doc, url),
    relations: fetch_relations(doc)
  )
end
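A hedged sketch of calling parse_page directly. The hit hash below is hypothetical: it only includes the 'path', 'title' and 'status' keys that the method reads, and the values are made up rather than taken from a real search result.

# Hypothetical hit data shaped like an Algolia search hit;
# values are illustrative only.
hit_data = {
  'path'   => 'standard/12345',
  'title'  => 'ISO/TS 12345:2018',
  'status' => 'Published'
}

item = Isobib::Scrapper.parse_page(hit_data)
# The parsed record is built as an IsoBibItem::IsoBibliographicItem.
puts item.class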