Class: RelatonItu::DataFetcher

Inherits:
Object
  • Object
show all
Defined in:
lib/relaton_itu/data_fetcher.rb

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(output, format) ⇒ DataFetcher

Returns a new instance of DataFetcher.



3
4
5
6
7
# File 'lib/relaton_itu/data_fetcher.rb', line 3

# Remembers where and how documents are written and derives the file extension.
#
# @param output [String] directory used as the prefix of every written file
# @param format [String] output format; "bibxml" is stored with the "xml" extension
def initialize(output, format)
  @output, @format = output, format
  @ext = format.sub("bibxml", "xml")
end

Class Method Details

.fetch(output: "data", format: "yaml") ⇒ Object



42
43
44
45
46
47
48
49
50
# File 'lib/relaton_itu/data_fetcher.rb', line 42

# Creates the output directory, runs a full fetch and prints timing information.
#
# @param output [String] directory the files are written to (created when missing)
# @param format [String] format handed to the new instance
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  # The elapsed time is taken from the monotonic clock so that wall-clock
  # adjustments during a long run cannot skew or negate the reported value.
  tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p output
  new(output, format).fetch
  puts "Stopped at: #{Time.now}"
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - tick
  puts "Done in: #{elapsed.round} sec."
end

Instance Method Details

#agent ⇒ Object



18
19
20
# File 'lib/relaton_itu/data_fetcher.rb', line 18

# @return [Mechanize] the HTTP agent, built on first use and reused afterwards
def agent
  return @agent if @agent

  @agent = Mechanize.new
end

#content(bib) ⇒ Object



138
139
140
141
142
143
144
# File 'lib/relaton_itu/data_fetcher.rb', line 138

# Serializes a bibliographic item according to the configured format.
#
# @param bib [Object] item responding to #to_hash, #to_xml and #to_bibxml
# @return [String, nil] serialized item, or nil when @format is none of
#   "yaml", "xml" or "bibxml"
def content(bib)
  return bib.to_hash.to_yaml if @format == "yaml"
  return bib.to_xml(bibdata: true) if @format == "xml"

  bib.to_bibxml if @format == "bibxml"
end

#fetch ⇒ Object



52
53
54
55
56
57
58
59
60
61
# File 'lib/relaton_itu/data_fetcher.rb', line 52

# Runs every index step in order, then calls #end and #result on the
# workers pool and finally saves the index.
#
# @return [Object] whatever index.save returns
def fetch
  %i[fetch_recommendation fetch_question fetch_report fetch_handbook fetch_resolution].each do |step|
    send step
  end
  pool = workers
  pool.end
  pool.result
  index.save
end

#fetch_handbook ⇒ Object



92
93
94
95
# File 'lib/relaton_itu/data_fetcher.rb', line 92

# Indexes the R-HDB HTML listing; every entry is queued with type "handbook".
def fetch_handbook
  html_index "https://extranet.itu.int/brdocsearch/R-HDB/Forms/Folders%20InForce.aspx", "handbook"
end

#fetch_question ⇒ Object



75
76
77
78
# File 'lib/relaton_itu/data_fetcher.rb', line 75

# Indexes the R-QUE HTML listing; every entry is queued with type "question".
def fetch_question
  html_index "https://extranet.itu.int/brdocsearch/R-QUE/Forms/folders_inforce.aspx", "question"
end

#fetch_recommendation ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
# File 'lib/relaton_itu/data_fetcher.rb', line 63

# Indexes the JSON list view whose page URL points at R-REC; every entry is
# queued with type "recommendation".
def fetch_recommendation
  # Query parameters of the inplview.aspx endpoint, joined with "&" below.
  query = [
    "List=%7B0661B581-2413-4E84-BAB2-77E6DB27AF7F%7D",
    "View=%7BC81191DD-48C4-4881-9CB7-FB61C683FE98%7D",
    "ViewCount=123",
    "IsXslView=TRUE",
    "IsCSR=TRUE",
    "ListViewPageUrl=https%3A%2F%2Fextranet.itu.int%2Fbrdocsearch%2FR-REC%2FForms%2Ffolders_inforce.aspx",
    "FolderCTID=0x012001",
  ].join("&")
  json_index "https://extranet.itu.int/brdocsearch/_layouts/15/inplview.aspx?#{query}", "recommendation"
end

#fetch_report ⇒ Object



80
81
82
83
84
85
86
87
88
89
90
# File 'lib/relaton_itu/data_fetcher.rb', line 80

# Indexes the JSON list view whose page URL points at R-REP; every entry is
# queued with type "technical-report".
def fetch_report
  # Query parameters of the inplview.aspx endpoint, joined with "&" below.
  query = [
    "List=%7B82E4A13D-C7F3-4844-9E8A-2463C4B7784F%7D",
    "View=%7B94CC1561-E4AC-4317-B402-AA0AADD7F414%7D",
    "ViewCount=407",
    "IsXslView=TRUE",
    "IsCSR=TRUE",
    "ListViewPageUrl=https%3A%2F%2Fextranet.itu.int%2Fbrdocsearch%2FR-REP%2FForms%2FFolders%2520InForce.aspx",
    "FolderCTID=0x012001",
  ].join("&")
  json_index "https://extranet.itu.int/brdocsearch/_layouts/15/inplview.aspx?#{query}", "technical-report"
end

#fetch_resolution ⇒ Object



97
98
99
100
# File 'lib/relaton_itu/data_fetcher.rb', line 97

# Indexes the R-RES HTML listing; every entry is queued with type "resolution".
def fetch_resolution
  html_index "https://extranet.itu.int/brdocsearch/R-RES/Forms/Folders%20InForce.aspx", "resolution"
end

#files ⇒ Object



9
10
11
# File 'lib/relaton_itu/data_fetcher.rb', line 9

# @return [Array] memoized list, initially empty, that callers append to
def files
  @files = [] unless @files
  @files
end

#html_index(url, type) ⇒ Object

Parameters:

  • url (String)
  • type (String)


116
117
118
119
120
121
122
123
# File 'lib/relaton_itu/data_fetcher.rb', line 116

# Downloads an HTML index page and queues every document link found in it.
#
# Rows are the table rows matched by the XPath below, skipping the first row
# of each inner table. The document URL is taken from the `onclick` attribute
# of the row's `td/a` element. Rows without such a link, without an `onclick`
# attribute, or without an https URL inside it are skipped with a warning
# rather than aborting the whole run or queueing an empty URL.
#
# @param url [String] address of the index page
# @param type [String] document type queued together with each URL
def html_index(url, type)
  resp = agent.get url
  result = Nokogiri::HTML(resp.body)
  result.xpath("//table//table/tr[position() > 1]").each do |hit|
    link = hit.at("td/a")
    onclick = link && link[:onclick]
    # String#[] with a regexp yields nil when nothing matches.
    doc_url = onclick.to_s[%r{https://[^']+}]
    if doc_url
      workers << [doc_url, type]
    else
      Util.warn "No document URL found in a row of #{url}"
    end
  end
end

#index ⇒ Object



13
14
15
# File 'lib/relaton_itu/data_fetcher.rb', line 13

# @return [Object] memoized result of Relaton::Index.find_or_create for :itu,
#   using the file "index-v1.yaml"
def index
  return @index if @index

  @index = Relaton::Index.find_or_create(:itu, file: "index-v1.yaml")
end

#json_index(url, type) ⇒ Object

Parameters:

  • url (String)
  • type (String)


104
105
106
107
108
109
110
111
112
# File 'lib/relaton_itu/data_fetcher.rb', line 104

# Posts to a JSON list endpoint and queues every row of every result page.
#
# Each row's "serverurl.progid" value, with a leading "1" removed, is queued
# together with `type`. While the response carries "NextHref", the part of it
# following "aspx?" replaces the "Paged=…"/"FolderCTID=…" tail of the current
# URL and the next page is requested.
#
# @param url [String] address of the first page
# @param type [String] document type queued together with each URL
# @return [nil]
def json_index(url, type) # rubocop:disable Metrics/AbcSize
  page_url = url
  loop do
    json = JSON.parse(agent.post(page_url).body)
    json["Row"].each do |row|
      workers << [row["serverurl.progid"].sub(/^1/, ""), type]
    end
    next_href = json["NextHref"]
    break unless next_href

    page_url = page_url.sub(/(Paged|FolderCTID)=.+/, next_href.match(/(?<=aspx\?).+/).to_s)
  end
end

#parse_page(url, type) ⇒ Object

Parameters:

  • url (String)
  • type (String)


34
35
36
37
38
39
40
# File 'lib/relaton_itu/data_fetcher.rb', line 34

# Downloads one document page, parses it and writes the result to disk.
# Any StandardError raised along the way is reported through Util.error
# instead of propagating.
#
# @param url [String] address of the document page
# @param type [String] document type passed to the parser
def parse_page(url, type)
  page = agent.get(url)
  write_file DataParserR.parse(page, url, type)
rescue StandardError => e
  Util.error "#{e.message}\n#{e.backtrace}"
end

#workers ⇒ Object



23
24
25
26
27
28
29
30
# File 'lib/relaton_itu/data_fetcher.rb', line 23

# Lazily builds the workers pool (constructed with 10) and registers a job
# that splats each queued row into #parse_page.
#
# @return [RelatonBib::WorkersPool] the memoized pool
def workers
  unless @workers
    @workers = RelatonBib::WorkersPool.new(10)
    @workers.worker { |args| parse_page(*args) }
  end
  @workers
end

#write_file(bib) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
# File 'lib/relaton_itu/data_fetcher.rb', line 126

# Writes the serialized item to "<output>/<id>.<ext>" and records it in the index.
#
# Whitespace and dots in the first document identifier are replaced with "_"
# to form the file name. A warning is logged when the same path was already
# written during this run; the file is written (overwritten) regardless.
#
# @param bib [Object] item exposing #docidentifier, whose first element has #id
# @return [Integer] value returned by File.write
def write_file(bib) # rubocop:disable Metrics/AbcSize
  docid = bib.docidentifier[0].id
  path = "#{@output}/#{docid.gsub(/[\s.]/, '_')}.#{@ext}"
  already_written = files.include?(path)
  Util.warn "File #{path} exists." if already_written
  files << path unless already_written
  index.add_or_update docid, path
  File.write path, content(bib), encoding: "UTF-8"
end