Class: RelatonItu::DataFetcher
- Inherits:
-
Object
- Object
- RelatonItu::DataFetcher
- Defined in:
- lib/relaton_itu/data_fetcher.rb
Class Method Summary collapse
Instance Method Summary collapse
- #agent ⇒ Object
- #content(bib) ⇒ Object
- #fetch ⇒ Object
- #fetch_handbook ⇒ Object
- #fetch_question ⇒ Object
- #fetch_recommendation ⇒ Object
- #fetch_report ⇒ Object
- #fetch_resolution ⇒ Object
- #files ⇒ Object
-
#html_index(url, type) ⇒ Object
@param url [String] URL of the HTML page to fetch and scan for document links.
- #index ⇒ Object
-
#initialize(output, format) ⇒ DataFetcher
constructor
A new instance of DataFetcher.
-
#json_index(url, type) ⇒ Object
@param url [String] URL to POST to; the response body is parsed as JSON.
- #parse_page(url, type) ⇒ Object
- #workers ⇒ Object
- #write_file(bib) ⇒ Object
Constructor Details
#initialize(output, format) ⇒ DataFetcher
Returns a new instance of DataFetcher.
3 4 5 6 7 |
# File 'lib/relaton_itu/data_fetcher.rb', line 3
#
# Set up a fetcher that writes documents under +output+ in +format+.
#
# @param output [String] directory prefix used when building file paths
# @param format [String] serialization format name ("yaml", "xml" or "bibxml")
def initialize(output, format)
  @output, @format = output, format
  # The file extension is the format name with "bibxml" replaced by "xml".
  @ext = format.sub("bibxml", "xml")
end
Class Method Details
.fetch(output: "data", format: "yaml") ⇒ Object
42 43 44 45 46 47 48 49 50 |
# File 'lib/relaton_itu/data_fetcher.rb', line 42
#
# Create the output directory, run a complete fetch and print timing
# information to standard output.
#
# @param output [String] directory to create (including parents) and hand
#   to the new fetcher instance
# @param format [String] format name handed to the new fetcher instance
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  # Measure the duration on the monotonic clock: a wall-clock adjustment
  # during a long run would otherwise skew (or even negate) the result.
  tick = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Started at: #{started_at}"
  FileUtils.mkdir_p output
  new(output, format).fetch
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - tick
  puts "Stopped at: #{Time.now}"
  puts "Done in: #{elapsed.round} sec."
end
Instance Method Details
#agent ⇒ Object
18 19 20 |
# File 'lib/relaton_itu/data_fetcher.rb', line 18
#
# HTTP client used for all requests; created on first use and cached in
# @agent.
#
# @return [Mechanize]
def agent
  @agent = Mechanize.new unless @agent
  @agent
end
#content(bib) ⇒ Object
138 139 140 141 142 143 144 |
# File 'lib/relaton_itu/data_fetcher.rb', line 138
#
# Serialize +bib+ according to the format chosen at construction time.
#
# @param bib [#to_hash, #to_xml, #to_bibxml] bibliographic item
# @return [String, nil] serialized document, or nil when @format is none
#   of "yaml", "xml" or "bibxml"
def content(bib)
  if @format == "yaml"
    bib.to_hash.to_yaml
  elsif @format == "xml"
    bib.to_xml(bibdata: true)
  elsif @format == "bibxml"
    bib.to_bibxml
  end
end
#fetch ⇒ Object
52 53 54 55 56 57 58 59 60 61 |
# File 'lib/relaton_itu/data_fetcher.rb', line 52
#
# Queue every document category, then finish the worker pool and save the
# index.
def fetch
  # Each step pushes [url, type] jobs onto the worker pool.
  %i[
    fetch_recommendation
    fetch_question
    fetch_report
    fetch_handbook
    fetch_resolution
  ].each { |step| send(step) }

  # NOTE(review): presumably `end` signals that no more jobs are coming and
  # `result` waits for the queued jobs - confirm against RelatonBib::WorkersPool.
  pool = workers
  pool.end
  pool.result
  index.save
end
#fetch_handbook ⇒ Object
92 93 94 95 |
# File 'lib/relaton_itu/data_fetcher.rb', line 92
#
# Queue the documents listed on the R-HDB HTML page as type "handbook".
def fetch_handbook
  html_index(
    "https://extranet.itu.int/brdocsearch/R-HDB/Forms/Folders%20InForce.aspx",
    "handbook",
  )
end
#fetch_question ⇒ Object
75 76 77 78 |
# File 'lib/relaton_itu/data_fetcher.rb', line 75
#
# Queue the documents listed on the R-QUE HTML page as type "question".
def fetch_question
  html_index(
    "https://extranet.itu.int/brdocsearch/R-QUE/Forms/folders_inforce.aspx",
    "question",
  )
end
#fetch_recommendation ⇒ Object
63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/relaton_itu/data_fetcher.rb', line 63
#
# Queue the documents returned by the R-REC JSON list view as type
# "recommendation".
def fetch_recommendation
  # FolderCTID stays last: json_index rewrites the URL from
  # "Paged=" / "FolderCTID=" to the end when it follows the next page.
  query = [
    "List=%7B0661B581-2413-4E84-BAB2-77E6DB27AF7F%7D",
    "View=%7BC81191DD-48C4-4881-9CB7-FB61C683FE98%7D",
    "ViewCount=123",
    "IsXslView=TRUE",
    "IsCSR=TRUE",
    "ListViewPageUrl=https%3A%2F%2Fextranet.itu.int%2Fbrdocsearch%2FR-REC%2FForms%2Ffolders_inforce.aspx",
    "FolderCTID=0x012001",
  ].join("&")
  json_index "https://extranet.itu.int/brdocsearch/_layouts/15/inplview.aspx?#{query}", "recommendation"
end
#fetch_report ⇒ Object
80 81 82 83 84 85 86 87 88 89 90 |
# File 'lib/relaton_itu/data_fetcher.rb', line 80
#
# Queue the documents returned by the R-REP JSON list view as type
# "technical-report".
def fetch_report
  # FolderCTID stays last: json_index rewrites the URL from
  # "Paged=" / "FolderCTID=" to the end when it follows the next page.
  query = [
    "List=%7B82E4A13D-C7F3-4844-9E8A-2463C4B7784F%7D",
    "View=%7B94CC1561-E4AC-4317-B402-AA0AADD7F414%7D",
    "ViewCount=407",
    "IsXslView=TRUE",
    "IsCSR=TRUE",
    "ListViewPageUrl=https%3A%2F%2Fextranet.itu.int%2Fbrdocsearch%2FR-REP%2FForms%2FFolders%2520InForce.aspx",
    "FolderCTID=0x012001",
  ].join("&")
  json_index "https://extranet.itu.int/brdocsearch/_layouts/15/inplview.aspx?#{query}", "technical-report"
end
#fetch_resolution ⇒ Object
97 98 99 100 |
# File 'lib/relaton_itu/data_fetcher.rb', line 97
#
# Queue the documents listed on the R-RES HTML page as type "resolution".
def fetch_resolution
  html_index(
    "https://extranet.itu.int/brdocsearch/R-RES/Forms/Folders%20InForce.aspx",
    "resolution",
  )
end
#files ⇒ Object
9 10 11 |
# File 'lib/relaton_itu/data_fetcher.rb', line 9
#
# List of file paths recorded so far; created empty on first access and
# cached in @files.
#
# @return [Array]
def files
  @files = [] unless @files
  @files
end
#html_index(url, type) ⇒ Object
@param url [String] URL of the HTML page to fetch and scan for document links.
116 117 118 119 120 121 122 123 |
# File 'lib/relaton_itu/data_fetcher.rb', line 116
#
# Fetch an HTML listing page and queue one [document_url, type] job for
# every table row that links to a document.
#
# Rows without a usable link are reported with Util.warn and skipped, so a
# single odd row neither aborts the run nor enqueues an empty URL.
#
# @param url [String] URL of the HTML page to fetch
# @param type [String] document type stored alongside each queued URL
def html_index(url, type)
  page = Nokogiri::HTML(agent.get(url).body)
  # position() > 1 skips the first row of each nested table.
  page.xpath("//table//table/tr[position() > 1]").each do |row|
    link = row.at("td/a")
    onclick = link && link[:onclick]
    # The document URL is embedded in the onclick handler, up to the closing quote.
    doc_url = onclick.to_s[%r{https://[^']+}]
    if doc_url
      workers << [doc_url, type]
    else
      Util.warn "No document URL found in a #{type} row of #{url}."
    end
  end
end
#index ⇒ Object
13 14 15 |
# File 'lib/relaton_itu/data_fetcher.rb', line 13
#
# Index object for :itu backed by "index-v1.yaml"; looked up or created on
# first use and cached in @index.
#
# @return [Object] whatever Relaton::Index.find_or_create returns
def index
  @index = Relaton::Index.find_or_create(:itu, file: "index-v1.yaml") unless @index
  @index
end
#json_index(url, type) ⇒ Object
@param url [String] URL to POST to; the response body is parsed as JSON.
104 105 106 107 108 109 110 111 112 |
# File 'lib/relaton_itu/data_fetcher.rb', line 104
#
# POST to a JSON list view, queue one job per returned row and keep
# following "NextHref" until the response no longer carries one.
#
# @param url [String] URL of the first page of the list view
# @param type [String] document type stored alongside each queued URL
# @return [nil]
def json_index(url, type) # rubocop:disable Metrics/AbcSize
  page_url = url
  while page_url
    data = JSON.parse(agent.post(page_url).body)
    data["Row"].each do |row|
      # A leading "1" is stripped from the "serverurl.progid" value.
      workers << [row["serverurl.progid"].sub(/^1/, ""), type]
    end

    next_href = data["NextHref"]
    # Replace everything from "Paged=" / "FolderCTID=" onwards with the
    # query string of NextHref (the part after "aspx?").
    page_url = next_href && page_url.sub(/(Paged|FolderCTID)=.+/, next_href.match(/(?<=aspx\?).+/).to_s)
  end
end
#parse_page(url, type) ⇒ Object
34 35 36 37 38 39 40 |
# File 'lib/relaton_itu/data_fetcher.rb', line 34
#
# Download one document page, parse it into a bibliographic item and write
# it to disk. Any StandardError is reported through Util.error instead of
# being raised, so one failing document does not stop the others.
#
# @param url [String] URL of the document page
# @param type [String] document type passed on to the parser
def parse_page(url, type)
  doc = agent.get url
  bib = DataParserR.parse doc, url, type
  write_file bib
rescue => e # rubocop:disable Style/RescueStandardError
  # Log the message followed by the backtrace, one frame per line.
  Util.error "#{e.message}\n#{Array(e.backtrace).join("\n")}"
end
#workers ⇒ Object
23 24 25 26 27 28 29 30 |
# File 'lib/relaton_itu/data_fetcher.rb', line 23
#
# Worker pool that runs parse_page for every queued [url, type] pair;
# created on first use and cached in @workers.
#
# @return [RelatonBib::WorkersPool]
def workers
  unless @workers
    # NOTE(review): 10 is presumably the number of worker threads - confirm
    # against RelatonBib::WorkersPool.
    @workers = RelatonBib::WorkersPool.new(10)
    @workers.worker { |row| parse_page(*row) }
  end
  @workers
end
#write_file(bib) ⇒ Object
126 127 128 129 130 131 132 133 134 135 136 |
# File 'lib/relaton_itu/data_fetcher.rb', line 126
#
# Write +bib+ to "<output>/<id>.<ext>" and record it in the index.
#
# The file name is the first document identifier with whitespace and dots
# replaced by underscores. When the same path was already recorded, a
# warning is emitted and the file is written again anyway.
#
# @param bib [#docidentifier] bibliographic item to serialize
def write_file(bib) # rubocop:disable Metrics/AbcSize
  docid = bib.docidentifier[0].id
  path = "#{@output}/#{docid.gsub(/[\s.]/, '_')}.#{@ext}"

  seen = files.include?(path)
  Util.warn "File #{path} exists." if seen
  files << path unless seen

  # The index maps the untouched identifier to the written path.
  index.add_or_update docid, path
  File.write path, content(bib), encoding: "UTF-8"
end