Class: OAI::Harvester::Harvest
- Inherits:
-
Object
- Object
- OAI::Harvester::Harvest
- Defined in:
- lib/oai/harvester/harvest.rb,
lib/oai/harvester/logging.rb
Instance Method Summary
-
#initialize(*args) ⇒ Harvest
constructor
A new instance of Harvest.
- #orig_call ⇒ Object
- #orig_harvest ⇒ Object
-
#orig_init ⇒ Harvest
A new instance of Harvest.
- #orig_start ⇒ Object
- #start(sites = nil, interactive = false) ⇒ Object
Constructor Details
#initialize(*args) ⇒ Harvest
Returns a new instance of Harvest.
9 10 11 12 13 14 15 |
# File 'lib/oai/harvester/harvest.rb', line 9
#
# Sets up a harvester.
#
# config    - configuration object; when nil/false, Config.load is used.
# directory - storage directory; when nil/false, @config.storage is used.
# date      - optional "from" date; it is stored as @from and frozen in place
#             (the caller's object itself is frozen, not a copy).
#
# The parser name is 'libxml' when XML::Document is defined at call time,
# otherwise 'rexml'.
def initialize(config = nil, directory = nil, date = nil)
  @config = config ? config : Config.load
  @directory = directory ? directory : @config.storage
  @from = date.freeze
  @parser = if defined?(XML::Document)
              'libxml'
            else
              'rexml'
            end
end
Instance Method Details
#orig_call ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/oai/harvester/logging.rb', line 8
#
# Downloads all records from the OAI provider at +url+ into a gzipped
# temporary file, following resumption tokens until the provider stops
# returning one.
#
# url  - base URL handed to OAI::Client.new.
# opts - request options for the first ListRecords call. The hash is
#        duplicated, so the caller's copy is left untouched.
#
# Returns a two-element Array: the (closed) Tempfile holding the gzipped
# XML, and the number of records written.
# Any exception raised while fetching or writing propagates to the caller
# after the gzip stream and temp file have been closed.
def call(url, opts)
  # Work on a copy so the caller's hash is never modified.
  options = opts.dup
  records = 0

  client = OAI::Client.new(url, :parser => @parser)
  # The Identify response is not used here; the request is still issued
  # before any temp file is created, as in the original flow.
  client.identify

  file = Tempfile.new('oai_data')
  gz = Zlib::GzipWriter.new(file)
  begin
    # "<?xml" must not contain whitespace after "<?", otherwise the output
    # is not a well-formed XML declaration.
    gz << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
    gz << "<records>"

    # Forward the caller's options; previously they were copied but never
    # sent, so every harvest requested the provider's defaults.
    response = client.list_records(options)
    loop do
      get_records(response.doc).each do |rec|
        gz << rec
        records += 1
      end
      puts "#{records} records retrieved" if @interactive

      token = response.resumption_token
      break unless token && !token.empty?

      puts "\nresumption token recieved, continuing" if @interactive
      response = client.list_records(:resumption_token => token)
    end

    gz << "</records>"
  ensure
    gz.close
    file.close
  end

  [file, records]
end
#orig_harvest ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/oai/harvester/logging.rb', line 7
#
# Harvests one configured site and moves the downloaded archive into the
# storage directory.
#
# site - key into @config.sites identifying the provider to harvest.
#
# On success the site's 'last' entry in @config is set to the harvest time.
# Errors are re-raised, with one exception: when running interactively, an
# error whose +code+ is "noRecordsMatch" is reported as
# "No new records available" instead of being raised.
def harvest(site)
  # Copy the site's settings: the keys deleted and added below are request
  # parameters and must not leak into the configuration that gets saved.
  opts = @config.sites[site].dup
  harvest_time = Time.now.utc

  if granularity(opts[:url]) == "YYYY-MM-DD"
    opts[:until] = harvest_time.strftime("%Y-%m-%d")
    opts[:from] = @from.strftime("%Y-%m-%d") if @from
  else
    opts[:until] = harvest_time.xmlschema
    opts[:from] = @from.xmlschema if @from
  end

  # Fall back to the provider's earliest date when no from date was given.
  opts[:from] = earliest(opts[:url]) unless opts[:from]
  opts.delete(:set) if opts[:set] == 'all'

  begin
    # Connect, and download
    file, records = call(opts.delete(:url), opts)

    # Move document to storage directory
    dir = File.join(@directory, date_based_directory(harvest_time))
    FileUtils.mkdir_p dir
    FileUtils.mv(file.path,
      File.join(dir, "#{site}-#{filename(Time.parse(opts[:from]), harvest_time)}.xml.gz"))
    @config.sites[site]['last'] = harvest_time
  rescue => e
    # Only an interactive "noRecordsMatch" is downgraded to a message.
    # The former `not a || b` condition parsed as `not (a || b)` and so
    # swallowed every other coded error as "No new records available".
    no_new_records = e.respond_to?(:code) && e.code == "noRecordsMatch"
    raise unless @interactive && no_new_records
    puts "No new records available"
  end
end
#orig_init ⇒ Harvest
Returns a new instance of Harvest.
9 10 11 12 13 14 15 |
# File 'lib/oai/harvester/logging.rb', line 9
#
# Creates a new Harvest.
#
# config    - configuration to use; Config.load supplies one when omitted.
# directory - where harvested files are stored; defaults to @config.storage.
# date      - optional starting date, kept in @from. The object passed in is
#             frozen in place.
#
# @parser becomes 'libxml' if XML::Document is defined, 'rexml' otherwise.
def initialize(config = nil, directory = nil, date = nil)
  config = Config.load unless config
  @config = config

  directory = @config.storage unless directory
  @directory = directory

  date.freeze
  @from = date

  libxml_available = defined?(XML::Document)
  @parser = libxml_available ? 'libxml' : 'rexml'
end
#orig_start ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/oai/harvester/logging.rb', line 6
#
# Harvests each of the given sites in turn.
#
# sites       - collection of site keys to harvest. When nil/false, the keys
#               of @config.sites are used; if reading them raises a
#               StandardError, an empty Hash is used and nothing is harvested.
# interactive - stored in @interactive for the duration of the run.
#
# @config.save is always called once iteration has begun, including when a
# harvest raises; the exception then propagates to the caller.
def start(sites = nil, interactive = false)
  @interactive = interactive

  unless sites
    sites = begin
      @config.sites.keys
    rescue
      {}
    end
  end

  begin
    sites.each { |site| harvest(site) }
  ensure
    @config.save
  end
end
#start(sites = nil, interactive = false) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/oai/harvester/harvest.rb', line 17
#
# Runs a harvest over a list of sites.
#
# sites       - site keys to process; when omitted, every key in
#               @config.sites is used (an empty Hash stands in if that
#               lookup raises a StandardError).
# interactive - flag remembered in @interactive.
#
# The configuration is saved via @config.save after the sites have been
# processed, and also when processing is interrupted by an exception.
def start(sites = nil, interactive = false)
  @interactive = interactive
  sites ||= begin
    @config.sites.keys
  rescue
    {}
  end

  begin
    sites.each do |name|
      harvest(name)
    end
  ensure
    @config.save
  end
end