Module: DarwinCore::Ingester

Included in:
Core, Extension
Defined in:
lib/dwc-archive/ingester.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Instance Attribute Details

#data ⇒ Object (readonly)

Returns the value of attribute data.



4
5
6
# File 'lib/dwc-archive/ingester.rb', line 4

# Read-only accessor.
#
# @return [Object] the current value of the @data instance variable
def data
  @data
end

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



4
5
6
# File 'lib/dwc-archive/ingester.rb', line 4

# Read-only accessor.
#
# @return [Object] the current value of the @encoding instance variable
def encoding
  @encoding
end

#fields ⇒ Object (readonly)

Returns the value of attribute fields.



5
6
7
# File 'lib/dwc-archive/ingester.rb', line 5

# Read-only accessor.
#
# @return [Object] the current value of the @fields instance variable
def fields
  @fields
end

#fields_separator ⇒ Object (readonly)

Returns the value of attribute fields_separator.



4
5
6
# File 'lib/dwc-archive/ingester.rb', line 4

# Read-only accessor.
#
# @return [Object] the current value of the @fields_separator instance
#   variable
def fields_separator
  @fields_separator
end

#file_path ⇒ Object (readonly)

Returns the value of attribute file_path.



5
6
7
# File 'lib/dwc-archive/ingester.rb', line 5

# Read-only accessor.
#
# @return [Object] the current value of the @file_path instance variable
def file_path
  @file_path
end

#ignore_headers ⇒ Object (readonly)

Returns the value of attribute ignore_headers.



5
6
7
# File 'lib/dwc-archive/ingester.rb', line 5

# Read-only accessor.
#
# @return [Object] the current value of the @ignore_headers instance variable
def ignore_headers
  @ignore_headers
end

#line_separator ⇒ Object (readonly)

Returns the value of attribute line_separator.



5
6
7
# File 'lib/dwc-archive/ingester.rb', line 5

# Read-only accessor.
#
# @return [Object] the current value of the @line_separator instance variable
def line_separator
  @line_separator
end

#properties ⇒ Object (readonly)

Returns the value of attribute properties.



4
5
6
# File 'lib/dwc-archive/ingester.rb', line 4

# Read-only accessor.
#
# @return [Object] the current value of the @properties instance variable
def properties
  @properties
end

#quote_character ⇒ Object (readonly)

Returns the value of attribute quote_character.



5
6
7
# File 'lib/dwc-archive/ingester.rb', line 5

# Read-only accessor.
#
# @return [Object] the current value of the @quote_character instance
#   variable
def quote_character
  @quote_character
end

#size ⇒ Object (readonly)

Returns the value of attribute size.



4
5
6
# File 'lib/dwc-archive/ingester.rb', line 4

# Read-only accessor.
#
# @return [Object] the current value of the @size instance variable
def size
  @size
end

Instance Method Details

#read(batch_size = 10000) {|[res, errors]| ... } ⇒ Object

Yields:

  • ([res, errors])


12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/dwc-archive/ingester.rb', line 12

# Parses the CSV file at @file_path row by row, handing every row to
# process_csv_row and collecting the outcome in two arrays.
#
# Rows with fewer columns than the highest field index requires are appended
# to the errors array untouched; all other rows are passed to
# process_csv_row(res, errors, row), which fills the arrays.
#
# When a block is given, the accumulated [res, errors] pair is yielded each
# time batch_size records have been ingested, and both arrays are then
# replaced by fresh empty ones. The remainder (possibly empty) is yielded
# once more after the last row.
#
# Side effect: an empty @quote_character is replaced by "\b" so that CSV
# receives a one-character quote string.
#
# NOTE(review): the column separator is read from @field_separator, while the
# public reader exposes @fields_separator. The code assigning these is not
# visible here - confirm which instance variable is actually populated.
#
# @param batch_size [Integer] number of records between progress log entries
#   (and between yields when a block is given)
# @yield [res, errors] the arrays accumulated since the previous yield
# @return [Array] two-element array [res, errors]; with a block these are the
#   arrays left over after the last batch was yielded
def read(batch_size = 10000)
  DarwinCore.logger_write(@dwc.object_id, "Reading %s data" % name)
  res = []
  errors = []
  # Row indexes are zero-based. With a header row, index i already equals the
  # number of data records read; without one the count is i + 1.
  index_fix = @ignore_headers ? 0 : 1
  @quote_character = "\b" if @quote_character.empty?
  args = { :col_sep => @field_separator, :quote_char => @quote_character }
  # Smallest column count a row needs to cover every declared field index
  # (0 when no fields are declared).
  min_size = (@fields.map { |f| f[:index].to_i }.max || -1) + 1
  # Block form guarantees the file handle is closed, even if the caller's
  # block raises or breaks out early.
  File.open(@file_path) do |file|
    # Options are splatted because CSV.new only accepts keywords on Ruby 3+.
    CSV.new(file, **args).each_with_index do |r, i|
      next if @ignore_headers && i == 0
      min_size > r.size ? errors << r : process_csv_row(res, errors, r)
      next unless ((i + index_fix) % batch_size).zero?
      DarwinCore.logger_write(@dwc.object_id,
                              "Ingested %s records from %s" %
                              [(i + index_fix), name])
      next unless block_given?
      yield [res, errors]
      res = []
      errors = []
    end
  end
  yield [res, errors] if block_given?
  [res, errors]
end