Class: DataLoader::Inspector

Inherits:
Object
  • Object
show all
Defined in:
lib/data_loader/inspector.rb

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.row_sep ⇒ Object (readonly)

The row separator reported by the CSV parser; only set after inspect_file has been called.



13
14
15
# File 'lib/data_loader/inspector.rb', line 13

# Reader for the receiver's @row_sep instance variable.
# inspect_file assigns it from the CSV parser's row_sep, so it is nil
# until inspect_file has been called.
def row_sep
  instance_variable_get(:@row_sep)
end

Class Method Details

.dbtype(value) ⇒ Object

determine what datatype is most suitable for the value



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# File 'lib/data_loader/inspector.rb', line 87

# Determine which database column type best suits a single parsed value.
#
# value - the parsed cell: an Integer, DateTime, String or nil.
#
# Returns :integer, :datetime, :string (strings of 255 characters or fewer)
# or :text (longer strings). Returns nil for nil and for blank strings,
# i.e. the value gives no evidence about the column's type.
# Raises RuntimeError ('Unknown type') for a value of any other class.
def self.dbtype(value)
  case value
  when Integer
    # Integer instead of Fixnum: also accepts integers too large for a
    # Fixnum, and keeps working on Rubies where Fixnum no longer exists.
    :integer
  when DateTime
    :datetime
  when String
    # String#blank? is not core Ruby (presumably ActiveSupport's extension).
    if value.blank?
      nil
    elsif value.length <= 255
      :string
    else
      :text
    end
  when nil
    nil
  else
    raise 'Unknown type'
  end
end

.inspect_file(file, separator = ',', inspect_rows = 10, hints = {}) ⇒ Object

read a csv and return the columns and types in an ordered array



17
18
19
20
21
22
23
24
25
26
27
28
29
# File 'lib/data_loader/inspector.rb', line 17

# Open a CSV file and work out its columns and their data types.
#
# file         - the file to open (passed straight to FasterCSV.open).
# separator    - column separator, ',' by default.
# inspect_rows - how many rows scan_rows may sample, 10 by default.
# hints        - per-column type overrides, forwarded to scan_rows.
#
# Side effect: stores the parser's row separator in @row_sep.
# Returns whatever scan_rows returns for the opened file.
def self.inspect_file(file, separator = ',', inspect_rows = 10, hints = {})
  # Turn a raw header into a column-style name: underscore it (String#underscore
  # is not core Ruby - presumably ActiveSupport), blank out every character
  # outside a-z, 0-9 and _, trim, then join the words with single underscores.
  normalize_header = lambda do |h|
    h.underscore.gsub(/[^a-z0-9_]/, ' ').strip.gsub(' ', '_').squeeze('_')
  end

  csv_options = {
    :col_sep => separator,
    # converters in use; the original author's note lists
    # :integer, :float, :date, :date_time as the candidates
    :converters => [:date_time, :integer],
    :headers => true,
    :header_converters => normalize_header,
    :skip_blanks => true
  }

  fields = nil
  FasterCSV.open(file, csv_options) do |csv|
    @row_sep = csv.row_sep
    fields = scan_rows(csv, inspect_rows, hints)
  end
  fields
end

.promote_type(*types) ⇒ Object

given any number of datatypes, choose the one that fits them all



108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/data_loader/inspector.rb', line 108

# Pick the single column type able to hold values of all the given types.
#
# types - any number of type symbols (:text, :string, :datetime, :integer);
#         nil entries are ignored.
#
# Returns nil when no non-nil type is given, the type itself when all given
# types agree, :text when the types differ and one of them is :text, and
# :string for any other mix.
# Raises RuntimeError ('Unknown type') if any entry is not a known type.
def self.promote_type(*types)
  candidates = types.compact
  return nil if candidates.empty?

  # anything left after removing the known types is unsupported
  raise 'Unknown type' unless (candidates - [:text, :string, :datetime, :integer]).empty?

  return candidates.first if candidates.uniq.length == 1

  candidates.include?(:text) ? :text : :string
end

.scan_rows(csv, inspect_rows, hints = {}) ⇒ Object

scan a few rows to determine data types



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# File 'lib/data_loader/inspector.rb', line 32

# Sample the first rows of an open CSV to infer a data type for each column.
#
# csv          - open CSV reader; must respond to #gets, returning a row whose
#                #each yields [header, value] pairs, or nil at end of input.
# inspect_rows - maximum number of rows to sample.
# hints        - optional Hash of column name => type that overrides the
#                inferred type. Keys may be strings or symbols, types may be
#                symbols or strings. The caller's hash is left untouched.
#
# Returns an Array of {:name => header, :type => type} hashes in the column
# order of the first row read; an empty Array when no row could be read.
# Columns whose type could not be inferred and have no hint become :string.
# Prints notes and warnings to stdout, and aborts the process when a hint
# uses a type other than :text, :string, :datetime or :integer.
def self.scan_rows(csv, inspect_rows, hints = {})
  first_row = nil
  columns = {}  # header => widest type seen so far (unordered)

  1.upto(inspect_rows) do
    begin
      row = csv.gets
      break unless row
      row.each do |header, value|
        columns[header] = promote_type(columns[header], dbtype(value))
      end
      first_row ||= row # remembered for its column order
    rescue FasterCSV::MalformedCSVError
      # Ignore the bad line and read again: per the original author's note
      # FasterCSV skips the malformed line, so it does not use up a sample.
      retry
    end
  end

  # form an ordered array based on the first row read; when nothing was
  # read (empty input or inspect_rows < 1) there are simply no fields
  fields = []
  (first_row || []).each do |header, _value|
    fields << {:name => header, :type => columns[header]}
  end

  # Normalise hints into a private copy (string keys, symbol types) so the
  # caller's hash is not mutated and string types pass the validation below.
  hints = hints.inject({}) do |normalized, (key, type)|
    normalized[key.to_s] = type.is_a?(String) ? type.to_sym : type
    normalized
  end

  # validate hints
  invalid_columns = hints.keys - fields.map {|f| f[:name]}
  puts "Warning: hint column(s) not found: #{invalid_columns.join(', ')}" unless invalid_columns.empty?
  invalid_types = hints.values - [:text, :string, :datetime, :integer]
  abort "Error: hint types(s) are invalid: #{invalid_types.join(', ')}" unless invalid_types.empty?

  fields.each do |field|
    name, field_type = field[:name], field[:type]
    # override columns with hints
    if hints.has_key?(name)
      hint_type = hints[name]
      if field_type.nil?
        puts "Note: undertermined type for #{name} hinted as #{hint_type}."
      elsif hint_type != field_type
        puts "Note: overriding type #{field_type} for #{name} with #{hint_type}."
      end
      field[:type] = hint_type
    end
    # default to :string if everything was nil (and no hint)
    if field[:type].nil?
      puts "Warning: type could not be determined for #{name}, defaulting to string."
      field[:type] = :string
    end
  end

  fields
end