Class: TableSchema::Infer

Inherits:
Object
  • Object
show all
Includes:
Helpers
Defined in:
lib/tableschema/infer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Helpers

#deep_symbolize_keys, #get_class_for_type, #read_file, #type_class_lookup

Constructor Details

#initialize(headers, rows, explicit: false, primary_key: nil, row_limit: nil) ⇒ Infer

Returns a new instance of Infer.



11
12
13
14
15
16
17
18
19
20
21
22
23
# File 'lib/tableschema/infer.rb', line 11

# Builds an inferrer and immediately runs inference over the given rows.
#
# @param headers [Array] column names; #fields builds one descriptor per entry
# @param rows [#each_with_index] data rows that #infer! samples
# @param explicit [Boolean] stored as @explicit and read by #fields
# @param primary_key [Object, nil] recorded under :primaryKey when truthy
# @param row_limit [Integer, nil] stored as @row_limit and read by #infer!
def initialize(headers, rows, explicit: false, primary_key: nil, row_limit: nil)
  @headers, @rows = headers, rows
  @explicit, @primary_key, @row_limit = explicit, primary_key, row_limit

  # #fields reads the instance variables assigned above, so it must run after them.
  descriptor = { fields: fields }
  descriptor[:primaryKey] = @primary_key if @primary_key
  @schema = descriptor

  infer!
end

Instance Attribute Details

#schema ⇒ Object (readonly)

Returns the value of attribute schema.



9
10
11
# File 'lib/tableschema/infer.rb', line 9

# Reader for the @schema instance variable.
#
# @return [Object] whatever is currently stored in @schema
def schema
  @schema
end

Instance Method Details

#available_types ⇒ Object



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/tableschema/infer.rb', line 128

# Names of the types the inferrer knows about.
#
# Order matters: #guess_type walks this list back to front, so entries
# nearer the end are tried first. A fresh array is returned on every call.
#
# @return [Array<String>]
def available_types
  %w[
    any string boolean number integer date
    time datetime array object geopoint geojson
  ]
end

#fields ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/tableschema/infer.rb', line 25

# Builds one TableSchema::Field per header with empty title/description.
#
# Constraints:
# * :required is true only when @explicit is exactly `true`.
# * :unique is true when the header equals @primary_key.
# In explicit mode both keys are always present; otherwise constraints whose
# value is false are dropped, and the :constraints key is omitted entirely
# when nothing is left.
#
# @return [Array<TableSchema::Field>]
def fields
  strict = @explicit === true

  @headers.map do |header|
    flags = { required: strict, unique: header == @primary_key }
    flags = flags.reject { |_name, flag| flag == false } unless strict

    descriptor = { name: header, title: '', description: '' }
    descriptor[:constraints] = flags unless flags.empty?

    TableSchema::Field.new(descriptor)
  end
end

#guess_format(converter, col) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'lib/tableschema/infer.rb', line 89

# Finds the first non-default format whose caster accepts the value.
#
# Every public instance method of the converter whose name starts with
# `cast_` is treated as a format caster; the format name is the method name
# without that prefix. The default format's caster is skipped. A caster that
# raises TableSchema::Exception rejects the value and the next one is tried.
#
# The prefix match is anchored so unrelated methods that merely contain
# "cast_" (e.g. `broadcast_x`) are not mistaken for casters, and the format
# name is derived without mutating the string in place.
#
# @param converter [Object] type converter exposing `cast_<format>` methods
# @param col [Object] the cell value to probe
# @return [String] the matching format name, or TableSchema::DEFAULTS[:format]
def guess_format(converter, col)
  default_format = TableSchema::DEFAULTS[:format]

  converter.class.instance_methods.grep(/\Acast_/).each do |method|
    format = method.to_s.sub(/\Acast_/, '')
    next if format == default_format

    begin
      converter.send(method, col)
      return format
    rescue TableSchema::Exception
      next
    end
  end

  default_format
end

#guess_type(col, index) ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/tableschema/infer.rb', line 69

# Guesses the type and format of one cell.
#
# Candidate types come from #available_types and are tried from the end of
# that list towards the start. For each candidate a converter is built for
# the field at `index`; the first one whose `test` returns true wins and its
# format is taken from #guess_format. If none accepts the value, the
# TableSchema::DEFAULTS type and format are returned.
#
# @param col [Object] the cell value
# @param index [Integer] position of the column in @schema[:fields]
# @return [Hash] `{ type: ..., format: ... }`
def guess_type(col, index)
  guess = {
    type: TableSchema::DEFAULTS[:type],
    format: TableSchema::DEFAULTS[:format]
  }

  available_types.reverse_each do |candidate|
    converter_class = Kernel.const_get(get_class_for_type(candidate))
    converter = converter_class.new(@schema[:fields][index])
    next unless converter.test(col) === true

    guess[:type] = candidate
    guess[:format] = guess_format(converter, col)
    break
  end

  guess
end

#infer! ⇒ Object



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/tableschema/infer.rb', line 42

# Samples the rows, collects a type guess for every cell, resolves one type
# per column and replaces @schema with a TableSchema::Schema built from it.
#
# Each row is normalised to exactly @headers.count cells first: extra cells
# are dropped and missing ones are padded with empty strings. The caller's
# row objects are never modified.
#
# When @row_limit is set, at most that many rows are examined.
#
# @return [TableSchema::Schema] the new value of @schema
def infer!
  type_matches = []
  headers_length = @headers.count

  @rows.each_with_index do |row, index|
    # index is zero-based, so >= stops after exactly @row_limit rows.
    break if @row_limit && index >= @row_limit

    # Only reference CSV::Row when the csv library is actually loaded.
    row = row.fields if defined?(CSV::Row) && row.is_a?(CSV::Row)
    row_length = row.count

    if row_length > headers_length
      # Exclusive range: keep exactly headers_length cells.
      row = row[0...headers_length]
    elsif row_length < headers_length
      # Build a new array so the source row is untouched and nested cell
      # values are not flattened.
      row = row + Array.new(headers_length - row_length, '')
    end

    row.each_with_index do |col, idx|
      (type_matches[idx] ||= []) << guess_type(col, idx)
    end
  end

  resolve_types(type_matches)
  @schema = TableSchema::Schema.new(@schema)
end

#resolve_types(results) ⇒ Object



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/tableschema/infer.rb', line 106

# Chooses one guess per column and merges it into the matching field.
#
# For each column the guess that occurs most often wins; when several
# guesses are equally frequent, the one seen first wins. Occurrences are
# counted on the full list (no de-duplication first), and the arrays passed
# in are left unmodified. Columns with no guesses are skipped.
#
# @param results [Array<Array<Hash>>] per-column lists of
#   `{ type:, format: }` guesses, indexed like @schema[:fields]
# @return [Array<Array<Hash>>] the `results` argument
def resolve_types(results)
  results.each_with_index do |guesses, column|
    next if guesses.empty?

    counts = guesses.each_with_object(Hash.new(0)) { |guess, acc| acc[guess] += 1 }
    winner, _count = counts.max_by { |_guess, count| count }

    @schema[:fields][column].merge!(winner)
  end
end