Class: DataPackage::Interpreter

Inherits:
Object
  • Object
show all
Defined in:
lib/datapackage/interpreter.rb

Constant Summary collapse

# Upper bound on the sample size: the constructor stores
# min(csv.length, INFER_THRESHOLD) as the instance threshold, and
# type_and_format_at does not evaluate confidence before that many values
# have been inspected.
INFER_THRESHOLD =
10
# Value that type_and_format_at compares (with >=) against the ratio of
# matching inspections to total inspections.
INFER_CONFIDENCE =
0.75
# A '1' or '2' followed by three digits. Unanchored; inspect_value only
# applies it to strings whose length is exactly 4.
YEAR_PATTERN =
/[12]\d{3}/
# One-or-two digits, separator, one-or-two digits, separator, two-to-four
# digits -- or four digits, separator, one-or-two digits, separator,
# one-or-two digits. Separators are '-' or '/'. Unanchored, so it matches
# anywhere inside a longer string.
DATE_PATTERN =
/(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})|(\d{4}[-\/]\d{1,2}[-\/]\d{1,2})/
# Either date shape above, then any single character, then one-or-two
# digits, ':' and two digits. Unanchored.
DATETIME_PATTERN =
/(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}|\d{4}[-\/]\d{1,2}[-\/]\d{1,2}).\d{1,2}:\d{2}/
# One-or-two digits followed by either ':' plus one-or-two digits, or one of
# am/pm/AM/PM. Uses ^ and $, which anchor per line rather than per string.
TIME_PATTERN =
/^\d{1,2}((:\d{1,2})|(am|pm|AM|PM))$/
# A line consisting solely of digits. Uses ^ and $ (per-line anchors).
INTEGER_PATTERN =
/^\d+$/
# Fallback result returned by inspect_value and type_and_format_at when no
# more specific type is detected. The same Hash object is returned each time.
DEFAULT_TYPE_FORMAT =
{'type' => 'any', 'format' => 'default'}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(csv) ⇒ Interpreter

Returns a new instance of Interpreter.



14
15
16
17
# File 'lib/datapackage/interpreter.rb', line 14

# Wraps +csv+ and fixes how many values must be inspected before a column
# type may be inferred.
#
# @param csv [#length, #values_at] tabular data; +length+ is read here and
#   +values_at+ is called later by type_and_format_at
def initialize(csv)
  @csv = csv
  row_count = @csv.length
  # Never demand more samples than the data actually holds.
  @threshold = [row_count, INFER_THRESHOLD].min
end

Instance Attribute Details

#csv ⇒ Object (readonly)

Returns the value of attribute csv.



12
13
14
# File 'lib/datapackage/interpreter.rb', line 12

# Read-only accessor for the object handed to the constructor.
#
# @return [Object] the value stored in @csv by #initialize
def csv
  @csv
end

#threshold ⇒ Object (readonly)

Returns the value of attribute threshold.



12
13
14
# File 'lib/datapackage/interpreter.rb', line 12

# Read-only accessor for the sample threshold computed by the constructor
# as the smaller of csv.length and INFER_THRESHOLD.
#
# @return [Object] the value stored in @threshold by #initialize
def threshold
  @threshold
end

Instance Method Details

#inspect_value(value) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/datapackage/interpreter.rb', line 40

# Classifies one cell value by matching it against the pattern constants.
#
# Non-String input yields DEFAULT_TYPE_FORMAT. A four-character string that
# matches YEAR_PATTERN is a 'year'. Otherwise the first matching pattern in
# the order datetime, date, time, integer decides the type; when nothing
# matches, DEFAULT_TYPE_FORMAT is returned.
#
# @param value [Object] the raw cell value
# @return [Hash] a hash with 'type' and 'format' keys
def inspect_value(value)
  return DEFAULT_TYPE_FORMAT unless value.is_a?(String)

  type_name =
    if value.length == 4 && value.match(YEAR_PATTERN)
      'year'
    else
      # Order matters: DATE_PATTERN is unanchored and would also match the
      # date portion of a datetime string, so datetime is tried first.
      ordered_checks = [
        ['datetime', DATETIME_PATTERN],
        ['date', DATE_PATTERN],
        ['time', TIME_PATTERN],
        ['integer', INTEGER_PATTERN]
      ]
      hit = ordered_checks.find { |(_label, pattern)| value.match(pattern) }
      hit && hit.first
    end

  type_name ? { 'type' => type_name, 'format' => 'default' } : DEFAULT_TYPE_FORMAT
end

#type_and_format_at(header) ⇒ Object



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/datapackage/interpreter.rb', line 19

# Infers the type/format of the column identified by +header+.
#
# Values are inspected in order with inspect_value and tallied per result.
# Once at least +threshold+ values have been seen, the first value whose own
# result accounts for a share of at least INFER_CONFIDENCE of everything
# inspected so far decides the column, and scanning stops.
#
# @param header [Object] column identifier, passed straight to csv.values_at
# @return [Hash] the winning inspection result, or DEFAULT_TYPE_FORMAT when
#   no result ever reaches the required confidence (including empty columns)
def type_and_format_at(header)
  values = csv.values_at(header).flatten
  counter = Hash.new(0)

  values.each_with_index do |value, i|
    inspection_count = i + 1

    inspection = inspect_value(value)
    counter[inspection] += 1
    next if inspection_count < threshold

    # fdiv yields a Float ratio. Integer#/ truncates, so any share below
    # 100% collapsed to 0 and the 0.75 cut-off effectively became 1.0.
    confidence = counter[inspection].fdiv(inspection_count)
    return inspection if confidence >= INFER_CONFIDENCE
  end

  DEFAULT_TYPE_FORMAT
end