Class: DataKit::CSV::SchemaAnalyzer

Inherits:
Object
  • Object
show all
Defined in:
lib/data_kit/csv/schema_analyzer.rb

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(csv, options = {}) ⇒ SchemaAnalyzer

Returns a new instance of SchemaAnalyzer.



9
10
11
12
13
14
15
16
17
18
19
# File 'lib/data_kit/csv/schema_analyzer.rb', line 9

# Builds an analyzer around +csv+ and records the analysis settings.
#
# @param csv [Object] the CSV source; stored untouched in @csv
# @param options [Hash] optional settings
# @option options [Object] :keys stored in @keys; falls back to [] when nil/false
# @option options [Object] :sampling_rate stored in @sampling_rate; falls back
#   to 0.1 when nil/false
# @option options [Object] :use_type_hints type hints are on when this is
#   omitted (nil) or exactly +true+; every other value turns them off
def initialize(csv, options = {})
  @csv = csv
  @keys = options[:keys] || []
  @sampling_rate = options[:sampling_rate] || 0.1

  hints = options[:use_type_hints]
  # Only "not given" (nil) or a literal true enables hints; other truthy
  # values such as a String are deliberately treated as disabled.
  @use_type_hints = hints.nil? || hints == true
end

Instance Attribute Details

#csv ⇒ Object

Returns the value of attribute csv.



4
5
6
# File 'lib/data_kit/csv/schema_analyzer.rb', line 4

# Reader for the @csv instance variable.
#
# @return [Object] the current value of @csv
def csv
  @csv
end

#keys ⇒ Object

Returns the value of attribute keys.



5
6
7
# File 'lib/data_kit/csv/schema_analyzer.rb', line 5

# Reader for the @keys instance variable.
#
# @return [Object] the current value of @keys
def keys
  @keys
end

#sampling_rate ⇒ Object

Returns the value of attribute sampling_rate.



6
7
8
# File 'lib/data_kit/csv/schema_analyzer.rb', line 6

# Reader for the @sampling_rate instance variable.
#
# @return [Object] the current value of @sampling_rate
def sampling_rate
  @sampling_rate
end

#use_type_hints ⇒ Object

Returns the value of attribute use_type_hints.



7
8
9
# File 'lib/data_kit/csv/schema_analyzer.rb', line 7

# Reader for the @use_type_hints instance variable.
#
# @return [Object] the current value of @use_type_hints
def use_type_hints
  @use_type_hints
end

Class Method Details

.analyze(csv, options = {}) ⇒ Object



45
46
47
48
49
50
51
52
53
# File 'lib/data_kit/csv/schema_analyzer.rb', line 45

# One-shot helper: builds an analyzer for +csv+ and immediately runs it.
#
# Only the :keys, :sampling_rate and :use_type_hints entries of +options+
# are forwarded to the constructor; any other entries are ignored.
#
# @param csv [Object] the CSV source handed to the new analyzer
# @param options [Hash] analysis settings (see the keys listed above)
# @return [Object] whatever the analyzer's #execute returns
def analyze(csv, options = {})
  keys, rate, hints = options.values_at(:keys, :sampling_rate, :use_type_hints)

  new(csv, :keys => keys, :sampling_rate => rate, :use_type_hints => hints).execute
end

.sampling_rate(file_size) ⇒ Object



55
56
57
58
59
60
61
62
# File 'lib/data_kit/csv/schema_analyzer.rb', line 55

# Derives a sampling rate from a file size.
#
# Sizes below 1024 * 1024 are sampled in full (1.0). Larger sizes get
# 500 / sqrt(file_size), rounded to four decimal places, so the rate
# shrinks as the file grows.
#
# @param file_size [Numeric] size of the file (presumably in bytes — confirm
#   against callers)
# @return [Float] the sampling rate
def sampling_rate(file_size)
  return 1.0 if file_size < 1024 * 1024

  (500 / Math.sqrt(file_size)).round(4)
end

Instance Method Details

#execute ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/data_kit/csv/schema_analyzer.rb', line 21

# Walks every row of the CSV and feeds a random sample of them into a
# SchemaAnalysis.
#
# Every row bumps the analysis' total counter. A row is additionally
# sampled when a fresh random draw is <= sampling_rate; a sampled row bumps
# the sample counter and has each of its values inserted under the
# stringified header found at the same index.
#
# @return [SchemaAnalysis, nil] the populated analysis, or nil when
#   csv.each_row yields no rows (the analysis is only built on the first row)
def execute
  rng = Random.new
  result = nil

  csv.each_row do |row|
    # Built lazily on the first yielded row, using the headers as they are
    # at that point.
    result ||= SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)

    result.increment_total
    if rng.rand <= sampling_rate
      result.increment_sample
      row.each_with_index do |value, position|
        result.insert(csv.headers[position].to_s, value)
      end
    end
  end

  result
end