Class: Idata::Detector

Inherits:
Object
  • Object
show all
Defined in:
lib/idata/detector.rb

Constant Summary collapse

DEFAULT_DELIMITER =
","
COMMON_DELIMITERS =
[DEFAULT_DELIMITER, "|", "\t", ";"]
SAMPLE_SIZE =
100

Instance Method Summary collapse

Constructor Details

#initialize(file) ⇒ Detector

Returns a new instance of Detector.



23
24
25
26
27
28
29
30
# File 'lib/idata/detector.rb', line 23

# Reads the first SAMPLE_SIZE lines of +file+ and counts how often each of
# the COMMON_DELIMITERS occurs in that sample.
#
# @param file [String, #to_s] path of the delimited text file to inspect
def initialize(file)
  @file = file
  # The argument-array form of IO.popen runs `head` directly, without a shell,
  # so a path containing spaces or shell metacharacters is passed through
  # verbatim instead of being split or interpreted.
  raw = IO.popen(['head', '-n', SAMPLE_SIZE.to_s, '--', @file.to_s], &:read)
  # Treat the bytes as binary; anything that cannot be converted to UTF-8 is dropped.
  @sample = raw.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  @sample_lines = @sample.split(/[\r\n]+/)
  # Keep only the delimiters that actually appear in the sample.
  @candidates = COMMON_DELIMITERS.map { |delim|
    [delim, @sample.scan(delim).count]
  }.to_h.select { |_delim, count| count > 0 }
end

Instance Method Details

#find ⇒ Object



32
33
34
35
# File 'lib/idata/detector.rb', line 32

# Picks the delimiter for the sampled file.
#
# Strategies are tried in order (uniform column count, parses without error,
# most occurrences); the first one that yields a delimiter wins.
#
# @return [String] the chosen delimiter, or DEFAULT_DELIMITER when no
#   candidate was seen in the sample or no strategy could decide
def find
  if @candidates.empty?
    # e.g. a file that holds only a single-column header
    DEFAULT_DELIMITER
  else
    find_same_occurence || find_valid || find_max_occurence || DEFAULT_DELIMITER
  end
end

#find_max_occurence ⇒ Object

Most occurrences: choose the delimiter that appears most often in the sample.



67
68
69
70
71
72
# File 'lib/idata/detector.rb', line 67

# Chooses the delimiter that occurs most often in the sample.
#
# @return [String, nil] the single most frequent delimiter; DEFAULT_DELIMITER
#   when several delimiters tie for the top count and it is one of them;
#   nil otherwise
def find_max_occurence
  # Compare against the highest *count*. The previous code compared each count
  # with a [delimiter, count] pair, which could never match.
  top_count = @candidates.values.max
  selected = @candidates.select { |_delim, count| count == top_count }.keys

  return selected.first if selected.count == 1
  return DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end

#find_same_occurence ⇒ Object

High confidence level: every non-empty row of the sample parses to the same number of columns.



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/idata/detector.rb', line 53

# Selects the delimiters for which every non-empty row of the sample parses
# to the same number of columns.
#
# @return [String, nil] the only such delimiter; DEFAULT_DELIMITER when several
#   qualify and it is one of them; nil otherwise
def find_same_occurence
  selected = @candidates.keys.select do |delim|
    begin
      widths = CSV.parse(@sample, col_sep: delim).reject(&:empty?).map(&:size)
      widths.uniq.size == 1
    rescue StandardError
      # A sample that cannot be parsed with this delimiter disqualifies it.
      # Only StandardError is rescued so Interrupt/SystemExit still propagate.
      false
    end
  end

  return selected.first if selected.count == 1
  return DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end

#find_valid ⇒ Object

Just works: the sample parses as CSV with this delimiter without raising an error.



38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/idata/detector.rb', line 38

# Selects the delimiters with which the sample parses as CSV without error.
#
# @return [String, nil] the only such delimiter; DEFAULT_DELIMITER when several
#   qualify and it is one of them; nil otherwise
def find_valid
  selected = @candidates.keys.select do |delim|
    begin
      CSV.parse(@sample, col_sep: delim)
      true
    rescue StandardError
      # A parse failure just rules this delimiter out. Only StandardError is
      # rescued so Interrupt/SystemExit still propagate.
      false
    end
  end

  return selected.first if selected.count == 1
  return DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end