Class: Idata::Detector
- Inherits:
-
Object
- Object
- Idata::Detector
- Defined in:
- lib/idata/detector.rb
Constant Summary collapse
- DEFAULT_DELIMITER =
","
- COMMON_DELIMITERS =
[DEFAULT_DELIMITER, "|", "\t", ";"]
- SAMPLE_SIZE =
100
Instance Method Summary collapse
- #find ⇒ Object
-
#find_max_occurence ⇒ Object
Picks the candidate delimiter with the most occurrences in the sample.
-
#find_same_occurence ⇒ Object
High confidence level: picks the delimiter that yields the same number of columns on every row.
-
#find_valid ⇒ Object
Just works: picks the delimiter with which the sample parses as CSV without error.
-
#initialize(file) ⇒ Detector
constructor
A new instance of Detector.
Constructor Details
#initialize(file) ⇒ Detector
Returns a new instance of Detector.
23 24 25 26 27 28 29 30 |
# File 'lib/idata/detector.rb', line 23
# Reads a sample from the head of +file+ and counts how often each common
# delimiter appears in it.
#
# Sets:
#   @file         - the path as given
#   @sample       - up to SAMPLE_SIZE leading lines, converted to UTF-8 with
#                   bytes that have no mapping from binary dropped
#   @sample_lines - @sample split on runs of CR/LF
#   @candidates   - Hash of delimiter => occurrence count, restricted to the
#                   COMMON_DELIMITERS that occur at least once
#
# @param file [String] path of the file to inspect
def initialize(file)
  @file = file

  # Read the sample directly instead of shelling out to `head`: interpolating
  # the path into a shell command breaks on spaces and allows shell injection.
  raw =
    begin
      File.open(@file, 'rb') { |io| io.each_line.first(SAMPLE_SIZE).join }
    rescue SystemCallError => e
      # An unreadable file yields an empty sample (detection then falls back
      # to the default delimiter) rather than aborting construction.
      warn "#{self.class}: cannot read #{@file}: #{e.message}"
      ''
    end

  @sample = raw.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
  @sample_lines = @sample.split(/[\r\n]+/)

  @candidates = {}
  COMMON_DELIMITERS.each do |delim|
    occurrences = @sample.scan(delim).size
    @candidates[delim] = occurrences if occurrences > 0
  end
end
Instance Method Details
#find ⇒ Object
32 33 34 35 |
# File 'lib/idata/detector.rb', line 32
# Determines the delimiter for the sampled file.
#
# The strategies are tried in order and the first non-nil answer wins:
#   1. find_same_occurence
#   2. find_valid
#   3. find_max_occurence
#
# @return the chosen delimiter; DEFAULT_DELIMITER when there are no
#   candidates or when every strategy returns nil
def find
  # for example, file with only one header
  return DEFAULT_DELIMITER if @candidates.empty?

  find_same_occurence || find_valid || find_max_occurence || DEFAULT_DELIMITER
end
#find_max_occurence ⇒ Object
Picks the candidate delimiter with the most occurrences in the sample
67 68 69 70 71 72 |
# File 'lib/idata/detector.rb', line 67
# Picks the candidate delimiter that occurs most often in the sample.
#
# @return the single most frequent delimiter; DEFAULT_DELIMITER when several
#   delimiters tie for the highest count and it is among them; nil otherwise
#   (including when there are no candidates)
def find_max_occurence
  # Compare counts with the highest count itself. The previous code compared
  # each count with a [delimiter, count] pair, which never matched.
  highest = @candidates.values.max
  selected = @candidates.select { |_delim, count| count == highest }.keys

  return selected.first if selected.count == 1

  DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end
#find_same_occurence ⇒ Object
High confidence level: picks the delimiter that yields the same number of columns on every row
53 54 55 56 57 58 59 60 61 62 63 64 |
# File 'lib/idata/detector.rb', line 53
# High-confidence strategy: keeps the candidate delimiters for which the
# sample parses as CSV and every non-empty row has the same number of fields.
#
# @return the single delimiter that qualifies; DEFAULT_DELIMITER when several
#   qualify and it is among them; nil otherwise
def find_same_occurence
  selected = @candidates.keys.select do |delim|
    begin
      rows = CSV.parse(@sample, col_sep: delim).reject(&:empty?)
      rows.map(&:count).uniq.count == 1
    rescue StandardError
      # A sample that does not parse with this delimiter disqualifies it.
      # Only StandardError is rescued so that Interrupt, SystemExit and
      # similar are not swallowed.
      false
    end
  end

  return selected.first if selected.count == 1

  DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end
#find_valid ⇒ Object
Just works: picks the delimiter with which the sample parses as CSV without error
38 39 40 41 42 43 44 45 46 47 48 49 50 |
# File 'lib/idata/detector.rb', line 38
# Lenient strategy: keeps the candidate delimiters with which the sample
# parses as CSV without raising.
#
# @return the single delimiter that parses; DEFAULT_DELIMITER when several
#   parse and it is among them; nil otherwise
def find_valid
  selected = @candidates.keys.select do |delim|
    begin
      CSV.parse(@sample, col_sep: delim)
      true
    rescue StandardError
      # A parse failure disqualifies this delimiter. Only StandardError is
      # rescued so that Interrupt, SystemExit and similar are not swallowed.
      false
    end
  end

  return selected.first if selected.count == 1

  DEFAULT_DELIMITER if selected.include?(DEFAULT_DELIMITER)
end