Class: Masticate::Sniffer

Inherits:
Base
  • Object
show all
Defined in:
lib/masticate/sniffer.rb

Constant Summary collapse

CandidateDelimiters =
[',', '|', "\t", "~"]

Instance Attribute Summary collapse

Attributes inherited from Base

#csv_options, #filename, #input, #input_count, #output, #output_count

Instance Method Summary collapse

Methods inherited from Base

#emit, #execute, #get, #standard_options, #with_input

Constructor Details

#initialize(filename) ⇒ Sniffer

Returns a new instance of Sniffer.



9
10
11
# File 'lib/masticate/sniffer.rb', line 9

# Builds a sniffer for one input.
#
# Only stores the argument in @filename; nothing is opened or read at
# construction time in this method.
#
# @param filename the input to sniff (presumably a file path String —
#   the type is not established here; confirm against Base)
def initialize(filename)
  @filename = filename
end

Instance Attribute Details

#col_sep ⇒ Object (readonly)

Returns the value of attribute col_sep.



4
5
6
# File 'lib/masticate/sniffer.rb', line 4

# Reader for @col_sep.
#
# #sniff assigns @col_sep from the result of #find_col_sep, so this is
# nil until #sniff has run.
#
# @return the column separator chosen by the last #sniff call
def col_sep
  @col_sep
end

#delimstats ⇒ Object (readonly)

Returns the value of attribute delimstats.



5
6
7
# File 'lib/masticate/sniffer.rb', line 5

# Reader for @delimstats.
#
# #find_col_sep resets this to an empty Hash and then fills it with one
# entry per candidate delimiter, each of the form
# { :counts => Set of field counts seen, :quote_char => quote char or nil }.
#
# @return [Hash, nil] per-delimiter statistics, or nil before #find_col_sep ran
def delimstats
  @delimstats
end

#quote_char ⇒ Object (readonly)

Returns the value of attribute quote_char.



4
5
6
# File 'lib/masticate/sniffer.rb', line 4

# Reader for @quote_char.
#
# Note that @quote_char is also used as scratch state while sniffing:
# #consider_delim clears it and #count_fields may set it to '"' for each
# line examined. #sniff finally overwrites it with the value recorded in
# delimstats for the chosen separator.
#
# @return [String, nil] the detected quote character, or nil if none
def quote_char
  @quote_char
end

#stats ⇒ Object (readonly)

Returns the value of attribute stats.



4
5
6
# File 'lib/masticate/sniffer.rb', line 4

# Reader for @stats.
#
# NOTE(review): in the code shown on this page the only assignment to
# @stats is `@stats = stats` inside #sniff, i.e. it is assigned from this
# very reader, so nothing visible here ever gives it a non-nil value.
# Confirm whether it is populated elsewhere.
#
# @return the current value of @stats (nil unless set elsewhere)
def stats
  @stats
end

Instance Method Details

#consider_delim(line, delim) ⇒ Object



44
45
46
47
48
# File 'lib/masticate/sniffer.rb', line 44

# Scores a single line against a single candidate delimiter.
#
# Clears @quote_char first so that whatever #count_fields leaves there
# reflects this line only, then reports both results together.
#
# @param line [String] one line of input
# @param delim [String] the candidate delimiter
# @return [Array] two elements: the field count from #count_fields and the
#   quote character it detected (nil when it detected none)
def consider_delim(line, delim)
  @quote_char = nil
  # Elements are evaluated left to right, so @quote_char is read only
  # after count_fields has had the chance to set it.
  [count_fields(line, delim), @quote_char]
end

#count_fields(line, delim) ⇒ Object



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# File 'lib/masticate/sniffer.rb', line 50

# Counts how many fields +line+ has when split on +delim+.
#
# For every delimiter the baseline is "occurrences of the delimiter + 1".
# For a comma the line is additionally parsed as CSV: when quoting makes
# the parsed field count smaller than the baseline, the line evidently
# contains quoted commas, so @quote_char is set to '"' and the parsed
# count is returned instead.
#
# A comma line that is not valid CSV is scored as 0 fields (and, because
# 0 is below the baseline, @quote_char is set to '"'), exactly as before.
# An empty line, for which CSV.parse_line returns nil, yields the baseline
# count rather than raising NoMethodError on nil.
#
# @param line [String] one line of input
# @param delim [String] the candidate delimiter
# @return [Integer] the number of fields
def count_fields(line, delim)
  straight_count = line.count(delim) + 1
  # Quote-aware counting is only attempted for commas.
  return straight_count unless delim == ','

  parsed = begin
    CSV.parse_line(line)
  rescue CSV::MalformedCSVError
    # this is not valid CSV, e.g. has incorrectly embedded quotes
    []
  end
  # nil means the parser found no row at all (empty input): there is no
  # evidence of quoting, so keep the raw count.
  return straight_count if parsed.nil?

  count_with_quoting = parsed.count
  return straight_count unless count_with_quoting < straight_count

  @quote_char = '"'
  count_with_quoting
end

#find_col_sep ⇒ Object



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# File 'lib/masticate/sniffer.rb', line 25

# Picks the most plausible column separator from CandidateDelimiters.
#
# Calls #get up to ten times inside #with_input, dropping nil results
# (presumably end of input — confirm against Base#get). The first line
# seen is remembered in @line1. Every line is scored against every
# candidate via #consider_delim, and the per-delimiter results are
# accumulated in @delimstats as
# { :counts => Set of field counts, :quote_char => first non-nil quote char }.
#
# The winner is the delimiter whose largest observed field count is
# highest.
#
# @return [String, nil] the chosen delimiter, or nil when no lines could
#   be read (previously this raised NoMethodError on nil)
def find_col_sep
  @delimstats = {}
  with_input do |_input|
    sample = Array.new(10) { get }.compact
    sample.each do |line|
      @line1 ||= line

      CandidateDelimiters.each do |delim|
        tally = (delimstats[delim] ||= { :counts => Set.new, :quote_char => nil })
        fieldcount, quote_char = consider_delim(line, delim)
        tally[:counts] << fieldcount
        # Keep the first quote character detected for this delimiter.
        tally[:quote_char] ||= quote_char
      end
    end
  end
  # Nothing was sampled, so there is nothing to rank.
  return if delimstats.empty?

  # sort_by + last (rather than max_by) is kept on purpose so that the
  # choice among equally-scored delimiters is the same as it always was.
  delimstats.sort_by { |_delim, tally| tally[:counts].max || 0 }.last.first
end

#sniff(opts) ⇒ Object



13
14
15
16
17
18
19
20
21
22
23
# File 'lib/masticate/sniffer.rb', line 13

# Detects the file's column separator, quote character and header names.
#
# Runs #find_col_sep, looks up the quote character recorded for the
# winning separator in delimstats, and derives the header names from the
# first line seen (@line1).
#
# When a quote character was detected, the header line is parsed as CSV
# with that separator and quote character so that a quoted header
# containing the separator stays one column and loses its surrounding
# quotes. If the header line itself is not valid CSV, or no quote
# character was detected, the line is split on the separator as before.
# Header names are stripped of surrounding whitespace.
#
# @param opts [Hash] options; only opts[:stats] is consulted here
# @return [Hash] with keys :col_sep, :quote_char, :field_counts and :headers
def sniff(opts)
  @col_sep = find_col_sep
  @quote_char = delimstats[@col_sep][:quote_char]
  # NOTE(review): `stats` is the reader for @stats, so as shown this
  # assigns @stats to itself. Left untouched; confirm the intended source.
  @stats = stats if opts[:stats]

  raw_headers =
    if @quote_char
      begin
        # parse_line returns nil for an empty line; treat that as no headers.
        CSV.parse_line(@line1, :col_sep => @col_sep, :quote_char => @quote_char) || []
      rescue CSV::MalformedCSVError
        @line1.split(@col_sep)
      end
    else
      @line1.split(@col_sep)
    end

  {
    :col_sep => @col_sep,
    :quote_char => @quote_char,
    :field_counts => @stats,
    # CSV yields nil for empty fields, hence to_s before strip.
    :headers => raw_headers.map { |name| name.to_s.strip }
  }
end