Class: Masticate::Mender

Inherits:
Base
  • Object
show all
Defined in:
lib/masticate/mender.rb

Overview

Repairs delimited input files.

A row that contains fewer delimiters than expected is assumed to have been split across two or more lines (because of a newline embedded in a field). Those lines are glued back together, separated by a single space, and written as one line in the output.

Instance Attribute Summary

Attributes inherited from Base

#csv_options, #filename, #input, #input_count, #output, #output_count

Instance Method Summary collapse

Methods inherited from Base

#emit, #execute, #get, #initialize, #standard_options, #with_input

Constructor Details

This class inherits a constructor from Masticate::Base

Instance Method Details

#configure(opts) ⇒ Object



7
8
9
10
11
12
13
14
15
16
17
# File 'lib/masticate/mender.rb', line 7

# Set up the mender for a run.
#
# Applies the shared options via #standard_options, remembers the
# mender-specific switches, and resets the per-run parsing state that
# #crunch relies on.
#
# @param opts [Hash] run options; reads :inlined, :snip, :dejunk and :buried
# @return [String] the fresh, empty hold buffer
def configure(opts)
  standard_options(opts)

  # Mender-specific switches, read straight from the options.
  @inlined, @snip = opts[:inlined], opts[:snip]
  @dejunk, @buried = opts[:dejunk], opts[:buried]

  # Width of a complete row is unknown until the header row has been seen.
  @expected_field_count = nil
  # Text of short lines waiting to be glued onto the lines that follow.
  @holding = ''
end

#crunch(row, line = '', csv_options = {}) ⇒ Object



23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/masticate/mender.rb', line 23

# Turn one parsed input row into the row that should be emitted, if any.
#
# Behaviour depends on the configured mode and on what has been seen so far:
#
# * Inlined mode (@inlined set): the first half of each row is treated as
#   the header names and the second half as the values. The headers are
#   emitted once, every later row must repeat them exactly, and only the
#   values are returned.
# * First row (no @headers yet): the row is trusted as the header row. When
#   @snip is an Integer that many leading cells are dropped, the @buried
#   column (if any) is resolved to an index, and the expected field count
#   is recorded.
# * Later rows: +line+ is appended to the text held over from earlier short
#   lines (joined by a single space) and the combined text is re-parsed.
#   A result with too few fields is held back until more text arrives.
#
# @param row [Array<String, nil>, nil] cells of the current line as parsed by the caller
# @param line [String] raw text of the current line
# @param csv_options [Hash] options passed on to CSV.parse_line
# @return [Array<String, nil>, nil] the row to emit, or nil when nothing
#   should be emitted (incomplete or junky row)
# @raise [RuntimeError] on an inlined header mismatch, an unsupported @snip
#   value, or a @buried column name that is not among the headers
def crunch(row, line = '', csv_options = {})
  if @inlined
    if row
      # Index of the last header cell: headers fill the first half of the row.
      ncells = row.count/2-1
      if !@headers
        @headers = row[0..ncells]
        @expected_field_count = @headers.count
        emit(@headers)
      else
        if row[0..ncells] != @headers
          raise "Header mismatch on line #{@input_count}\n  Expected: #{@headers.join(',')}\n     Found: #{row[0..ncells].join(',')}"
        end
      end
      row = row[ncells+1, @expected_field_count]
    end
  elsif !@headers
    # trust the first row
    @headers = row
    case @snip
    when Integer # Fixnum no longer exists on current Rubies
      @headers.shift(@snip)
    when String
      raise "TODO: snip named header. Multiple?"
    when nil
      # do nothing
    else
      raise "Do not understand snip instruction [#{@snip.inspect}]"
    end

    if @buried
      # A number (or numeric string) is a column index; anything else is a
      # header name to look up.
      if @buried.is_a?(Integer) || @buried =~ /^\d+/
        @buried_index = @buried.to_i
      else
        @buried_index = row.index(@buried)
        raise "Unable to find column '#{@buried}'" unless @buried_index
      end
    end

    @expected_field_count = @headers.count
    row = @headers
  elsif row
    @holding << ' ' unless @holding.empty?
    @holding << line

    # Splat the options: CSV.parse_line only accepts them as keywords on
    # current Rubies.
    row = CSV.parse_line(@holding, **(csv_options || {}))
    row = row.map { |s| s && s.strip } if row

    if row.nil? || row.count < @expected_field_count
      # incomplete row; keep the text in @holding and do not emit anything
      row = nil
    else
      @holding = ''
    end

    if row && @buried && row.count > @expected_field_count
      # buried delimiter
      # take the N+1th field and merge it onto the Nth field, moving up the remaining fields
      # (to_s because an empty CSV field is parsed as nil)
      row[@buried_index] = row[@buried_index].to_s + row.delete_at(@buried_index + 1).to_s
    end

    if @dejunk && row && row.count { |s| s && !s.strip.empty? } <= 2
      # junky row, suppress output
      nil
    else
      row
    end
  end
end

#explode(line) ⇒ Object



97
98
99
# File 'lib/masticate/mender.rb', line 97

# Split a raw line into its cells using the configured column separator
# and quote character, stripping surrounding whitespace from every cell.
#
# @param line [String] raw text of one input line
# @return [Array<String, nil>] the stripped cells (empty fields are nil);
#   an empty array when the line holds no row at all
def explode(line)
  # CSV.parse_line returns nil for an empty line; treat that as "no cells".
  cells = CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char) || []
  cells.map { |s| s && s.strip }
end

#fieldcount(line) ⇒ Object



93
94
95
# File 'lib/masticate/mender.rb', line 93

# Number of cells the line splits into (see #explode).
#
# @param line [String] raw text of one input line
# @return [Integer]
def fieldcount(line)
  cells = explode(line)
  cells.count
end

#junky?(line) ⇒ Boolean

A line is “junky” if at most two of its fields contain any non-whitespace content.

Returns:

  • (Boolean)


102
103
104
# File 'lib/masticate/mender.rb', line 102

# Whether the line carries too little data to be worth keeping: at most
# two of its cells hold any non-whitespace content.
#
# @param line [String] raw text of one input line
# @return [Boolean]
def junky?(line)
  filled = explode(line).count { |cell| cell && !cell.strip.empty? }
  filled <= 2
end

#mend(opts) ⇒ Object



19
20
21
# File 'lib/masticate/mender.rb', line 19

# Run the mender with the given options.
#
# Delegates directly to #execute (inherited from Masticate::Base) and
# returns whatever it returns.
#
# @param opts [Object] options handed unchanged to #execute — presumably
#   the same options hash that #configure reads; confirm against Base
def mend(opts)
  execute(opts)
end