Class: Masticate::Mender
Overview
Repairs delimited input files.
A row that contains fewer delimiters than expected is assumed to have been split across two lines (due to a newline embedded in a field). Those two lines are glued back into a single line in the output.
Instance Attribute Summary
Attributes inherited from Base
#csv_options, #filename, #input, #input_count, #output, #output_count
Instance Method Summary collapse
- #configure(opts) ⇒ Object
- #crunch(row, line = '', csv_options = {}) ⇒ Object
- #explode(line) ⇒ Object
- #fieldcount(line) ⇒ Object
- #junky?(line) ⇒ Boolean
A line is “junky” if it has 2 or fewer fields with any content.
- #mend(opts) ⇒ Object
Methods inherited from Base
#emit, #execute, #get, #initialize, #standard_options, #with_input
Constructor Details
This class inherits a constructor from Masticate::Base
Instance Method Details
#configure(opts) ⇒ Object
7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/masticate/mender.rb', line 7
#
# Stores the mender-specific options and resets the per-run parsing state.
#
# @param opts [Hash] options; the keys read here are :inlined, :snip,
#   :dejunk and :buried
# @return [String] the fresh, empty holding buffer (value of the last assignment)
def configure(opts)
  # NOTE(review): the published listing shows a bare "(opts)" on this line,
  # which is a no-op. It is restored here as the inherited standard_options
  # call (the only Base method that fits) -- confirm against the real source.
  standard_options(opts)

  @inlined = opts[:inlined]
  @snip = opts[:snip]
  @dejunk = opts[:dejunk]
  @buried = opts[:buried]

  # Learned from the first (header) row seen by #crunch.
  @expected_field_count = nil
  # Accumulates the text of a row that has been split across physical lines.
  @holding = ''
end
#crunch(row, line = '', csv_options = {}) ⇒ Object
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/masticate/mender.rb', line 23
#
# Processes one input row and returns the row to output, or nil when nothing
# should be output for this call.
#
# Three modes, chosen in this order:
# * @inlined set: the row is split at ncells = row.count/2 - 1. The leading
#   cells are treated as headers (emitted the first time, compared on every
#   later row) and the cells after them are returned as the data row.
# * no headers seen yet: the row is trusted as the header row.
# * otherwise: +line+ is appended to a holding buffer which is re-parsed;
#   a row with too few fields is held back until later lines complete it.
#
# @param row [Array<String, nil>, nil] the already-parsed fields of the line
# @param line [String] the raw text of the line (used only in the mending mode)
# @param csv_options [Hash] options forwarded to CSV.parse_line
# @return [Array<String, nil>, nil] the mended row, or nil to suppress output
# @raise [RuntimeError] on an inlined header mismatch, an unsupported @snip
#   value, or a @buried column name that is not among the headers
def crunch(row, line = '', csv_options = {})
  if @inlined
    if row
      ncells = row.count / 2 - 1
      if !@headers
        @headers = row[0..ncells]
        @expected_field_count = @headers.count
        emit(@headers)
      elsif row[0..ncells] != @headers
        raise "Header mismatch on line #{@input_count}\n Expected: #{@headers.join(',')}\n Found: #{row[0..ncells].join(',')}"
      end
      row = row[ncells + 1, @expected_field_count]
    end
  elsif !@headers
    # trust the first row
    @headers = row
    case @snip
    when Integer # Fixnum was removed in Ruby 3.2; Integer matches on every version
      @headers.shift(@snip)
    when String
      raise "TODO: snip named header. Multiple?"
    when nil
      # do nothing
    else
      raise "Do not understand snip instruction [#{@snip.inspect}]"
    end
    if @buried
      if @buried.is_a?(Integer) || @buried =~ /^\d+/
        @buried_index = @buried.to_i
      else
        # row is the same array as @headers, so any snipped columns are already gone
        @buried_index = row.index(@buried) or raise "Unable to find column '#{@buried}'"
      end
    end
    @expected_field_count = @headers.count
    row = @headers
  elsif row
    # Join this line onto whatever is left over from an incomplete row.
    @holding << ' ' unless @holding.empty?
    @holding << line
    # Double-splat so the options hash is accepted as keywords on Ruby 3+.
    row = CSV.parse_line(@holding, **csv_options)
    row &&= row.map { |s| s && s.strip }

    if row.nil? || row.count < @expected_field_count
      # incomplete (or empty) row; do not emit anything
      row = nil
    else
      @holding = ''
    end

    if row && @buried && row.count > @expected_field_count
      # buried delimiter
      # take the N+1th field and merge it onto the Nth field, moving up the remaining fields
      row[@buried_index] += row.delete_at(@buried_index + 1)
    end

    if @dejunk && row && row.count { |s| s && !s.strip.empty? } <= 2
      # junky row, suppress output
      nil
    else
      row
    end
  end
end
#explode(line) ⇒ Object
97 98 99 |
# File 'lib/masticate/mender.rb', line 97
#
# Splits a raw line into its fields using @col_sep and @quote_char, stripping
# surrounding whitespace from every non-nil field.
#
# @param line [String] one raw line of delimited text
# @return [Array<String, nil>] the stripped fields; empty when the line
#   contains nothing to parse
def explode(line)
  # CSV.parse_line returns nil for an empty line; treat that as "no fields".
  fields = CSV.parse_line(line, :col_sep => @col_sep, :quote_char => @quote_char) || []
  fields.map { |s| s && s.strip }
end
#fieldcount(line) ⇒ Object
93 94 95 |
# File 'lib/masticate/mender.rb', line 93
#
# Counts the fields that #explode finds in +line+.
#
# @param line [String] one raw line of delimited text
# @return [Integer] number of fields, including empty ones
def fieldcount(line)
  fields = explode(line)
  fields.count
end
#junky?(line) ⇒ Boolean
A line is “junky” if it has 2 or fewer fields with any content.
102 103 104 |
# File 'lib/masticate/mender.rb', line 102
#
# Tells whether +line+ is "junky": at most two of its fields contain anything
# other than whitespace.
#
# @param line [String] one raw line of delimited text
# @return [Boolean] true when 2 or fewer fields have content
def junky?(line)
  populated = explode(line).reject { |s| !s || s.strip.empty? }
  populated.count <= 2
end
#mend(opts) ⇒ Object
19 20 21 |
# File 'lib/masticate/mender.rb', line 19
#
# Entry point for mending: hands +opts+ straight to the inherited #execute.
#
# @param opts [Hash] options, passed through unchanged
# @return [Object] whatever #execute returns
def mend(opts)
  execute(opts)
end