Class: CsvUtils
- Inherits:
-
Object
- Object
- CsvUtils
- Defined in:
- lib/csvutils/version.rb,
lib/csvutils/cut.rb,
lib/csvutils/head.rb,
lib/csvutils/stat.rb,
lib/csvutils/test.rb,
lib/csvutils/split.rb,
lib/csvutils/utils.rb,
lib/csvutils/header.rb
Overview
note: for now CsvUtils is a class!!! NOT a module - change - why? why not?
Constant Summary collapse
- MAJOR =
todo: namespace inside version or something - why? why not??
0
- MINOR =
3
- PATCH =
0
- VERSION =
[MAJOR,MINOR,PATCH].join('.')
Class Method Summary collapse
- .banner ⇒ Object
-
.csv_row(*values, sep: ',') ⇒ Object
(simple) helper for “csv-encoding” values / row.
- .cut(path, *columns, output: path, sep: ',') ⇒ Object
-
.head(path, sep: ',', n: 4) ⇒ Object
test or dry run to check if rows can get read/scanned.
-
.header(path, sep: ',', debug: false) ⇒ Object
use header or headers - or use both (with alias)?.
-
.pp_header(headers) ⇒ Object
check: rename to print_headers or prettyprint_header - why? why not?.
- .root ⇒ Object
- .split(path, *columns, sep: ',', &blk) ⇒ Object
- .split_write(inpath, values, chunk, sep:) ⇒ Object
- .stat(path, *columns, sep: ',', debug: false) ⇒ Object
-
.test(path, sep: ',') ⇒ Object
test or dry run to check if rows can get read/scanned.
- .version ⇒ Object
Class Method Details
.banner ⇒ Object
16 17 18 |
# File 'lib/csvutils/version.rb', line 16 def self. "csvutils/#{VERSION} on Ruby #{RUBY_VERSION} (#{RUBY_RELEASE_DATE}) [#{RUBY_PLATFORM}]" end |
.csv_row(*values, sep: ',') ⇒ Object
(simple) helper for “csv-encoding” values / row
todo: check for newline in value too? why? why not?
18 19 20 21 22 23 24 25 26 27 |
# File 'lib/csvutils/utils.rb', line 18 def self.csv_row( *values, sep: ',' ) values.map do |value| if value && (value.index( sep ) || value.index('"')) ## double quotes and enclose in double qoutes value = %Q{"#{value.gsub('"', '""')}"} else value end end end |
.cut(path, *columns, output: path, sep: ',') ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/csvutils/cut.rb', line 8 def self.cut( path, *columns, output: path, sep: ',' ) inpath = path outpath = output # note: output defaults to inpath (overwrites datafile in-place!!!) puts "cvscut in: >#{inpath}< out: >#{outpath}<" ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] puts "columns:" pp columns = { sep: sep } recs = CsvHash.read( inpath, ) ## for convenience - make sure parent folders/directories exist FileUtils.mkdir_p( File.dirname( outpath )) unless Dir.exists?( File.dirname( outpath )) ## note: ## todo/fix: add two trailing spaces for pretty printing - why? why not? File.open( outpath, 'w:utf-8' ) do |out| out << csv_row( *columns, sep: sep ).join( sep ) ## for row add headers/columns out << "\n" recs.each do |rec| values = columns.map { |col| rec[col] } ## find data for column out << csv_row( *values, sep: sep ).join( sep ) out << "\n" end end puts 'Done.' end |
.head(path, sep: ',', n: 4) ⇒ Object
test or dry run to check if rows can get read/scanned
7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
# File 'lib/csvutils/head.rb', line 7 def self.head( path, sep: ',', n: 4 ) i = 0 = { sep: sep } CsvHash.foreach( path, ) do |rec| i += 1 pp rec break if i >= n end puts " #{i} records" end |
.header(path, sep: ',', debug: false) ⇒ Object
use header or headers - or use both (with alias)?
6 7 8 9 10 11 12 13 14 |
# File 'lib/csvutils/header.rb', line 6 def self.header( path, sep: ',', debug: false ) ## use header or headers - or use both (with alias)? row = CsvReader.header( path, sep: sep ) pp row if debug ## e.g.: # "Country,League,Season,Date,Time,Home,Away,HG,AG,Res,PH,PD,PA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA\n" row end |
.pp_header(headers) ⇒ Object
check: rename to print_headers or prettyprint_header - why? why not?
6 7 8 9 10 11 |
# File 'lib/csvutils/utils.rb', line 6 def self.pp_header( headers ) ## check: rename to print_headers or prettyprint_header - why? why not? puts "#{headers.size} columns:" headers.each_with_index do |header,i| puts " #{i+1}: #{header}" end end |
.root ⇒ Object
20 21 22 |
# File 'lib/csvutils/version.rb', line 20 def self.root File.( File.dirname(File.dirname(File.dirname(__FILE__))) ) end |
.split(path, *columns, sep: ',', &blk) ⇒ Object
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
# File 'lib/csvutils/split.rb', line 8 def self.split( path, *columns, sep: ',', &blk ) puts "cvssplit in: >#{path}<" ## ["Date", "HomeTeam", "AwayTeam", "FTHG", "FTAG", "HTHG", "HTAG"] puts "columns:" pp columns ## note: do NOT use headers ## for easy sorting use "plain" array of array for records = { sep: sep } data = CsvReader.read( path, ) ## todo/check: (auto-) strip (remove all leading and trailing spaces) ## from all values - why? why not? ## check if CSV.parse has an option for it? headers = data.shift ## remove top array item (that is, row with headers) header_mapping = {} headers.each_with_index { | header,i | header_mapping[header]=i } pp header_mapping ## map columns to array indices e.g. ['Season', 'Div'] => [1,2] column_indices = columns.map { |col| header_mapping[col] } pp column_indices ################################################### ## note: sort data by columns (before split) data = data.sort do |row1,row2| res = 0 column_indices.each do |col| res = row1[col] <=> row2[col] if res == 0 end res end chunk = [] data.each_with_index do |row,i| chunk << row next_row = data[i+1] changed = false if next_row.nil? ## end-of-file changed = true else column_indices.each do |col| if row[col] != next_row[col] changed = true break ## out of each column_indices loop end end end if changed puts "save new chunk:" column_values = column_indices.map {|col| row[col] } pp column_values # note: add header(s) row upfront (as first row) to chunk (with unshift) chunk_with_headers = chunk.unshift( headers ) if blk yield( column_values, chunk_with_headers ) else ## auto-save (write-to-file) by default - why? why not? split_write( path, column_values, chunk_with_headers, sep: sep ) end chunk = [] ## reset chunk for next batch of records end end puts 'Done.' end |
.split_write(inpath, values, chunk, sep:) ⇒ Object
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/csvutils/split.rb', line 87 def self.split_write( inpath, values, chunk, sep: ) basename = File.basename( inpath, '.*' ) dirname = File.dirname( inpath ) ## check/change invalid filename chars ## e.g. change 1990/91 to 1990-91 extraname = values.map {|value| value.tr('/','-')}.join('~') outpath = "#{dirname}/#{basename}_#{extraname}.csv" puts "saving >#{basename}_#{extraname}.csv<..." File.open( outpath, 'w:utf-8' ) do |out| chunk.each do |row| out << csv_row( *row, sep: sep ).join( sep ) out << "\n" end end end |
.stat(path, *columns, sep: ',', debug: false) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# File 'lib/csvutils/stat.rb', line 6 def self.stat( path, *columns, sep: ',', debug: false ) = { sep: sep } values = {} nulls = {} # check 1) nulls/nils (e.g. empty strings ""), # 2) not/appliation or available n/a NA or NaN or ... # 3) missing - e.g. ? i=0 CsvHash.foreach( path, ) do |rec| i += 1 pp rec if i == 1 && debug print '.' if i % 100 == 0 ## collect unique values for passed in columns columns.each do |col| value = rec[col] ## note: value might be nil!!!!! values[col] ||= Hash.new(0) values[col][ value ? value : '<nil>' ] +=1 end ## alway track nulls - why? why not rec.each do |col,value| ## if value.nil? ## todo/check - nil value possible (not always empty string - why? why not?) ## puts "[debug] nil value in row:" ## puts "#{col} = #{value.inspect} : #{value.class.name}" ## end if value.nil? nulls[col] ||= Hash.new(0) nulls[col]['nil'] +=1 elsif value.empty? nulls[col] ||= Hash.new(0) nulls[col]['empty'] +=1 elsif ['na', 'n/a', '-'].include?( value.downcase ) nulls[col] ||= Hash.new(0) nulls[col]['na'] +=1 elsif value == '?' ## check for (?) e.g. value.include?( '(?)') - why? why not? nulls[col] ||= Hash.new(0) nulls[col]['?'] +=1 else # do nothing; "regular" value end end end puts " #{i} rows" puts puts " nils/nulls :: empty strings :: na / n/a / undefined :: missing (?):" puts " #{nulls.inspect}" puts ## dump headers first (first row with names of columns) headers = header( path, sep: sep, debug: debug ) pp_header( headers ) ## pretty print header columns puts if values.any? ## pretty print (pp) / dump unique values for passed in columns values.each do |col,h| puts " column >#{col}< #{h.size} unique values:" ## sort by name/value for now (not frequency) - change - why? why not? sorted_values = h.to_a.sort {|l,r| l[0] <=> r[0] } sorted_values.each do |rec| puts " #{rec[1]} x #{rec[0]}" end end end end |
.test(path, sep: ',') ⇒ Object
test or dry run to check if rows can get read/scanned
7 8 9 10 11 12 13 14 15 16 17 |
# File 'lib/csvutils/test.rb', line 7 def self.test( path, sep: ',' ) i = 0 = { sep: sep } CsvHash.foreach( path, ) do |rec| i += 1 print '.' if i % 100 == 0 end puts " #{i} rows" end |
.version ⇒ Object
12 13 14 |
# File 'lib/csvutils/version.rb', line 12 def self.version VERSION end |