Class: Censive

Inherits:
StringScanner
  • Object
show all
Defined in:
lib/censive.rb

Constant Summary collapse

VERSION =
"1.0.3"

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(str = nil, drop: false, encoding: nil, excel: false, mode: :compact, out: nil, quote: '"', relax: false, rowsep: "\n", sep: ",", strip: false, **opts) {|_self| ... } ⇒ Censive

Returns a new instance of Censive.

Yields:

  • (_self)

Yield Parameters:

  • _self (Censive)

    the object that the method was called on



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# File 'lib/censive.rb', line 48

# Builds a CSV reader/writer on top of StringScanner.
#
# When +str+ is shorter than 100 characters and names a readable,
# non-directory path, the contents of that file become the data to scan;
# otherwise +str+ itself (or "" when nil) is the data.
#
# @param str [String, nil] CSV text, or the path of a file holding it
# @param drop [Boolean] drop trailing empty columns when writing
# @param encoding [String, Encoding, nil] encoding used to read the file or
#   to transcode +str+
# @param excel [Boolean] handle Excel literals such as ="01"
# @param mode [Symbol] output mode, :compact or :full
# @param out [#<<, nil] output stream; $stdout when nil
# @param quote [String] quote character
# @param relax [Boolean] tolerate stray quotes inside quoted cells
# @param rowsep [String] row separator used for output
# @param sep [String] column separator character
# @param strip [Boolean] strip cells when reading
# @param opts [Hash] accepted for call-site convenience; not read here
# @yield [self] the new instance, once fully configured
def initialize(str=nil,
  drop:     false   , # drop trailing empty columns?
  encoding: nil     , # character encoding
  excel:    false   , # literals ="01" formulas =A1 + B2 http://bit.ly/3Y7jIvc
  mode:     :compact, # output mode: compact or full
  out:      nil     , # output stream, needs to respond to <<
  quote:    '"'     , # quote character
  relax:    false   , # relax quote parsing so ,"Fo"o, => ,"Fo""o",
  rowsep:   "\n"    , # row separator for output
  sep:      ","     , # column separator character
  strip:    false   , # strip columns when reading
  **opts              # grab bag
)
  # initialize data source
  if str && str.size < 100 && File.readable?(str) && !File.directory?(str)
    # block form closes the handle as soon as the contents are read
    str = File.open(str, encoding ? "r:#{encoding}" : "r") { |file| file.read }
  else
    str ||= ""
    str = str.encode(encoding) if encoding
  end
  super(str)
  reset

  # config options
  @cheat    = true
  @drop     = drop
  @encoding = str.encoding
  @excel    = excel
  @mode     = mode
  @out      = out || $stdout
  @relax    = relax
  @strip    = strip

  # config strings
  @quote    = quote
  @rowsep   = rowsep
  @sep      = sep

  # static strings
  @cr       = "\r"
  @lf       = "\n"
  @es       = ""
  @eq       = "="

  # combinations
  @esc      = (@quote * 2)
  @seq      = [@sep, @eq].join # used for parsing in excel mode

  # regexes
  #
  # These are deliberately built without the /o flag: /o interpolates a regex
  # literal only the first time it is evaluated, so a later instance created
  # with a different separator or quote would silently reuse the first
  # instance's patterns.
  xsep      = Regexp.escape(@sep)   # separator, safe to embed in a pattern
  xquote    = Regexp.escape(@quote) # quote, safe to embed in a pattern
  @eoc      = /(?=#{xsep}|#{@cr}|#{@lf}|\z)/ # end of cell
  @eol      = /#{@cr}#{@lf}?|#{@lf}/         # end of line
  @escapes  = /(#{xquote})|#{xsep}|#{@cr}|#{@lf}/
  @quotable = /#{xsep}|#{@cr}|#{@lf}/
  @quotes   = /#{xquote}/
  @seps     = /#{xsep}+/
  @quoted   = @excel ? /(?:=)?#{xquote}/ : @quote
  @unquoted = /[^#{xsep}#{@cr}#{@lf}][^#{xquote}#{@cr}#{@lf}]*/
  @leadzero = /\A0\d*\z/

  yield self if block_given?
end

Instance Attribute Details

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



34
35
36
# File 'lib/censive.rb', line 34

# Encoding of the string being scanned, as recorded by the constructor and
# by #reset.
#
# @return [Encoding, nil] the current value of @encoding
def encoding; @encoding; end

#out ⇒ Object (readonly)

Returns the value of attribute out.



34
35
36
# File 'lib/censive.rb', line 34

# Output stream that written rows are appended to.
#
# @return [Object, nil] the current value of @out
def out; @out; end

#rows ⇒ Object (readonly)

Returns the value of attribute rows.



34
35
36
# File 'lib/censive.rb', line 34

# Rows collected by #parse; nil until a parse has run (and again after #reset).
#
# @return [Array<Array>, nil] the current value of @rows
def rows; @rows; end

Class Method Details

.parse(...) ⇒ Object



36
37
38
# File 'lib/censive.rb', line 36

# Convenience constructor: builds an instance from the given arguments and
# parses it straight away.
#
# All arguments are forwarded untouched to .new.
#
# @return [Object] whatever #parse returns on the new instance
def self.parse(...)
  parser = new(...)
  parser.parse
end

.writer(obj = nil, **opts, &code) ⇒ Object



40
41
42
43
44
45
46
# File 'lib/censive.rb', line 40

# Creates an instance configured for writing.
#
# @param obj [String, StringIO, IO, nil] a String is treated as a path that is
#   opened for writing for the duration of the call; a StringIO, IO or nil is
#   handed to the constructor as the output stream
# @param opts [Hash] extra keyword options forwarded to .new
# @param code [Proc] block forwarded to .new
# @return [Object] the value returned by .new
#
# Any other kind of +obj+ aborts the program with an error message.
def self.writer(obj=nil, **opts, &code)
  if obj.is_a?(String)
    return File.open(obj, "w") do |file|
      new(out: file, **opts, &code)
    end
  end

  stream_like = obj.is_a?(StringIO) || obj.is_a?(IO) || obj.nil?
  return new(out: obj, **opts, &code) if stream_like

  abort "#{File.basename($0)}: invalid #{obj.class} object in writer"
end

Instance Method Details

#<<(row) ⇒ Object

output a row



211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# File 'lib/censive.rb', line 211

# Formats one row as a line of CSV and appends it to the output stream.
#
# Every cell is rendered with #to_s first, so nil (an empty cell) and
# non-String values such as numbers are accepted in every mode. The rendering
# produces a fresh array, so the caller's row is never modified.
#
# Quoting depends on @mode:
#   :compact - cells are quoted/escaped only when #grok reports they need it
#   :full    - every cell is quoted, with embedded quotes doubled
# When @excel is set, cells matching @leadzero are written as ="..." instead.
#
# @param row [Array] the cells of the row
# @return [Object] whatever appending to @out returns
def <<(row)
  cells = row.map(&:to_s)

  # drop trailing empty columns; safe navigation ends the loop cleanly once
  # an all-empty row has been emptied out
  if @drop
    cells.pop while cells.last&.empty?
  end

  s, q = @sep, @quote
  line = case @mode
  when :compact
    # one scan of the joined row decides the cheapest strategy for the whole
    # row; excel mode always takes the per-cell route
    case @excel ? 2 : grok(cells.join)
    when 0
      cells
    when 1
      cells.map { |col| col.match?(@quotable) ? "#{q}#{col}#{q}" : col }
    else
      cells.map do |col|
        if @excel && col.match?(@leadzero)
          "=#{q}#{col}#{q}"
        else
          case grok(col)
          when 0 then col
          when 1 then "#{q}#{col}#{q}"
          else        "#{q}#{col.gsub(q, @esc)}#{q}"
          end
        end
      end
    end
  when :full
    cells.map do |col|
      if @excel && col.match?(@leadzero)
        "=#{q}#{col}#{q}"
      else
        "#{q}#{col.gsub(q, @esc)}#{q}"
      end
    end
  end.join(s)

  @out << line + @rowsep
end

#bomb(msg) ⇒ Object



257
258
259
# File 'lib/censive.rb', line 257

# Aborts the program with a parse error that pinpoints the scan position.
#
# The message shows up to seven characters of context: four before the scan
# pointer and three from it onwards. The window start is clamped to the
# beginning of the input, because a negative start index would otherwise
# select text from the END of the string. The character position (not the
# byte offset) is used, since String#[] indexes by character.
#
# @param msg [String] description of the problem
# @return [void] never returns; Kernel#abort terminates the process
def bomb(msg)
  at   = charpos
  from = [at - 4, 0].max
  near = string[from, at + 3 - from]
  abort "\n#{File.basename($0)}: #{msg} at character #{at} near '#{near}'"
end

#each ⇒ Object



184
185
186
187
# File 'lib/censive.rb', line 184

# Yields each parsed row in order, parsing the input first if that has not
# happened yet.
#
# @yieldparam row [Array] one row of cells
# @return [Array<Array>] the parsed rows, when a block is given
# @return [Enumerator] when called without a block
def each
  return to_enum(:each) unless block_given?

  parse unless @rows
  @rows.each { |row| yield row }
end

#grok(str) ⇒ Object

returns 2 (must be quoted and escaped), 1 (must be quoted), 0 (neither)



202
203
204
205
206
207
208
# File 'lib/censive.rb', line 202

# Classifies how a string has to be written to stay valid CSV.
#
# @param str [String, nil] the text to inspect
# @return [Integer] 0 when nothing in +str+ matches @escapes (or +str+ is
#   nil), 1 when it must be quoted, 2 when it must be quoted and its quotes
#   escaped
def grok(str)
  hit = str&.index(@escapes)
  return 0 unless hit

  # capture group 1 of @escapes is set only when the first hit is a quote
  return 2 if Regexp.last_match(1)

  # first hit was a separator or line break: a quote may still follow it
  str.index(@quote, hit) ? 2 : 1
end

#next_row ⇒ Object



133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# File 'lib/censive.rb', line 133

# Reads the next row from the input.
#
# Two strategies are used:
#   * fast path (while @cheat is set): take the text up to and including the
#     next line ending and simply split it on the separator
#   * tokenizer: assemble the row from successive #next_token results
#
# Returns an Array of cells, or nil when #next_token has nothing to offer.
def next_row
  # a final line without a line ending is not matched here and therefore
  # falls through to the tokenizer below
  if @cheat and line = scan_until(@eol)
    row = line.chomp!.split(@sep, -1) # -1 keeps trailing empty cells
    row.each do |col|
      # cells without any quote are already final
      next if (saw = col.count(@quote)).zero?
      # exactly two quotes wrapping the cell: unwrap it in place
      next if (saw == 2) && col.delete_prefix!(@quote) && col.delete_suffix!(@quote)
      # anything else is too complex for a plain split; the fast path is
      # switched off and nothing in this method switches it back on
      @cheat = false
      break
    end if line.include?(@quote) # only inspect cells when the line has a quote
    @cheat and return @strip ? row.each(&:strip!) : row
    unscan # rewind to before the line so the tokenizer re-reads it
  end

  token = next_token or return
  row = []
  # a token is either one cell (String) or several cells (Array); the splat
  # handles both
  row.push(*token)
  row.push(*token) while token = next_token
  row
end

#next_token ⇒ Object



153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
# File 'lib/censive.rb', line 153

# Scans the next piece of the current row.
#
# Returns a String for a quoted cell or a single empty cell, an Array of
# Strings for a run of unquoted or empty cells, or nil when no cell starts at
# the scan position (a line ending there, if any, is consumed).
#
# Malformed input ends the program through #bomb.
def next_token
  if scan(@quoted) # quoted cell
    token = ""
    while true
      # take everything up to the next quote, leaving off the quote itself
      token << (scan_until(@quotes) or bomb "unclosed quote")[0..-2]
      # a second quote right behind it is an escaped quote: keep one, go on
      token << @quote and next if scan(@quote)
      # @eoc is a pure lookahead, so this only tests for the end of the cell
      scan(@eoc) and break
      @relax or bomb "invalid character after quote"
      # relaxed mode: keep the quote as literal text and append everything
      # through the next quote (that quote included)
      token << @quote + (scan_until(@quotes) or bomb "bad inline quote")
      scan(@eoc) and break
    end
    scan(@sep) # step over the separator that ends the cell, if there is one
    @strip ? token.strip : token
  elsif match = scan(@unquoted) # unquoted cell(s)
    # the match stops at a quote or line break, so it may hold several cells;
    # a trailing separator means the run ended cleanly before a quoted cell
    if check(@quote) && !match.chomp!(@sep) # if we see a stray quote
      # in excel mode a trailing separator plus "=" (@seq) belongs to the
      # literal that follows and is cut off instead
      unless @excel && match.chomp!(@seq) # unless an excel literal, fix it
        match << (scan_until(@eoc) or bomb "stray quote")
        scan(@sep)
      end
    end
    tokens = match.split(@sep, -1) # -1 keeps trailing empty cells
    @strip ? tokens.map!(&:strip) : tokens
  elsif scan(@sep)
    # a separator at the start of a cell means the cell is empty; directly
    # following separators are consumed in the same step
    match = scan(@seps)
    # note: the single-cell case returns the shared @es string itself
    match ? match.split(@sep, -1) : @es
  else
    # nothing that starts a cell: consume the line ending, if any
    scan(@eol)
    nil
  end
end

#parse ⇒ Object

==[ Parser ]==



122
123
124
125
126
127
128
129
130
131
# File 'lib/censive.rb', line 122

# Reads rows until #next_row runs dry, collecting them in @rows and keeping
# the running statistics: @cols holds the widest row seen, @cells the total
# number of cells.
#
# @return [self]
def parse
  @rows = []
  while (row = next_row)
    @rows.push(row)
    width   = row.size
    @cols   = [@cols, width].max
    @cells += width
  end
  self
end

#reset(str = nil) ⇒ Object



111
112
113
114
115
116
117
118
# File 'lib/censive.rb', line 111

# Returns the parser to its initial state, optionally switching to new input.
#
# Parsed rows and statistics are discarded. The fast path flag (@cheat),
# which #next_row turns off as soon as it meets a line it cannot split
# directly, is turned back on, so a parse after a reset proceeds exactly like
# a parse on a fresh instance.
#
# @param str [String, nil] new input to scan; the current string is kept
#   when nil
# @return [Object] the result of StringScanner#reset
def reset(str=nil)
  @rows  = nil
  @cols  = @cells = 0
  @cheat = true

  self.string = str if str
  @encoding = string.encoding
  super()
end

#stats ⇒ Object



249
250
251
252
253
254
255
# File 'lib/censive.rb', line 249

# Prints a short summary of the parsed data: rows, widest row, total cells
# and input size in bytes. The input is parsed first if that has not happened
# yet.
#
# The numbers are right-aligned to the width of the byte count.
#
# @return [nil]
def stats
  parse unless @rows

  bytes = string.bytesize # String#size would count characters, not bytes
  wide  = bytes.to_s.size
  puts "%#{wide}d rows"    % @rows.size
  puts "%#{wide}d columns" % @cols
  puts "%#{wide}d cells"   % @cells
  puts "%#{wide}d bytes"   % bytes
end

#to_csv(*args, **opts, &code) ⇒ Object



189
190
191
192
193
194
195
196
197
# File 'lib/censive.rb', line 189

# Exports the parsed rows.
#
# * no arguments, with a block: every row is passed to the block
# * no arguments, no block: every row is written through #<<, i.e. formatted
#   as CSV and appended to this instance's output stream
# * with arguments: they are handed to Censive.writer; the given block is
#   forwarded to it, or, without a block, every row is written to the new
#   writer
#
# @param args [Array] positional arguments for Censive.writer
# @param opts [Hash] keyword arguments for Censive.writer
# @param code [Proc] optional block
# @return [Object] the result of #each or of Censive.writer
def to_csv(*args, **opts, &code)
  if args.empty? && opts.empty?
    # rows must go through #<< to be formatted; appending the Array to the
    # stream directly would emit its #to_s form
    block_given? ? each(&code) : each { |row| self << row }
  elsif block_given?
    Censive.writer(*args, **opts, &code)
  else
    Censive.writer(*args, **opts) { |csv| each { |row| csv << row } }
  end
end