Class: SmarterCSV::Reader

Inherits:
Object
  • Object
show all
Includes:
AutoDetection, FileIO, HashTransformations, HeaderTransformations, HeaderValidations, Headers, Options, Parser
Defined in:
lib/smarter_csv/reader.rb

Constant Summary

Constants included from Options

Options::DEFAULT_OPTIONS

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from HashTransformations

#hash_transformations

Methods included from HeaderValidations

#check_duplicate_headers, #check_required_headers, #header_validations

Methods included from HeaderTransformations

#disambiguate_headers, #header_transformations, #remap_headers

Methods included from Headers

#process_headers

Methods included from Options

#process_options

Constructor Details

#initialize(input, given_options = {}) ⇒ Reader

First parameter: a filename, or an input object that responds to the `readline` method.



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/smarter_csv/reader.rb', line 29

# Builds a reader with all counters, header state and result containers reset.
#
# @param input [String, #readline] filename, or an input object which responds to readline
# @param given_options [Hash] caller-supplied options, handed to process_options
def initialize(input, given_options = {})
  @input = input

  # line / chunk counters
  @file_line_count = 0
  @csv_line_count = 0
  @chunk_count = 0

  # header state
  @raw_header = nil # header as it appears in the file
  @headers = nil
  @headerA = []

  # collected output and diagnostics
  @result = []
  @errors = {}
  @warnings = {}

  # environment detection
  @has_rails = defined?(Rails) ? true : false

  # only set to true if needed (after options parsing)
  @enforce_utf8 = false
  @options = process_options(given_options)

  # true if the parser was compiled with acceleration
  @has_acceleration = SmarterCSV::Parser.respond_to?(:parse_csv_line_c) ? true : false
end

Instance Attribute Details

#chunk_countObject (readonly)

Returns the value of attribute chunk_count.



15
16
17
# File 'lib/smarter_csv/reader.rb', line 15

# Number of chunks handled so far. Starts at 0; #process resets it when chunking is
# enabled and increments it once per chunk that is yielded or appended to the result.
#
# @return [Integer]
def chunk_count
  @chunk_count
end

#csv_line_countObject (readonly)

Returns the value of attribute csv_line_count.



15
16
17
# File 'lib/smarter_csv/reader.rb', line 15

# Counter of CSV lines, initialised to 0 by the constructor. #process stores its value
# under :csv_line_number when options[:with_line_numbers] is set.
# NOTE(review): presumably incremented by readline_with_counts, which is not visible here.
#
# @return [Integer]
def csv_line_count
  @csv_line_count
end

#enforce_utf8Object (readonly)

Returns the value of attribute enforce_utf8.



16
17
18
# File 'lib/smarter_csv/reader.rb', line 16

# Whether each line is passed through enforce_utf8_encoding while processing.
# False after construction; #process sets it from options[:force_utf8] and
# options[:file_encoding].
#
# @return [Boolean, nil]
def enforce_utf8
  @enforce_utf8
end

#errorsObject (readonly)

Returns the value of attribute errors.



17
18
19
# File 'lib/smarter_csv/reader.rb', line 17

# Error container, initialised to an empty Hash by the constructor.
#
# @return [Hash]
def errors
  @errors
end

#file_line_countObject (readonly)

Returns the value of attribute file_line_count.



15
16
17
# File 'lib/smarter_csv/reader.rb', line 15

# Counter of physical lines read from the input. Starts at 0; #process adds one for
# every continuation line it pulls in while re-assembling a multi-line row.
# NOTE(review): presumably also incremented by readline_with_counts (not visible here).
#
# @return [Integer]
def file_line_count
  @file_line_count
end

#has_accelerationObject (readonly)

Returns the value of attribute has_acceleration.



16
17
18
# File 'lib/smarter_csv/reader.rb', line 16

# True when SmarterCSV::Parser responded to :parse_csv_line_c at construction time.
#
# @return [Boolean]
def has_acceleration
  @has_acceleration
end

#has_railsObject (readonly)

Returns the value of attribute has_rails.



16
17
18
# File 'lib/smarter_csv/reader.rb', line 16

# True when the Rails constant was defined at construction time.
#
# @return [Boolean]
def has_rails
  @has_rails
end

#headersObject (readonly)

Returns the value of attribute headers.



17
18
19
# File 'lib/smarter_csv/reader.rb', line 17

# Effective headers. nil until #process assigns the value returned by process_headers;
# #process may append generated names when a row has more columns than headers.
#
# @return [Array, nil]
def headers
  @headers
end

#inputObject (readonly)

Returns the value of attribute input.



14
15
16
# File 'lib/smarter_csv/reader.rb', line 14

# The input passed to the constructor: a filename, or an object responding to readline.
#
# @return [Object]
def input
  @input
end

#optionsObject (readonly)

Returns the value of attribute options.



14
15
16
# File 'lib/smarter_csv/reader.rb', line 14

# The options returned by process_options for the caller-supplied options.
# #process overwrites :row_sep and :col_sep in place when they are set to :auto.
#
# @return [Hash]
def options
  @options
end

#raw_headerObject (readonly)

Returns the value of attribute raw_header.



17
18
19
# File 'lib/smarter_csv/reader.rb', line 17

# Header as it appears in the file. Initialised to nil by the constructor.
# NOTE(review): presumably assigned during header processing, which is not visible here.
#
# @return [Object, nil]
def raw_header
  @raw_header
end

#resultObject (readonly)

Returns the value of attribute result.



17
18
19
# File 'lib/smarter_csv/reader.rb', line 17

# Accumulated output of #process when it is called without a block: an Array of Hashes,
# or an Array of Arrays of Hashes in chunked mode. Starts as an empty Array.
#
# @return [Array]
def result
  @result
end

#warningsObject (readonly)

Returns the value of attribute warnings.



17
18
19
# File 'lib/smarter_csv/reader.rb', line 17

# Warning container, initialised to an empty Hash by the constructor.
#
# @return [Hash]
def warnings
  @warnings
end

Instance Method Details

#count_quote_chars(line, quote_char) ⇒ Object



217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
# File 'lib/smarter_csv/reader.rb', line 217

# Counts the occurrences of quote_char in line that are not preceded by an
# unescaped backslash.
#
# A backslash escapes exactly one following character (including another
# backslash), so that character is never counted.
#
# @param line [String, nil] text to scan
# @param quote_char [String, nil] the single quote character to look for
# @return [Integer] number of unescaped quote characters; 0 for nil/empty arguments
def count_quote_chars(line, quote_char)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  skip_next = false

  line.each_char.count do |ch|
    if skip_next
      # this character is escaped by the preceding backslash
      skip_next = false
      false
    elsif ch == '\\'
      skip_next = true
      false
    else
      ch == quote_char
    end
  end
end

#headerAObject

Deprecated: use #headers instead. (:nocov: rubocop:disable Naming/MethodName)



21
22
23
24
# File 'lib/smarter_csv/reader.rb', line 21

# Deprecated reader for the header list; prints a deprecation warning on every call.
#
# @deprecated Use #headers instead.
# @return [Array] the value of @headerA
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end

#process(&block) ⇒ Object

rubocop:disable Lint/UnusedMethodArgument



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/smarter_csv/reader.rb', line 47

# Reads the input and converts every CSV row into a Hash keyed by the effective headers.
#
# Steps, in order: open the input (unless it already responds to readline), resolve
# :auto row/column separators, skip leading lines, process and validate the headers,
# then parse the remaining lines one at a time. A row whose quoted field contains the
# row separator is re-assembled from several physical lines before it is parsed.
#
# With options[:chunk_size] > 0, rows are grouped into Arrays of up to chunk_size Hashes.
# Without chunking, a given block receives each row wrapped in a one-element Array.
# The input is closed on exit if it responds to close.
#
# @yield [Array<Hash>] a chunk of rows (or a single row in an Array when not chunking)
# @return [Integer] @chunk_count when a block is given
# @return [Array] @result when no block is given: an Array of Hashes, or an Array of
#   Arrays of Hashes in chunked mode
# @raise [SmarterCSV::HeaderSizeMismatch] if options[:strict] is set and a row has more
#   columns than there are headers
# @raise [MalformedCSV] if the input ends inside an unclosed quoted field
def process(&block) # rubocop:disable Lint/UnusedMethodArgument
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  @verbose = options[:verbose]

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    skip_lines(fh, options)

    # NOTE: we are no longer using header_size
    @headers, _header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    puts "Effective headers:\n#{pp(@headers)}\n" if @verbose

    header_validations(@headers, options)

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    # we can't use fh.readlines() here, because this would read the whole file into memory at once
    until fh.eof?
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = enforce_utf8_encoding(line, options) if @enforce_utf8

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters
      multiline = count_quote_chars(line, options[:quote_char]).odd?

      while multiline
        begin
          next_line = fh.readline(options[:row_sep])
          next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
          line += next_line
          @file_line_count += 1

          multiline = count_quote_chars(line, options[:quote_char]).odd?
        rescue EOFError
          # End of file reached. Check if quotes are balanced.
          total_quotes = count_quote_chars(line, options[:quote_char])
          if total_quotes.odd?
            raise MalformedCSV, "Unclosed quoted field detected in multiline data"
          else
            # Quotes are balanced; proceed without raising an error.
            break
          end
        end
      end

      # :nocov:
      if multiline && @verbose
        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
      end
      # :nocov:

      line.chomp!(options[:row_sep])

      # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
      dataA, data_size = parse(line, options) # we parse the extra columns

      # Only rows that actually carry more columns than we have headers are "extra column" rows.
      # Strict mode rejects them; otherwise the header list grows to fit.
      if data_size > @headers.size
        if options[:strict]
          raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}"
        end

        # we create additional columns on-the-fly
        current_size = @headers.size
        while current_size < data_size
          @headers << "#{options[:missing_header_prefix]}#{current_size + 1}".to_sym
          current_size += 1
        end
      end

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      # --- HASH TRANSFORMATIONS ------------------------------------------------------------
      hash = @headers.zip(dataA).to_h

      hash = hash_transformations(hash, options)

      # --- HASH VALIDATIONS ----------------------------------------------------------------
      # will go here, and be able to:
      #  - validate correct format of the values for fields
      #  - required fields to be non-empty
      #  - ...
      # -------------------------------------------------------------------------------------

      next if options[:remove_empty_hashes] && hash.empty?

      puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
      # optional adding of csv_line_number to the hash to help debugging
      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      # process the chunks or the resulting hash
      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          chunk.clear # re-initialize for next chunk of data
        else
          # the last chunk may contain partial data, which is handled below
        end
        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          @result << hash
        end
      end
    end

    # print new line to retain last processing line message
    print "\n" if @verbose

    # handling of last chunk: rows skipped via `next` at EOF can leave a partial chunk behind
    if !chunk.nil? && chunk.size > 0
      # do something with the chunk
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
      end
      @chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end
  ensure
    # fh is nil when File.open itself raised; nil does not respond to close
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end