Class: SmarterCSV::Reader

Inherits:

Object

Object
SmarterCSV::Reader

show all

Includes:: AutoDetection, FileIO, HashTransformations, HeaderTransformations, HeaderValidations, Headers, Options, Parser

Defined in:: lib/smarter_csv/reader.rb

Constant Summary

Constants included from Options

Options::DEFAULT_OPTIONS

Instance Attribute Summary collapse

#chunk_count ⇒ Object readonly

Returns the value of attribute chunk_count.
#csv_line_count ⇒ Object readonly

Returns the value of attribute csv_line_count.
#enforce_utf8 ⇒ Object readonly

Returns the value of attribute enforce_utf8.
#errors ⇒ Object readonly

Returns the value of attribute errors.
#file_line_count ⇒ Object readonly

Returns the value of attribute file_line_count.
#has_acceleration ⇒ Object readonly

Returns the value of attribute has_acceleration.
#has_rails ⇒ Object readonly

Returns the value of attribute has_rails.
#headers ⇒ Object readonly

Returns the value of attribute headers.
#input ⇒ Object readonly

Returns the value of attribute input.
#options ⇒ Object readonly

Returns the value of attribute options.
#raw_header ⇒ Object readonly

Returns the value of attribute raw_header.
#result ⇒ Object readonly

Returns the value of attribute result.
#warnings ⇒ Object readonly

Returns the value of attribute warnings.

Instance Method Summary collapse

#count_quote_chars(line, quote_char) ⇒ Object
#headerA ⇒ Object

:nocov: rubocop:disable Naming/MethodName.
#initialize(input, given_options = {}) ⇒ Reader constructor

First parameter: a filename, or an input object that responds to the `readline` method.
#process(&block) ⇒ Object

rubocop:disable Lint/UnusedMethodArgument.

Constructor Details

#initialize(input, given_options = {}) ⇒ `Reader`

First parameter: a filename, or an input object that responds to the `readline` method.

# File 'lib/smarter_csv/reader.rb', line 29

# Builds a reader for +input+ with every counter and collection in its initial state.
#
# input:: filename, or an input object which responds to the readline method
# given_options:: caller-supplied options; handed to process_options, whose
#                 return value is stored in @options
def initialize(input, given_options = {})
  @input = input
  @has_rails = defined?(Rails) ? true : false # remember whether Rails is loaded
  @csv_line_count = 0
  @chunk_count = 0
  @errors = {}
  @file_line_count = 0
  @headerA = []
  @headers = nil
  @raw_header = nil # the header exactly as it appears in the file
  @result = []
  @warnings = {}
  @enforce_utf8 = false # switched on later (after options parsing) only when required
  @options = process_options(given_options)
  # true when the parser exposes the accelerated parse_csv_line_c entry point
  @has_acceleration = SmarterCSV::Parser.respond_to?(:parse_csv_line_c) ? true : false
end

Instance Attribute Details

#chunk_count ⇒ `Object` (readonly)

Returns the value of attribute chunk_count.



15
16
17

# File 'lib/smarter_csv/reader.rb', line 15

# Number of chunks handled so far. Starts at 0, is reset to 0 by #process when
# options[:chunk_size] is positive, and is incremented each time a chunk is
# yielded to the block or appended to the result.
def chunk_count
  @chunk_count
end

#csv_line_count ⇒ `Object` (readonly)

Returns the value of attribute csv_line_count.



15
16
17

# File 'lib/smarter_csv/reader.rb', line 15

# CSV line counter, initialized to 0 in the constructor. #process stores it as
# :csv_line_number in each hash when options[:with_line_numbers] is set.
# NOTE(review): the increment is not in this listing — presumably done by
# readline_with_counts; confirm.
def csv_line_count
  @csv_line_count
end

#enforce_utf8 ⇒ `Object` (readonly)

Returns the value of attribute enforce_utf8.



16
17
18

# File 'lib/smarter_csv/reader.rb', line 16

# Whether lines are passed through enforce_utf8_encoding while processing.
# false after construction; #process sets it to a truthy value when
# options[:force_utf8] is set or options[:file_encoding] does not match /utf-8/i.
def enforce_utf8
  @enforce_utf8
end

#errors ⇒ `Object` (readonly)

Returns the value of attribute errors.



17
18
19

# File 'lib/smarter_csv/reader.rb', line 17

# Hash created empty by the constructor.
# NOTE(review): nothing in this listing writes to it — confirm where errors are recorded.
def errors
  @errors
end

#file_line_count ⇒ `Object` (readonly)

Returns the value of attribute file_line_count.



15
16
17

# File 'lib/smarter_csv/reader.rb', line 15

# Physical line counter for the input, initialized to 0 in the constructor.
# #process adds 1 for every continuation line it appends to a multi-line
# (quoted) row, and uses the value in its error and verbose messages.
# NOTE(review): the per-line increment is presumably done by readline_with_counts — confirm.
def file_line_count
  @file_line_count
end

#has_acceleration ⇒ `Object` (readonly)

Returns the value of attribute has_acceleration.



16
17
18

# File 'lib/smarter_csv/reader.rb', line 16

# true when SmarterCSV::Parser responded to :parse_csv_line_c at construction
# time, false otherwise.
def has_acceleration
  @has_acceleration
end

#has_rails ⇒ `Object` (readonly)

Returns the value of attribute has_rails.



16
17
18

# File 'lib/smarter_csv/reader.rb', line 16

# true when the Rails constant was defined at construction time, false otherwise.
def has_rails
  @has_rails
end

#headers ⇒ `Object` (readonly)

Returns the value of attribute headers.



17
18
19

# File 'lib/smarter_csv/reader.rb', line 17

# Effective headers. nil until #process runs, then the value returned by
# process_headers; in non-strict mode #process appends generated symbols
# (options[:missing_header_prefix] + column number) for extra data columns.
def headers
  @headers
end

#input ⇒ `Object` (readonly)

Returns the value of attribute input.



14
15
16

# File 'lib/smarter_csv/reader.rb', line 14

# The first constructor argument, stored unchanged: a filename, or an object
# that responds to readline.
def input
  @input
end

#options ⇒ `Object` (readonly)

Returns the value of attribute options.



14
15
16

# File 'lib/smarter_csv/reader.rb', line 14

# The value returned by process_options(given_options) in the constructor.
# #process replaces :row_sep and :col_sep in it when they are set to :auto.
def options
  @options
end

#raw_header ⇒ `Object` (readonly)

Returns the value of attribute raw_header.



17
18
19

# File 'lib/smarter_csv/reader.rb', line 17

# The header as it appears in the file; nil after construction.
# NOTE(review): the assignment is not in this listing — presumably done during
# header processing; confirm.
def raw_header
  @raw_header
end

#result ⇒ `Object` (readonly)

Returns the value of attribute result.



17
18
19

# File 'lib/smarter_csv/reader.rb', line 17

# Accumulated output of #process when no block is given: an Array of Hashes,
# or an Array of Arrays of Hashes in chunked mode. Starts as an empty Array.
def result
  @result
end

#warnings ⇒ `Object` (readonly)

Returns the value of attribute warnings.



17
18
19

# File 'lib/smarter_csv/reader.rb', line 17

# Hash created empty by the constructor.
# NOTE(review): nothing in this listing writes to it — confirm where warnings are recorded.
def warnings
  @warnings
end

Instance Method Details

#count_quote_chars(line, quote_char) ⇒ `Object`

# File 'lib/smarter_csv/reader.rb', line 217

# Counts the occurrences of +quote_char+ in +line+ that are not preceded by an
# escaping backslash. A backslash escapes exactly the one character after it
# (including another backslash), and an escaped character is never counted.
#
# Returns 0 when +line+ is nil, or when +quote_char+ is nil or empty.
def count_quote_chars(line, quote_char)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  skip_next = false # true while the current character is escaped by a preceding backslash

  line.each_char.count do |ch|
    if skip_next
      # escaped character: consume it without counting
      skip_next = false
      false
    elsif ch == '\\'
      # unescaped backslash: the next character is escaped
      skip_next = true
      false
    else
      ch == quote_char
    end
  end
end

#headerA ⇒ `Object`

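Deprecated reader: prints a deprecation warning and returns the header list, which `#process` sets to the same value as `#headers`. (Source annotations: `:nocov:`, `rubocop:disable Naming/MethodName`.)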

# File 'lib/smarter_csv/reader.rb', line 21

# Deprecated reader for the header list; kept for backward compatibility.
# Emits a deprecation warning on every call and returns @headerA, which
# #process sets to the same object as @headers.
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end

#process(&block) ⇒ `Object`

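Reads the input line by line and converts each CSV row into a hash keyed by the effective headers. With a block, each hash (or chunk of hashes) is yielded and the chunk count is returned; without a block, the collected result is returned. (Source annotation: `rubocop:disable Lint/UnusedMethodArgument`.)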

# File 'lib/smarter_csv/reader.rb', line 47

# Reads the input line by line and turns every CSV data line into a Hash keyed
# by the effective headers.
#
# The input is used directly when it responds to readline; otherwise it is
# treated as a path and opened with options[:file_encoding]. The handle is
# closed in the ensure clause whenever it responds to close.
#
# With a block:    yields an Array of Hashes per call (a one-element Array when
#                  not chunking, a chunk when options[:chunk_size] > 0) and
#                  returns @chunk_count.
# Without a block: appends each Hash (or each chunk) to @result and returns it.
#
# Raises SmarterCSV::HeaderSizeMismatch when options[:strict] is set and a line
# has more columns than there are headers.
# Raises MalformedCSV when a quoted field is still open at end of input.
def process(&block) # rubocop:disable Lint/UnusedMethodArgument
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  @verbose = options[:verbose]

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    skip_lines(fh, options)

    # NOTE: we are no longer using header_size
    @headers, _header_size = process_headers(fh, options)
    @headerA = @headers # @headerA is deprecated, use @headers

    puts "Effective headers:\n#{pp(@headers)}\n" if @verbose

    header_validations(@headers, options)

    # in case we use chunking.. we'll need to set it up..
    if options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      @chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    # fh.each_line |line|
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = enforce_utf8_encoding(line, options) if @enforce_utf8

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters
      multiline = count_quote_chars(line, options[:quote_char]).odd?

      while multiline
        begin
          next_line = fh.readline(options[:row_sep])
          next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
          line += next_line
          @file_line_count += 1

          multiline = count_quote_chars(line, options[:quote_char]).odd?
        rescue EOFError
          # End of file reached. Check if quotes are balanced.
          total_quotes = count_quote_chars(line, options[:quote_char])
          if total_quotes.odd?
            raise MalformedCSV, "Unclosed quoted field detected in multiline data"
          else
            # Quotes are balanced; proceed without raising an error.
            break
          end
        end
      end

      # :nocov:
      if multiline && @verbose
        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
      end
      # :nocov:

      line.chomp!(options[:row_sep])

      # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
      dataA, data_size = parse(line, options) # we parse the extra columns

      # Only lines with MORE columns than known headers need special handling.
      # Without this guard, strict mode would reject every data line.
      if data_size > @headers.size
        if options[:strict]
          raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}"
        else
          # we create additional columns on-the-fly
          current_size = @headers.size
          while current_size < data_size
            @headers << "#{options[:missing_header_prefix]}#{current_size + 1}".to_sym
            current_size += 1
          end
        end
      end

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      # --- HASH TRANSFORMATIONS ------------------------------------------------------------
      hash = @headers.zip(dataA).to_h

      hash = hash_transformations(hash, options)

      # --- HASH VALIDATIONS ----------------------------------------------------------------
      # will go here, and be able to:
      #  - validate correct format of the values for fields
      #  - required fields to be non-empty
      #  - ...
      # -------------------------------------------------------------------------------------

      next if options[:remove_empty_hashes] && hash.empty?

      puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
      # optional adding of csv_line_number to the hash to help debugging
      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      # process the chunks or the resulting hash
      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
          end
          @chunk_count += 1
          chunk.clear # re-initialize for next chunk of data
        else
          # the last chunk may contain partial data, which is handled below
        end
        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          @result << hash
        end
      end
    end

    # print new line to retain last processing line message
    print "\n" if @verbose

    # handling of last chunk:
    # (chunk is nil when chunking is off; it can still hold rows here when the
    # final input lines were skipped by a `next` before reaching the flush above)
    if !chunk.nil? && chunk.size > 0
      # do something with the chunk
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
      end
      @chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end
  ensure
    # fh is nil when File.open itself failed; nil does not respond to close
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    @chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end

Class: SmarterCSV::Reader

Constant Summary

Constants included from Options

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from HashTransformations

Methods included from HeaderValidations

Methods included from HeaderTransformations

Methods included from Headers

Methods included from Options

Constructor Details

#initialize(input, given_options = {}) ⇒ Reader

Instance Attribute Details

#chunk_count ⇒ Object (readonly)

#csv_line_count ⇒ Object (readonly)

#enforce_utf8 ⇒ Object (readonly)

#errors ⇒ Object (readonly)

#file_line_count ⇒ Object (readonly)

#has_acceleration ⇒ Object (readonly)

#has_rails ⇒ Object (readonly)

#headers ⇒ Object (readonly)

#input ⇒ Object (readonly)

#options ⇒ Object (readonly)

#raw_header ⇒ Object (readonly)

#result ⇒ Object (readonly)

#warnings ⇒ Object (readonly)