Class: Utf8Sanitizer::UTF

Inherits:
Object
  • Object
show all
Defined in:
lib/utf8_sanitizer/utf.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(args = {}) ⇒ UTF

Returns a new instance of UTF.



8
9
10
11
12
13
14
15
16
# File 'lib/utf8_sanitizer/utf.rb', line 8

# Sets up a fresh sanitizer: empty result buckets, no headers, the row
# counter at zero and an empty working hash.
#
# @param args [Hash] accepted but not read by this constructor
def initialize(args = {})
  @row_id = 0
  @headers = []
  @data_hash = {}
  # The block form yields one distinct array per result bucket.
  @valid_rows, @encoded_rows, @defective_rows, @error_rows = Array.new(4) { [] }
end

Instance Attribute Details

#data_hash ⇒ Object

Returns the value of attribute data_hash.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @data_hash, the working hash for the row being processed.
# #utf_filter resets it to { row_id:, utf_status: } and #line_parse /
# #validate_csv merge the parsed columns into it.
def data_hash
  @data_hash
end

#defective_rows ⇒ Object

Returns the value of attribute defective_rows.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @defective_rows, the array #utf_filter appends to for lines
# whose check reported an error. Starts out empty.
def defective_rows
  @defective_rows
end

#encoded_rows ⇒ Object

Returns the value of attribute encoded_rows.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @encoded_rows: { row_id:, text: } entries added by #utf_filter
# for lines that had characters removed; :text holds the original line.
def encoded_rows
  @encoded_rows
end

#error_rows ⇒ Object

Returns the value of attribute error_rows.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @error_rows: { row_id:, text: } entries where :text is an error
# message, added by #utf_filter, #validate_csv and #validate_hashes.
def error_rows
  @error_rows
end

#headers ⇒ Object

Returns the value of attribute headers.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @headers: the first parsed row, which #row_to_hsh zips with
# each later row to name its columns. Empty until a first row is parsed.
def headers
  @headers
end

#row_id ⇒ Object

Returns the value of attribute row_id.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @row_id: starts at 0, is incremented once per line by
# #validate_csv and is overwritten with hsh[:row_id] by #process_hash_row.
def row_id
  @row_id
end

#valid_rows ⇒ Object

Returns the value of attribute valid_rows.



6
7
8
# File 'lib/utf8_sanitizer/utf.rb', line 6

# Reader for @valid_rows: the row hashes appended by #line_parse and
# #validate_csv after a line was parsed successfully.
def valid_rows
  @valid_rows
end

Instance Method Details

#check_utf(text) ⇒ Object

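Checks a line of text for invalid or non-ASCII characters and redundant whitespace, and returns a hash with the :text, :encoded, :wchar and :error results.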



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/utf8_sanitizer/utf.rb', line 92

# Inspects one line of text and reports what had to be cleaned.
#
# The returned hash always carries the original under :text. :encoded is set
# only when characters were removed (invalid bytes, and everything outside
# U+0000..U+007F), :wchar only when collapsing/stripping whitespace changed
# the cleaned line, and :error only when the check itself raised.
#
# @param text [String, nil] raw line
# @return [Hash, nil] { text:, encoded:, wchar:, error: }, or nil for nil input
def check_utf(text)
  return if text.nil?

  report = { text: text, encoded: nil, wchar: nil, error: nil }
  begin
    ascii_only =
      if text.valid_encoding?
        text.delete("^\u{0000}-\u{007F}")
      else
        # Keep only the well-formed characters; underscores are dropped on
        # this path as well.
        salvaged = text.chars.select(&:valid_encoding?).join
        salvaged.delete!('_')
        salvaged.delete("^\u{0000}-\u{007F}")
      end
    squeezed = ascii_only.gsub(/\s+/, ' ').strip

    report[:encoded] = ascii_only unless text == ascii_only
    report[:wchar] = squeezed unless ascii_only == squeezed
  rescue StandardError => e
    report[:error] = e.message
  end
  report
end

#compile_results ⇒ Object

Compiles the rows collected so far into a hash of summary stats and data.



19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# File 'lib/utf8_sanitizer/utf.rb', line 19

# Builds the final report from the rows collected so far.
#
# Each valid row's :utf_status (a ', '-separated list) is split into tags,
# which #make_groups_from_array counts to obtain the 'perfect' and 'wchar'
# totals.
#
# @return [Hash] { stats: { ...counts... }, data: { ...row arrays... } }
def compile_results
  status_tags = @valid_rows.flat_map { |row| row[:utf_status].split(', ') }
  tag_counts = make_groups_from_array(status_tags)

  stats = {
    total_rows: @row_id,
    header_row: @headers.any? ? 1 : 0,
    valid_rows: @valid_rows.count,
    error_rows: @error_rows.count,
    defective_rows: @defective_rows.count,
    perfect_rows: tag_counts['perfect'],
    encoded_rows: @encoded_rows.count,
    wchar_rows: tag_counts['wchar']
  }
  data = {
    valid_data: @valid_rows,
    encoded_data: @encoded_rows,
    defective_data: @defective_rows,
    error_data: @error_rows
  }
  { stats: stats, data: data }
end

#line_parse(validated_line) ⇒ Object

Helper for #validate_hashes. Splits a validated line into a row, then stores it either as the headers (first row) or as a valid row.



79
80
81
82
83
84
85
86
87
88
89
# File 'lib/utf8_sanitizer/utf.rb', line 79

# Helper for #validate_hashes: turns a validated line into a row.
#
# The first row seen becomes @headers. Every later row is merged into
# @data_hash (via #row_to_hsh) and that hash is appended to @valid_rows.
#
# @param validated_line [String, nil] comma-separated line; nil/false is ignored
# @return [Array, nil] the header row, @valid_rows, or nil when nothing was parsed
def line_parse(validated_line)
  fields = validated_line ? validated_line.split(',') : []
  return unless fields.any?
  return (@headers = fields) if @headers.empty?

  @data_hash.merge!(row_to_hsh(fields))
  @valid_rows << @data_hash
end

#make_groups_from_array(array) ⇒ Object



164
165
166
# File 'lib/utf8_sanitizer/utf.rb', line 164

# Counts how many times each element occurs.
#
# @param array [Array] elements to count
# @return [Hash] element => occurrences; keys that never occurred read as 0
def make_groups_from_array(array)
  counts = Hash.new(0)
  array.each { |item| counts[item] += 1 }
  counts
end

#process_hash_row(hsh) ⇒ Object

Helper for #validate_hashes. Joins the hash's keys (header pass) or values (data pass) into a comma-separated line, then validates and parses it.



63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/utf8_sanitizer/utf.rb', line 63

# Helper for #validate_hashes: pushes one hash through the line pipeline.
#
# While no headers are stored, the hash's keys are used (header pass).
# Afterwards its values are used and @row_id is taken from hsh[:row_id].
# The cells are joined with ',' and handed to #check_utf, #utf_filter and
# #line_parse in turn.
#
# @param hsh [Hash] one input row
# @return [Object] whatever #line_parse returns
def process_hash_row(hsh)
  cells =
    if @headers.any?
      row_values = hsh.values
      @row_id = hsh[:row_id]
      row_values
    else
      hsh.keys.map(&:to_s)
    end

  line_parse(utf_filter(check_utf(cells.join(','))))
end

#row_to_hsh(row) ⇒ Object

Key/value converter helper: pairs the stored headers with a row's values and returns the result as a hash with symbolized keys.



159
160
161
162
# File 'lib/utf8_sanitizer/utf.rb', line 159

# Pairs each stored header with the value at the same position in +row+.
# Missing values become nil and values beyond the last header are dropped,
# because the headers drive the zip.
#
# @param row [Array] cell values of one row
# @return [Hash] header (symbolized) => value
def row_to_hsh(row)
  pairs = @headers.zip(row)
  pairs.to_h.symbolize_keys
end

#utf_filter(utf) ⇒ Object

Records the outcome of #check_utf for the current row and returns the cleaned line, or nil when the check reported an error.



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# File 'lib/utf8_sanitizer/utf.rb', line 114

# Records the outcome of #check_utf for the current row and picks the line
# that should be parsed.
#
# Side effects: appends to @encoded_rows when characters were removed, to
# @error_rows and @defective_rows when the check reported an error, and
# replaces @data_hash with { row_id:, utf_status: } when its :row_id differs
# from the current @row_id.
#
# @param utf [Hash, nil] result of #check_utf (:text, :encoded, :wchar, :error)
# @return [String, nil] the last non-nil value among :text, :encoded and
#   :wchar, or nil when +utf+ is blank or carries an error
def utf_filter(utf)
  return unless utf.present?

  # Status lists every check that produced a value, e.g. "encoded, wchar";
  # a hash with nothing but :text counts as 'perfect'.
  utf_status = utf.except(:text).compact.keys.join(', ')
  utf_status = 'perfect' if utf_status.blank?

  error = utf[:error]
  # The original line is what gets logged for an encoded row.
  encoded_source = utf[:text] if utf[:encoded]
  # #check_utf builds the hash in text, encoded, wchar order, so the last
  # non-nil value is the most processed version of the line.
  line = utf.except(:error).compact.values.last unless error

  @encoded_rows << { row_id: @row_id, text: encoded_source } if encoded_source
  if error
    @error_rows << { row_id: @row_id, text: error }
    # Keep the raw text of the line that could not be checked.
    @defective_rows << utf[:text]
  end

  # Start a fresh working hash only when the row changed.
  @data_hash = { row_id: @row_id, utf_status: utf_status } if @data_hash[:row_id] != @row_id
  line
end

#validate_csv(file_path) ⇒ Object

Validates a CSV file line by line and returns the compiled results.



135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'lib/utf8_sanitizer/utf.rb', line 135

# Validates a CSV file one line at a time.
#
# Each line goes through #check_utf and #utf_filter. A line that survives is
# parsed with CSV.parse: the first parsed row becomes @headers, later rows
# are merged into @data_hash and appended to @valid_rows. An error raised
# while handling a line is recorded in @error_rows and the next line is
# still processed. Errors raised while opening the file are not rescued.
#
# @param file_path [String] path of the CSV file
# @return [Hash, nil] result of #compile_results; nil when file_path is blank
def validate_csv(file_path)
  return unless file_path.present?

  # File.foreach closes the file once iteration ends, so no handle is left
  # open after the call.
  File.foreach(file_path) do |file_line|
    validated_line = utf_filter(check_utf(file_line))
    # NOTE(review): @row_id is bumped after #utf_filter runs, so rows logged
    # by #utf_filter carry the pre-increment id while errors rescued below
    # carry the post-increment one. Confirm this offset is intended.
    @row_id += 1
    next unless validated_line

    CSV.parse(validated_line) do |row|
      if @headers.empty?
        @headers = row
      else
        @data_hash.merge!(row_to_hsh(row))
        @valid_rows << @data_hash
      end
    end
  rescue StandardError => error
    @error_rows << { row_id: @row_id, text: error.message }
  end
  compile_results
end

#validate_data(args = {}) ⇒ Object

Entry point: validates the CSV file at :file_path and/or the array of hashes in :data, and returns the compiled results.



37
38
39
40
41
42
43
44
45
46
# File 'lib/utf8_sanitizer/utf.rb', line 37

# Entry point: validates a CSV file and/or an array of hashes.
#
# Only the :file_path and :data keys of +args+ are read. When both are
# given, the CSV is validated first and the result of the hash validation
# is the one returned.
#
# @param args [Hash] options; :file_path (String) and/or :data (Array<Hash>)
# @return [Hash, nil] compiled results, or nil when neither input was given
def validate_data(args = {})
  picked = args.slice(:file_path, :data).compact
  file_path, data = picked.values_at(:file_path, :data)

  csv_result = validate_csv(file_path) if file_path
  data ? validate_hashes(data) : csv_result
end

#validate_hashes(orig_hashes) ⇒ Object

Validates an array of hashes and returns the compiled results.



49
50
51
52
53
54
55
56
57
58
59
# File 'lib/utf8_sanitizer/utf.rb', line 49

# Validates an array of hashes.
#
# The first hash is processed once up front so that, while no headers are
# stored, #process_hash_row takes the headers from its keys. Every hash is
# then processed as a data row. An error raised by one row is recorded in
# @error_rows and the remaining rows are still processed.
#
# @param orig_hashes [Array<Hash>] rows to validate
# @return [Hash, nil] result of #compile_results; nil when orig_hashes is blank
def validate_hashes(orig_hashes)
  return unless orig_hashes.present?

  begin
    process_hash_row(orig_hashes.first) # header pass
  rescue StandardError => error
    # A failed header pass ends processing; only the error is recorded.
    @error_rows << { row_id: @row_id, text: error.message }
    return compile_results
  end

  orig_hashes.each do |hsh|
    process_hash_row(hsh)
  rescue StandardError => error
    # Record the failing row and keep going, as #validate_csv does per line.
    @error_rows << { row_id: @row_id, text: error.message }
  end
  compile_results
end