Class: Utf8Sanitizer::UTF
- Inherits:
-
Object
- Object
- Utf8Sanitizer::UTF
- Defined in:
- lib/utf8_sanitizer/utf.rb
Instance Attribute Summary collapse
-
#data_hash ⇒ Object
Returns the value of attribute data_hash.
-
#defective_rows ⇒ Object
Returns the value of attribute defective_rows.
-
#encoded_rows ⇒ Object
Returns the value of attribute encoded_rows.
-
#error_rows ⇒ Object
Returns the value of attribute error_rows.
-
#headers ⇒ Object
Returns the value of attribute headers.
-
#row_id ⇒ Object
Returns the value of attribute row_id.
-
#valid_rows ⇒ Object
Returns the value of attribute valid_rows.
Instance Method Summary collapse
-
#check_utf(text) ⇒ Object
-
CHECK UTF * ####################.
-
-
#compile_results ⇒ Object
-
COMPILE RESULTS * ####################.
-
-
#initialize(args = {}) ⇒ UTF
constructor
A new instance of UTF.
-
#line_parse(validated_line) ⇒ Object
Helper for validate_hashes: parses a validated line into a row, then updates the final results.
- #make_groups_from_array(array) ⇒ Object
-
#process_hash_row(hsh) ⇒ Object
Helper for validate_hashes: converts a hash's keys or values into a line, which is then validated and parsed.
-
#row_to_hsh(row) ⇒ Object
Helper (key/value converter): pairs the headers with a row's values and returns a hash with symbolized keys.
-
#utf_filter(utf) ⇒ Object
-
UTF FILTER * ####################.
-
-
#validate_csv(file_path) ⇒ Object
-
VALIDATE CSV * ####################.
-
-
#validate_data(args = {}) ⇒ Object
-
VALIDATE DATA * ####################.
-
-
#validate_hashes(orig_hashes) ⇒ Object
-
VALIDATE HASHES * ####################.
-
Constructor Details
#initialize(args = {}) ⇒ UTF
Returns a new instance of UTF.
8 9 10 11 12 13 14 15 16 |
# File 'lib/utf8_sanitizer/utf.rb', line 8
#
# Sets up an instance with empty result collections, no headers, an empty
# per-row data hash and a row counter of zero.
#
# @param args [Hash] accepted for interface compatibility; not read here
def initialize(args = {})
  @row_id = 0
  @headers = []
  @data_hash = {}
  # Four independent arrays (the block runs once per slot, so nothing is shared).
  @valid_rows, @encoded_rows, @defective_rows, @error_rows = Array.new(4) { [] }
end
Instance Attribute Details
#data_hash ⇒ Object
Returns the value of attribute data_hash.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@data_hash+ instance variable.
#
# @return [Object] the current value of @data_hash
def data_hash
  @data_hash
end
#defective_rows ⇒ Object
Returns the value of attribute defective_rows.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@defective_rows+ instance variable.
#
# @return [Object] the current value of @defective_rows
def defective_rows
  @defective_rows
end
#encoded_rows ⇒ Object
Returns the value of attribute encoded_rows.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@encoded_rows+ instance variable.
#
# @return [Object] the current value of @encoded_rows
def encoded_rows
  @encoded_rows
end
#error_rows ⇒ Object
Returns the value of attribute error_rows.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@error_rows+ instance variable.
#
# @return [Object] the current value of @error_rows
def error_rows
  @error_rows
end
#headers ⇒ Object
Returns the value of attribute headers.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@headers+ instance variable.
#
# @return [Object] the current value of @headers
def headers
  @headers
end
#row_id ⇒ Object
Returns the value of attribute row_id.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@row_id+ instance variable.
#
# @return [Object] the current value of @row_id
def row_id
  @row_id
end
#valid_rows ⇒ Object
Returns the value of attribute valid_rows.
6 7 8 |
# File 'lib/utf8_sanitizer/utf.rb', line 6
#
# Reader for the +@valid_rows+ instance variable.
#
# @return [Object] the current value of @valid_rows
def valid_rows
  @valid_rows
end
Instance Method Details
#check_utf(text) ⇒ Object
-
CHECK UTF * ####################
92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
# File 'lib/utf8_sanitizer/utf.rb', line 92
#
# Inspects a piece of text and reports how it had to be cleaned.
#
# The text is reduced to 7-bit ASCII (characters outside U+0000..U+007F are
# deleted) and then whitespace-normalised (runs of whitespace collapsed to a
# single space, ends stripped).
#
# @param text [String, nil] the raw text to check
# @return [Hash, nil] nil when +text+ is nil; otherwise a hash with keys
#   :text    - the input, untouched
#   :encoded - the ASCII-only version, set only when it differs from :text
#   :wchar   - the whitespace-normalised version, set only when it differs
#              from the ASCII-only version
#   :error   - the message of any StandardError raised while processing
def check_utf(text)
  return if text.nil?

  results = { text: text, encoded: nil, wchar: nil, error: nil }
  begin
    ascii_only =
      if text.valid_encoding?
        text.delete("^\u{0000}-\u{007F}")
      else
        # Drop characters that cannot be decoded before filtering. On this
        # path (and only this path) underscores are removed too.
        text.chars.select(&:valid_encoding?).join.delete('_').delete("^\u{0000}-\u{007F}")
      end
    squeezed = ascii_only.gsub(/\s+/, ' ').strip

    results[:encoded] = ascii_only if text != ascii_only
    results[:wchar] = squeezed if ascii_only != squeezed
  rescue StandardError => e
    # Record the failure instead of raising so the caller can file the row
    # as an error row.
    results[:error] = e.message
  end
  results
end
#compile_results ⇒ Object
-
COMPILE RESULTS * ####################
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
# File 'lib/utf8_sanitizer/utf.rb', line 19
#
# Assembles the final report from the collections built up during validation.
#
# Every valid row's :utf_status string is split on ', ' and the resulting
# labels are counted via make_groups_from_array; the 'perfect' and 'wchar'
# counts are reported alongside the sizes of the row collections.
#
# @return [Hash] { stats: {...counts...}, data: {...row collections...} }
def compile_results
  status_labels = @valid_rows.flat_map { |row| row[:utf_status].split(', ') }
  label_counts = make_groups_from_array(status_labels)

  stats = {
    total_rows: @row_id,
    header_row: @headers.any? ? 1 : 0,
    valid_rows: @valid_rows.count,
    error_rows: @error_rows.count,
    defective_rows: @defective_rows.count,
    perfect_rows: label_counts['perfect'],
    encoded_rows: @encoded_rows.count,
    wchar_rows: label_counts['wchar']
  }
  data = {
    valid_data: @valid_rows,
    encoded_data: @encoded_rows,
    defective_data: @defective_rows,
    error_data: @error_rows
  }

  { stats: stats, data: data }
end
#line_parse(validated_line) ⇒ Object
Helper for validate_hashes: parses a validated line into a row, then updates the final results.
79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/utf8_sanitizer/utf.rb', line 79
#
# Splits a validated line on commas and records it: the first non-empty
# line becomes the headers, every later line is merged into @data_hash and
# stored as a valid row.
#
# @param validated_line [String, nil] a line that passed UTF filtering
# @return [Array, nil] nil for a missing or empty line; the header row when
#   headers were just set; otherwise @valid_rows
def line_parse(validated_line)
  return unless validated_line

  row = validated_line.split(',')
  return unless row.any?

  if @headers.empty?
    @headers = row
  else
    @data_hash.merge!(row_to_hsh(row))
    # Store a snapshot: @data_hash is reused (and mutated by the merge
    # above) whenever consecutive rows share a row id, which would otherwise
    # make earlier entries silently take on the latest row's values.
    @valid_rows << @data_hash.dup
  end
end
#make_groups_from_array(array) ⇒ Object
164 165 166 |
# File 'lib/utf8_sanitizer/utf.rb', line 164
#
# Counts how many times each element occurs in +array+.
#
# @param array [Array] elements to count
# @return [Hash] element => occurrence count; the hash has a default of 0,
#   so looking up an element that never occurred yields 0 rather than nil
def make_groups_from_array(array)
  counts = Hash.new(0)
  array.each { |element| counts[element] += 1 }
  counts
end
#process_hash_row(hsh) ⇒ Object
Helper for validate_hashes: converts a hash's keys or values into a line, which is then validated and parsed.
63 64 65 66 67 68 69 70 71 72 73 74 75 |
# File 'lib/utf8_sanitizer/utf.rb', line 63
#
# Turns one input hash into a comma-joined line and runs it through the
# check_utf -> utf_filter -> line_parse pipeline.
#
# While no headers are known, the hash's keys (as strings) form the line;
# once headers exist, the hash's values form the line and @row_id is set to
# the hash's :row_id entry.
#
# @param hsh [Hash] one row of input data
# @return [Object] whatever line_parse returns for the resulting line
def process_hash_row(hsh)
  cells =
    if @headers.none?
      hsh.keys.map(&:to_s)
    else
      @row_id = hsh[:row_id]
      hsh.values
    end

  line_parse(utf_filter(check_utf(cells.join(','))))
end
#row_to_hsh(row) ⇒ Object
Helper (key/value converter): pairs the headers with a row's values and returns a hash with symbolized keys.
159 160 161 162 |
# File 'lib/utf8_sanitizer/utf.rb', line 159
#
# Pairs each header with the value at the same position in +row+ and returns
# the result as a hash whose keys are converted to symbols where possible.
#
# @param row [Array] the values of one parsed line
# @return [Hash] header => value; headers without a matching value map to nil
def row_to_hsh(row)
  pairs = @headers.zip(row)
  # Keys that cannot be turned into a symbol (e.g. nil) are kept as they are.
  pairs.to_h.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
end
#utf_filter(utf) ⇒ Object
-
UTF FILTER * ####################
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
# File 'lib/utf8_sanitizer/utf.rb', line 114
#
# Files the outcome of check_utf into the result collections and returns the
# cleaned line to continue with.
#
# Side effects:
# * @encoded_rows receives { row_id:, text: <original text> } when the text
#   had to be re-encoded.
# * @error_rows receives { row_id:, text: <error> } and @defective_rows
#   receives the original text when an error was reported.
# * @data_hash is replaced by { row_id:, utf_status: } unless it already
#   belongs to the current @row_id.
#
# @param utf [Hash, nil] a result hash with :text, :encoded, :wchar, :error
# @return [String, nil] the last non-nil entry other than :error, or nil when
#   +utf+ is nil/empty or carries an error
def utf_filter(utf)
  return if utf.nil? || utf.empty?

  # Status is the list of non-nil flags besides :text, e.g. "encoded, wchar".
  flags = utf.reject { |key, _| key == :text }.compact.keys.map(&:to_s).join(', ')
  utf_status = flags.strip.empty? ? 'perfect' : flags

  error = utf[:error]
  original_text = utf[:text] if utf[:encoded]
  line = utf.reject { |key, _| key == :error }.compact.values.last unless error

  @encoded_rows << { row_id: @row_id, text: original_text } if original_text
  if error
    @error_rows << { row_id: @row_id, text: error }
    # Previously referenced an undefined local (filt_utf_hsh), which raised
    # NameError for every errored line.
    @defective_rows << utf[:text]
  end

  @data_hash = { row_id: @row_id, utf_status: utf_status } if @data_hash[:row_id] != @row_id
  line
end
#validate_csv(file_path) ⇒ Object
-
VALIDATE CSV * ####################
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# File 'lib/utf8_sanitizer/utf.rb', line 135
#
# Validates a CSV file line by line and returns the compiled report.
#
# Each line goes through check_utf and utf_filter; lines that survive are
# parsed with CSV. The first parsed row becomes the headers, later rows are
# merged into @data_hash and appended to @valid_rows. A StandardError raised
# while handling a line is recorded in @error_rows and processing continues
# with the next line. Errors opening the file are not rescued.
#
# @param file_path [String, nil] path of the CSV file to read
# @return [Hash, nil] the result of compile_results, or nil when no path is given
def validate_csv(file_path)
  return if file_path.nil? || file_path.to_s.strip.empty?

  # Block form so the handle is closed as soon as reading finishes.
  File.open(file_path) do |io|
    io.each do |file_line|
      validated_line = utf_filter(check_utf(file_line))
      @row_id += 1
      next unless validated_line

      CSV.parse(validated_line) do |row|
        if @headers.empty?
          @headers = row
        else
          @data_hash.merge!(row_to_hsh(row))
          @valid_rows << @data_hash
        end
      end
    rescue StandardError => e
      @error_rows << { row_id: @row_id, text: e.message }
    end
  end

  compile_results
end
#validate_data(args = {}) ⇒ Object
-
VALIDATE DATA * ####################
37 38 39 40 41 42 43 44 45 46 |
# File 'lib/utf8_sanitizer/utf.rb', line 37
#
# Entry point: validates a CSV file, an array of hashes, or both.
#
# Only the :file_path and :data options are looked at. When both are given
# the file is validated first and the hashes second, and the value returned
# is the one from the hashes.
#
# @param args [Hash] options; :file_path for validate_csv, :data for validate_hashes
# @return [Object, nil] the result of the last validation run, or nil when
#   neither option is supplied
def validate_data(args = {})
  file_path, data = args.slice(:file_path, :data).compact.values_at(:file_path, :data)

  outcome = nil
  outcome = validate_csv(file_path) if file_path
  outcome = validate_hashes(data) if data
  outcome
end
#validate_hashes(orig_hashes) ⇒ Object
-
VALIDATE HASHES * ####################
49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/utf8_sanitizer/utf.rb', line 49
#
# Validates an array of hashes and returns the compiled report.
#
# The first hash is processed once up front and then every hash (including
# the first) is processed in turn. A StandardError raised along the way is
# recorded in @error_rows with its message and stops the remaining hashes
# from being processed; the report is still compiled and returned.
#
# @param orig_hashes [Array<Hash>, nil] the rows to validate
# @return [Hash, nil] the result of compile_results, or nil for nil/empty input
def validate_hashes(orig_hashes)
  return if orig_hashes.nil? || orig_hashes.empty?

  begin
    process_hash_row(orig_hashes.first) # first pass over the first hash, ahead of the data pass
    orig_hashes.each { |hsh| process_hash_row(hsh) }
  rescue StandardError => e
    @error_rows << { row_id: @row_id, text: e.message }
  end

  compile_results
end