Class: TxmlImporter::Txml

Inherits:
Object
  • Object
show all
Defined in:
lib/txml_importer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(file_path:, **args) ⇒ Txml

Returns a new instance of Txml.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/txml_importer.rb', line 10

# Builds an importer for the TXML file at +file_path+ and resolves its encoding.
#
# The encoding comes from +args[:encoding]+ when given. Otherwise it is detected
# with CharlockHolmes from the leading part of the file, falling back to the
# +encoding="..."+ attribute found in the file content.
#
# @param file_path [String] path of the TXML file to read
# @param args [Hash] optional settings; only +:encoding+ is read here
# @option args [String] :encoding 'UTF-8', 'UTF-16LE' or 'UTF-16BE' (any case)
# @raise [RuntimeError] if the encoding cannot be determined or is unsupported
def initialize(file_path:, **args)
  @file_path = file_path
  # File.read opens and closes the file itself. Kernel#open would leave the
  # handle open and would run a path starting with "|" as a command.
  # The content is not loaded when the caller states exactly 'UTF-8'.
  @content = File.read(@file_path) unless args[:encoding].eql?('UTF-8')

  if args[:encoding].nil?
    # Tolerate a nil detection result as well as a result without :encoding.
    detected = CharlockHolmes::EncodingDetector.detect(@content[0..100_000])
    @encoding = detected && detected[:encoding]
    if @encoding.nil?
      # Non-bang scrub/delete always return a String (gsub! returns nil when
      # there is no NUL byte to strip). Removing NULs makes the ASCII part of
      # UTF-16 content readable. [^"]* stops at the attribute's closing quote
      # instead of running on to the last quote on the line.
      declared = @content.dup.force_encoding('utf-8').scrub('*').delete("\0")[/(?<=encoding=")[^"]*(?=")/]
      case declared.to_s.upcase
      when 'UTF-8'
        @encoding = 'UTF-8'
      when 'UTF-16'
        @encoding = 'UTF-16LE'
      end
    end
  else
    @encoding = args[:encoding].upcase
  end

  # Accumulators read back by #import and #stats.
  @doc = {
    source_language: "",
    tu: { id: "", counter: 0, vals: [] },
    seg: { counter: 0, vals: [] },
    language_pairs: []
  }

  raise "Encoding type could not be determined. Please set an encoding of UTF-8, UTF-16LE, or UTF-16BE" if @encoding.nil?
  unless %w[UTF-8 UTF-16LE UTF-16BE].include?(@encoding)
    raise "Encoding type not supported. Please choose an encoding of UTF-8, UTF-16LE, or UTF-16BE"
  end

  # Only non-UTF-8 input gets a converted UTF-8 copy.
  @text = CharlockHolmes::Converter.convert(@content, @encoding, 'UTF-8') unless @encoding.eql?('UTF-8')
end

Instance Attribute Details

#encoding ⇒ Object (readonly)

Returns the value of attribute encoding.



9
10
11
# File 'lib/txml_importer.rb', line 9

# Reader for the encoding name stored in @encoding.
#
# @return [String, nil] the current value of @encoding
def encoding
  @encoding
end

#file_path ⇒ Object (readonly)

Returns the value of attribute file_path.



9
10
11
# File 'lib/txml_importer.rb', line 9

# Reader for the file path stored in @file_path.
#
# @return [Object] the current value of @file_path
def file_path
  @file_path
end

Instance Method Details

#import ⇒ Object



46
47
48
49
50
# File 'lib/txml_importer.rb', line 46

# Reads the file, parses it, and returns what was collected in @doc.
#
# @return [Array] two elements: the :vals of @doc[:tu], then the :vals of @doc[:seg]
def import
  parse_file(read_file)
  @doc.values_at(:tu, :seg).map { |section| section[:vals] }
end

#stats ⇒ Object



37
38
39
40
41
42
43
44
# File 'lib/txml_importer.rb', line 37

# Runs the analysis matching the file's encoding and reports the counters.
#
# 'UTF-8' uses analyze_stats_utf_8; any other encoding uses analyze_stats_utf_16.
#
# @return [Hash] :tu_count and :seg_count from @doc's counters, plus
#   :language_pairs with duplicates removed
def stats
  encoding.eql?('UTF-8') ? analyze_stats_utf_8 : analyze_stats_utf_16

  units, segments = @doc.values_at(:tu, :seg)
  {
    tu_count: units[:counter],
    seg_count: segments[:counter],
    language_pairs: @doc[:language_pairs].uniq
  }
end