Class: Hanzi

Inherits:
Object
  • Object
show all
Defined in:
lib/hanzi.rb

Class Attribute Summary collapse

Class Method Summary collapse

Class Attribute Details

.data ⇒ Object

Returns the value of attribute data.



6
7
8
# File 'lib/hanzi.rb', line 6

# Reader for the class-level @data attribute.
#
# load_data assigns an Array to @data and appends one Hash per parsed
# dictionary line (keys :traditional, :simplified, :pinyin, :english).
# The lookup methods treat a nil value as "dictionary not loaded yet".
#
# @return [Array<Hash>, nil] the current value of @data
def data
  @data
end

.data_trie ⇒ Object

Returns the value of attribute data_trie.



7
8
9
# File 'lib/hanzi.rb', line 7

# Reader for the class-level @data_trie attribute.
#
# load_data assigns a Trie to @data_trie and adds keys made of a hanzi
# string followed by a counter ("<hanzi>0", "<hanzi>1", ...), each mapped
# to the position of the corresponding entry in @data.
#
# @return [Trie, nil] the current value of @data_trie
def data_trie
  @data_trie
end

Class Method Details

.load_data ⇒ Object



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# File 'lib/hanzi.rb', line 9

# Loads the CC-CEDICT dictionary file into memory. Does nothing when @data
# is already set, so it is safe to call repeatedly.
#
# Side effects:
# * @data becomes an Array with one Hash per dictionary line, holding the
#   keys :traditional, :simplified, :pinyin (downcased) and :english.
# * @data_trie becomes a Trie whose keys are "<hanzi><n>", where n is the
#   number of entries already stored for that hanzi, and whose values are
#   the entry's position in @data. Both the simplified and (when different)
#   the traditional form are indexed.
#
# Lines starting with '#' (comments) and blank lines are skipped.
def load_data
  return if @data
  @data = []
  @data_trie = Trie.new

  file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)

  # File.foreach closes the underlying file once iteration finishes or an
  # exception is raised; File.open(...).each_line would leave the handle
  # open until it is garbage collected.
  File.foreach(file_path) do |line|
    next if line.start_with?('#')
    line = line.force_encoding('utf-8')
    # A blank line has no space separator and would make the slicing below
    # raise a TypeError.
    next if line.strip.empty?

    # CC-CEDICT format:
    # Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
    line_data = {}
    line_data[:traditional] = line[0, line.index(' ')]

    line = line[line.index(' ') + 1, line.length]
    line_data[:simplified] = line[0, line.index(' ')]

    line = line[line.index('['), line.length]
    line_data[:pinyin] = line[1, line.index(']') - 1].downcase

    # Keep everything from the first '/' onwards, then take the text between
    # the first and the last '/'.
    line = line[line.index('/'), line.rindex('/')]
    line_data[:english] = line[1, line.rindex('/') - 1]

    # The entry is appended to @data at the end of this iteration, so its
    # position is the current size of @data.
    index = @data.size

    existing_count_simplified = 0
    if find_first_hanzi_match(line_data[:simplified])
      existing_count_simplified = matching_entries(line_data[:simplified]).count
    end
    @data_trie.add(line_data[:simplified] + existing_count_simplified.to_s, index)

    if line_data[:simplified] != line_data[:traditional]
      existing_count_traditional = 0
      if find_first_hanzi_match(line_data[:traditional])
        existing_count_traditional = matching_entries(line_data[:traditional]).count
      end

      @data_trie.add(line_data[:traditional] + existing_count_traditional.to_s, index)
    end

    @data << line_data
  end
end

.matching_entries(text) ⇒ Object



115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# File 'lib/hanzi.rb', line 115

# Collects every dictionary entry stored for +text+.
#
# Entries are indexed in @data_trie under "<text>0", "<text>1", ...; the
# keys are probed in order until one is missing.
#
# @param text [String] hanzi string to look up
# @return [Array] the matching entries from @data, empty when none exist
def matching_entries(text)
  load_data if @data.nil?

  entries = []
  suffix = 0
  # An id of 0 is truthy in Ruby, so only a missing key ends the scan.
  while (entry_id = @data_trie.get(text + suffix.to_s))
    entries << @data[entry_id]
    suffix += 1
  end

  entries
end

.to_english(text) ⇒ Object



94
95
96
97
98
99
# File 'lib/hanzi.rb', line 94

# Looks up the English definition of +text+.
#
# @param text [String] hanzi string to look up
# @return [Object, nil] the :english value of the first matching entry, or
#   nil when there is no match or the entry has no English text
def to_english(text)
  load_data if @data.nil?

  found = find_first_hanzi_match(text)
  return nil unless found

  english = found[:english]
  english ? english : nil
end

.to_pinyin(text, options = {}) ⇒ Object



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/hanzi.rb', line 56

# Converts the hanzi in +text+ to pinyin, copying every other character
# through unchanged.
#
# For each character in the range U+4E00..U+9FFF the longest dictionary
# match starting at that position is tried first (4 characters, then 3, 2,
# 1). A match contributes its pinyin with spaces removed; a character with
# no match is copied as-is.
#
# @param text [String] input text
# @param options [Hash] accepted but not read by this method
# @return [String] the converted text
def to_pinyin(text, options={})
  load_data if @data.nil?

  output = ''
  cursor = 0

  while (current = text[cursor])
    codepoint = current.ord
    entry = nil
    consumed = 0

    if codepoint >= 0x4E00 && codepoint <= 0x9FFF
      # Chinese character: try the longest candidate word first and shrink
      # down to a single hanzi.
      4.downto(1) do |span|
        consumed = span
        entry = find_first_hanzi_match(text[cursor, span])
        break if entry
      end
    end

    if entry
      output << entry[:pinyin].gsub("\s", '')
      cursor += consumed
    else
      # Either not a Chinese character or no dictionary match.
      output << current
      cursor += 1
    end
  end

  output
end

.to_simplified(text) ⇒ Object



101
102
103
104
105
106
# File 'lib/hanzi.rb', line 101

# Looks up the simplified form of +text+.
#
# @param text [String] hanzi string to look up
# @return [Object, nil] the :simplified value of the first matching entry,
#   or nil when there is no match or the entry has no simplified form
def to_simplified(text)
  load_data if @data.nil?

  found = find_first_hanzi_match(text)
  return nil unless found

  found[:simplified] || nil
end

.to_traditional(text) ⇒ Object



108
109
110
111
112
113
# File 'lib/hanzi.rb', line 108

# Looks up the traditional form of +text+.
#
# @param text [String] hanzi string to look up
# @return [Object, nil] the :traditional value of the first matching entry,
#   or nil when there is no match or the entry has no traditional form
def to_traditional(text)
  load_data if @data.nil?

  found = find_first_hanzi_match(text)
  if found
    traditional = found[:traditional]
    return traditional if traditional
  end

  nil
end