Class: Hanzi
- Inherits:
-
Object
- Object
- Hanzi
- Defined in:
- lib/hanzi.rb
Class Attribute Summary collapse
-
.data ⇒ Object
Returns the value of attribute data.
-
.data_trie ⇒ Object
Returns the value of attribute data_trie.
Class Method Summary collapse
- .load_data ⇒ Object
- .matching_entries(text) ⇒ Object
- .to_english(text) ⇒ Object
- .to_pinyin(text, options = {}) ⇒ Object
- .to_simplified(text) ⇒ Object
- .to_traditional(text) ⇒ Object
Class Attribute Details
.data ⇒ Object
Returns the value of attribute data.
6 7 8 |
# File 'lib/hanzi.rb', line 6
#
# Reader for the class-level `data` attribute.
#
# @return [Object] the current value of @data; nil while it has not been
#   assigned. `load_data` assigns it an Array of entry hashes.
def data
  @data
end
.data_trie ⇒ Object
Returns the value of attribute data_trie.
7 8 9 |
# File 'lib/hanzi.rb', line 7
#
# Reader for the class-level `data_trie` attribute.
#
# @return [Object] the current value of @data_trie; nil while it has not been
#   assigned. `load_data` assigns it a Trie mapping "<word><n>" keys to
#   positions in @data.
def data_trie
  @data_trie
end
Class Method Details
.load_data ⇒ Object
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/hanzi.rb', line 9
#
# Loads the CC-CEDICT dictionary file into memory, once.
#
# The file is read from lib/data/cedict_ts.u8, resolved relative to this
# source file. Every line that does not start with '#' becomes a hash with the
# keys :traditional, :simplified, :pinyin (downcased) and :english, which is
# appended to @data. The entry's position in @data is stored in @data_trie
# under the key "<word><n>", where n is the number of entries already stored
# for that word, so repeated words get the keys "word0", "word1", ...
# The simplified form is always indexed; the traditional form is indexed as
# well when it differs from the simplified one.
#
# Calling the method again once @data is set does nothing.
#
# @return [nil]
# @raise [SystemCallError] if the dictionary file cannot be read
def load_data
  return if @data

  @data = []
  @data_trie = Trie.new

  file_path = File.expand_path('../../lib/data/cedict_ts.u8', __FILE__)

  # File.foreach closes the file once iteration ends (or raises), unlike
  # File.open(path).each_line, which leaves the handle open.
  File.foreach(file_path) do |line|
    next if line.start_with?('#')

    line = line.force_encoding('utf-8')

    # CC-CEDICT format:
    # Traditional Simplified [pin1 yin1] /English equivalent 1/equivalent 2/
    traditional, _, rest = line.partition(' ')
    simplified, _, rest = rest.partition(' ')

    open_bracket = rest.index('[')
    close_bracket = rest.index(']', open_bracket)
    # The definitions run from the first '/' after the pinyin block up to the
    # last '/' on the line; inner slashes are kept as separators.
    first_slash = rest.index('/', open_bracket)
    last_slash = rest.rindex('/')

    entry = {
      traditional: traditional,
      simplified: simplified,
      pinyin: rest[open_bracket + 1, close_bracket - open_bracket - 1].downcase,
      english: rest[first_slash + 1, last_slash - first_slash - 1]
    }

    # The entry is appended below, so its id is the current size of @data.
    entry_id = @data.size

    # uniq keeps the simplified form first and drops the traditional form
    # when both spellings are identical.
    [simplified, traditional].uniq.each do |word|
      existing_count = find_first_hanzi_match(word) ? matching_entries(word).count : 0
      @data_trie.add(word + existing_count.to_s, entry_id)
    end

    @data << entry
  end
end
.matching_entries(text) ⇒ Object
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
# File 'lib/hanzi.rb', line 115
#
# Collects every dictionary entry stored for +text+.
#
# Entries are indexed in @data_trie under the keys "<text>0", "<text>1", ...;
# the suffixes are probed in order and the walk stops at the first key the
# trie does not know.
#
# @param text [String] the word to look up
# @return [Array] the matching elements of @data, in suffix order; empty when
#   "<text>0" is not in the trie
def matching_entries(text)
  load_data if @data.nil?

  results = []
  loop do
    # The next suffix to probe always equals the number of hits so far.
    id = @data_trie.get(text + results.size.to_s)
    break unless id

    results << @data[id]
  end
  results
end
.to_english(text) ⇒ Object
94 95 96 97 98 99 |
# File 'lib/hanzi.rb', line 94
#
# Looks up the English definition text for +text+.
#
# @param text [String] the word to look up
# @return [String, nil] the :english field of the entry returned by
#   find_first_hanzi_match, or nil when there is no such entry
def to_english(text)
  load_data if @data.nil?

  entry = find_first_hanzi_match(text)
  entry[:english] if entry
end
.to_pinyin(text, options = {}) ⇒ Object
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
# File 'lib/hanzi.rb', line 56
#
# Converts the Chinese characters in +text+ to pinyin, leaving every other
# character untouched.
#
# Characters outside U+4E00..U+9FFF are copied through as-is. At each Chinese
# character the longest dictionary word starting there (at most 4 characters)
# is looked up via find_first_hanzi_match; its :pinyin is appended with the
# spaces between syllables removed. A Chinese character with no dictionary
# match is copied through unchanged.
#
# @param text [String] the text to convert
# @param options [Hash] not read by this method; kept so existing callers
#   that pass it keep working
# @return [String] a new string with the converted text
def to_pinyin(text, options = {})
  load_data if @data.nil?

  result = ''
  pos = 0

  while (char = text[pos])
    unless char.ord.between?(0x4E00, 0x9FFF)
      # it's not a chinese character.
      result << char
      pos += 1
      next
    end

    # it's a chinese character. start by trying to find a long word match,
    # and if it fails, all the way down to a single hanzi.
    match = nil
    match_length = 0
    # Never look past the end of the text: near the end the longer windows
    # would all be the same (shorter) string.
    longest = [4, text.length - pos].min
    longest.downto(1) do |length|
      match = find_first_hanzi_match(text[pos, length])
      next unless match

      match_length = length
      break
    end

    if match
      result << match[:pinyin].delete(' ')
      pos += match_length
    else
      result << char
      pos += 1
    end
  end

  result
end
.to_simplified(text) ⇒ Object
101 102 103 104 105 106 |
# File 'lib/hanzi.rb', line 101
#
# Looks up the simplified-character spelling of +text+.
#
# @param text [String] the word to look up
# @return [String, nil] the :simplified field of the entry returned by
#   find_first_hanzi_match, or nil when there is no such entry
def to_simplified(text)
  load_data if @data.nil?

  entry = find_first_hanzi_match(text)
  entry[:simplified] if entry
end
.to_traditional(text) ⇒ Object
108 109 110 111 112 113 |
# File 'lib/hanzi.rb', line 108
#
# Looks up the traditional-character spelling of +text+.
#
# @param text [String] the word to look up
# @return [String, nil] the :traditional field of the entry returned by
#   find_first_hanzi_match, or nil when there is no such entry
def to_traditional(text)
  load_data if @data.nil?

  entry = find_first_hanzi_match(text)
  entry[:traditional] if entry
end