Class: Japanese
- Inherits:
-
Object
- Object
- Japanese
- Defined in:
- lib/lib/japanese.rb
Constant Summary collapse
# Replacement table used by Japanese.t: every key found in the text is
# replaced by its value with String#gsub!, pair by pair in the order written
# here.
#
# Both the hash and its replacement strings are frozen, so the shared table
# cannot be altered by accident (Hash#freeze alone is shallow).
#
# NOTE(review): as written here the ' ', ':', '!' and '?' keys are identical
# to their values (so those pairs are no-ops), and the opening-bracket keys
# carry a trailing space. Presumably the keys were full-width characters in
# the original file and were altered when this listing was extracted --
# confirm against lib/lib/japanese.rb before relying on them.
JAPANESE_PONCTUATION = {
  ' ' => ' ',
  '、' => ',',
  '。' => '.',
  ':' => ':',
  '!' => '!',
  '?' => '?',
  '〜' => '~',
  '…' => '...',
  '‥' => '..',
  '「 ' => ' \'',
  '」' => '\'',
  '『 ' => ' "',
  '』' => '"',
  '〝 ' => ' "',
  '〟' => '"',
  '( ' => ' (',
  ')' => ')',
  '【 ' => ' [',
  '】' => ']',
  '{ ' => ' {',
  '}' => '}',
}.each_value(&:freeze).freeze
Class Method Summary collapse
Class Method Details
.parse(text) ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/lib/japanese.rb', line 41
#
# Splits +text+ into tokens with MecabStandalone and returns them as
# [surface, part_of_speech, reading] triples.
#
# Every line of the analyzer output is expected to look like
# "surface<TAB>feature,feature,...". The part of speech is the first feature
# and the reading is the second-to-last one (presumably the kana reading
# column of the dictionary in use -- confirm against the MeCab dictionary).
#
# * A token whose part of speech is "助動詞" (auxiliary verb) is merged into
#   the preceding token: its surface and reading are appended to it. When
#   there is no preceding token it is kept as a token of its own.
# * A reading of '*' (no reading available) is replaced by the surface form,
#   for merged auxiliaries as well.
# * Lines without a TAB-separated feature column are ignored.
#
# @param text [String] text handed to MecabStandalone.parse
# @return [Array<Array>] one [surface, part_of_speech, reading] per token
def self.parse(text)
  rows = MecabStandalone.parse(text).split("\n")
  rows.pop # the last output line is not a token (presumably the "EOS" marker)

  rows.each_with_object([]) do |row, tokens|
    surface, features = row.split("\t")
    next if features.nil? # blank line or marker without a feature column

    columns = [surface, *features.split(',')]
    pos = columns[1]
    reading = columns[-2]
    reading = surface if reading == '*' # unknown reading: fall back to the surface

    if pos == '助動詞' && !tokens.empty?
      merged = tokens.last
      merged[0] += surface
      merged[-1] += reading
    else
      tokens << [surface, pos, reading]
    end
  end
end
.t(text) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/lib/japanese.rb', line 29
#
# Returns a copy of +text+ in which every token that has a Katakana reading
# is replaced by a space followed by Romaji.kana2romaji(reading), and every
# key of JAPANESE_PONCTUATION is then replaced by its value. +text+ itself is
# not modified.
#
# Tokens come from .parse in text order, so each surface is searched for
# only after the end of the previous token. This keeps a short surface from
# being replaced inside an earlier token that was left untouched. Tokens
# whose surface cannot be found from that position are skipped.
#
# @param text [String] Japanese text to transliterate
# @return [String] a new transliterated string
def self.t(text)
  latin = text.dup
  cursor = 0

  parse(text).each do |token|
    surface = token[0]
    reading = token[-1]
    next if surface.nil? || surface.empty?

    at = latin.index(surface, cursor)
    next if at.nil?

    if /\p{Katakana}/.match?(reading)
      romaji = ' ' + Romaji.kana2romaji(reading)
      # Indexed assignment inserts the text literally (no backslash handling).
      latin[at, surface.length] = romaji
      cursor = at + romaji.length
    else
      cursor = at + surface.length
    end
  end

  JAPANESE_PONCTUATION.each { |from, to| latin.gsub!(from, to) }
  latin
end