Class: Japanese

Inherits:
Object
  • Object
show all
Defined in:
lib/lib/japanese.rb

Constant Summary collapse

# Maps Japanese / full-width punctuation to an ASCII replacement.
# Opening quotes and brackets get a leading space so they stay separated
# from the preceding romanized word.
#
# Keys are written as \u escapes so they survive tools that strip or
# normalize non-ASCII punctuation; an empty-string key must never appear
# here, because gsub with '' inserts the replacement between every character.
#
# NOTE(review): the exact code points were chosen to match the ASCII value on
# each line (e.g. which quote pairs map to "'" vs '"') — confirm against the
# text this transliterator is expected to handle.
JAPANESE_PONCTUATION =
{
    "\u3000" => ' ',    # ideographic space
    "\u3001" => ',',    # ideographic comma
    "\u3002" => '.',    # ideographic full stop
    "\uFF1A" => ':',    # full-width colon
    "\uFF01" => '!',    # full-width exclamation mark
    "\uFF1F" => '?',    # full-width question mark
    "\u301C" => '~',    # wave dash
    "\u2026" => '...',  # horizontal ellipsis
    "\u2025" => '..',   # two-dot leader
    "\u2018" => ' \'',  # left single quotation mark
    "\u2019" => '\'',   # right single quotation mark
    "\u201C" => ' "',   # left double quotation mark
    "\u201D" => '"',    # right double quotation mark
    "\u300C" => ' "',   # left corner bracket
    "\u300D" => '"',    # right corner bracket
    "\uFF08" => ' (',   # full-width left parenthesis
    "\uFF09" => ')',    # full-width right parenthesis
    "\uFF3B" => ' [',   # full-width left square bracket
    "\uFF3D" => ']',    # full-width right square bracket
    "\uFF5B" => ' {',   # full-width left curly bracket
    "\uFF5D" => '}',    # full-width right curly bracket
}.freeze

Class Method Summary collapse

Class Method Details

.parse(text) ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/lib/japanese.rb', line 41

# Tokenizes +text+ with MecabStandalone and returns one entry per token.
#
# Each output line is expected to look like "surface<TAB>comma,separated,features".
# Lines without a feature column (the "EOS" markers, blank lines) are skipped
# wherever they occur, so multi-line input no longer raises.
#
# NOTE(review): the second-to-last feature is taken as the reading; this
# presumably matches the dictionary's feature layout — confirm.
#
# @param text [String] text handed to MecabStandalone.parse
# @return [Array<Array>] entries of the form [surface, part_of_speech, reading];
#   the reading falls back to the surface when the dictionary reports none ('*')
def self.parse(text)
  rows = MecabStandalone.parse(text)
    .split("\n")
    .map { |line| line.split("\t") }
    .select { |surface, features| surface && features }

  rows.each_with_object([]) do |(surface, features), tokens|
    entry = [surface].concat(features.split(','))
    part_of_speech = entry[1]
    reading = entry[-2]
    # '*' marks a missing reading; use the surface form for those tokens.
    reading = surface if reading.nil? || reading == '*'

    if part_of_speech == "助動詞" && !tokens.empty?
      # An auxiliary verb is glued onto the token before it, in both the
      # surface and the reading. When there is no previous token it is kept
      # as a token of its own instead of raising.
      previous = tokens.last
      previous[0] += surface
      previous[-1] += reading
    else
      tokens << [surface, part_of_speech, reading]
    end
  end
end

.t(text) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
# File 'lib/lib/japanese.rb', line 29

# Transliterates +text+: every token whose reading contains Katakana is
# replaced by a space followed by Romaji.kana2romaji(reading); all other text
# is copied through unchanged. Finally each JAPANESE_PONCTUATION key is
# replaced by its value.
#
# Tokens are located left to right with a moving cursor, so a token is always
# replaced at its own position even when the same characters occur earlier
# inside a token that is left untouched. Building the result from pieces also
# means the romanized text is never interpreted as a sub/gsub replacement
# pattern (no backslash handling).
#
# @param text [String] text to transliterate; it is not modified
# @return [String] the transliterated text
def self.t(text)
  pieces = []
  cursor = 0

  parse(text).each do |token|
    surface = token[0]
    reading = token[-1]
    start = text.index(surface, cursor)
    # A surface that cannot be found after the cursor (e.g. a merged token
    # whose parts were not adjacent in the text) is left as-is.
    next if start.nil?

    pieces << text[cursor...start]
    pieces << (/\p{Katakana}/ =~ reading ? ' ' + Romaji.kana2romaji(reading) : surface)
    cursor = start + surface.length
  end
  pieces << text[cursor..-1]

  latin = pieces.join
  JAPANESE_PONCTUATION.each { |mark, ascii| latin.gsub!(mark, ascii) }
  latin
end