Class: Ve::Parse::JapaneseTransliterators

Inherits:
Ve::Parse
  • Object
show all
Defined in:
lib/providers/japanese_transliterators.rb

Constant Summary collapse

# Hiragana syllabic N; compared against input chunks and emitted for "nn" + vowel.
H_SYLLABIC_N =
'ん'.freeze
# Hiragana small tsu (sokuon); marks that the following consonant is doubled.
H_SMALL_TSU =
'っ'.freeze
# Lookup table from hiragana (one or two characters) to its romaji spelling.
# Keys are looked up two characters first, then one character, by
# transliterate_from_hira_to_latn.
HIRA_TO_LATN =
{
  "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e", "お"=>"o",
  "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
  "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go",
  "さ"=>"sa", "し"=>"shi", "す"=>"su", "せ"=>"se", "そ"=>"so",
  "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo",
  "た"=>"ta", "ち"=>"chi", "つ"=>"tsu", "て"=>"te", "と"=>"to",
  "だ"=>"da", "ぢ"=>"ji", "づ"=>"zu", "で"=>"de", "ど"=>"do",
  "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no",
  "は"=>"ha", "ひ"=>"hi", "ふ"=>"fu", "へ"=>"he", "ほ"=>"ho",
  "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo",
  "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po",
  "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo",
  "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
  "ら"=>"ra", "り"=>"ri", "る"=>"ru", "れ"=>"re", "ろ"=>"ro",
  "わ"=>"wa", "うぃ"=>"whi", "うぇ"=>"whe", "を"=>"wo",
  "ゑ"=>"wye", "ゐ"=>"wyi", "ー"=>"-", "ん"=>"n",

  "きゃ"=>"kya", "きゅ"=>"kyu", "きょ"=>"kyo", "きぇ"=>"kye", "きぃ"=>"kyi",
  "ぎゃ"=>"gya", "ぎゅ"=>"gyu", "ぎょ"=>"gyo", "ぎぇ"=>"gye", "ぎぃ"=>"gyi",
  "くぁ"=>"kwa", "くぃ"=>"kwi", "くぅ"=>"kwu", "くぇ"=>"kwe", "くぉ"=>"kwo",
  # "ぐぁ" belongs to the g-row like its neighbours (was misspelled "qwa").
  "ぐぁ"=>"gwa", "ぐぃ"=>"gwi", "ぐぅ"=>"gwu", "ぐぇ"=>"gwe", "ぐぉ"=>"gwo",
  "しゃ"=>"sha", "しぃ"=>"syi", "しゅ"=>"shu", "しぇ"=>"she", "しょ"=>"sho",
  "じゃ"=>"jya", "じゅ"=>"zyu", "じぇ"=>"zye", "じょ"=>"zyo", "じぃ"=>"zyi",
  "すぁ"=>"swa", "すぃ"=>"swi", "すぅ"=>"swu", "すぇ"=>"swe", "すぉ"=>"swo",
  "ちゃ"=>"tya", "ちゅ"=>"tyu", "ちぇ"=>"tye", "ちょ"=>"tyo", "ちぃ"=>"tyi",
  "ぢゃ"=>"dya", "ぢぃ"=>"dyi", "ぢゅ"=>"dyu", "ぢぇ"=>"dye", "ぢょ"=>"dyo",
  "つぁ"=>"tsa", "つぃ"=>"tsi", "つぇ"=>"tse", "つぉ"=>"tso", "てゃ"=>"tha",
  "てぃ"=>"thi", "てゅ"=>"thu", "てぇ"=>"the", "てょ"=>"tho", "とぁ"=>"twa",
  "とぃ"=>"twi", "とぅ"=>"twu", "とぇ"=>"twe", "とぉ"=>"two", "でゃ"=>"dha",
  "でぃ"=>"dhi", "でゅ"=>"dhu", "でぇ"=>"dhe", "でょ"=>"dho", "どぁ"=>"dwa",
  "どぃ"=>"dwi", "どぅ"=>"dwu", "どぇ"=>"dwe", "どぉ"=>"dwo", "にゃ"=>"nya",
  "にゅ"=>"nyu", "にょ"=>"nyo", "にぇ"=>"nye", "にぃ"=>"nyi", "ひゃ"=>"hya",
  "ひぃ"=>"hyi", "ひゅ"=>"hyu", "ひぇ"=>"hye", "ひょ"=>"hyo", "びゃ"=>"bya",
  "びぃ"=>"byi", "びゅ"=>"byu", "びぇ"=>"bye", "びょ"=>"byo", "ぴゃ"=>"pya",
  "ぴぃ"=>"pyi", "ぴゅ"=>"pyu", "ぴぇ"=>"pye", "ぴょ"=>"pyo", "ふぁ"=>"fwa",
  "ふぃ"=>"fyi", "ふぇ"=>"fye", "ふぉ"=>"fwo", "ふぅ"=>"fwu", "ふゃ"=>"fya",
  "ふゅ"=>"fyu", "ふょ"=>"fyo", "みゃ"=>"mya", "みぃ"=>"myi", "みゅ"=>"myu",
  "みぇ"=>"mye", "みょ"=>"myo", "りゃ"=>"rya", "りぃ"=>"ryi", "りゅ"=>"ryu",
  "りぇ"=>"rye", "りょ"=>"ryo",
  "ゔぁ"=>"va", "ゔぃ"=>"vyi", "ゔ"=>"vu", "ゔぇ"=>"vye", "ゔぉ"=>"vo",
  "ゔゃ"=>"vya", "ゔゅ"=>"vyu", "ゔょ"=>"vyo",
  "うぁ"=>"wha", "いぇ"=>"ye", "うぉ"=>"who",
  "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
  "ゕ"=>"xka", "ゖ"=>"xke", "ゎ"=>"xwa"
}.freeze
# Lookup table from lower-case romaji (one to three characters) to hiragana.
# Several spellings map to the same kana (e.g. 'si'/'shi', 'tu'/'tsu').
# Keys are looked up longest-first by transliterate_from_latn_to_hrkt.
LATN_TO_HIRA =
{
  'a'   => 'あ', 'i'   => 'い',                'u'  => 'う',               'e'  => 'え',   'o'  => 'お',
  'ka'  => 'か', 'ki'  => 'き',                'ku' => 'く',               'ke' => 'け',   'ko' => 'こ',
  'ga'  => 'が', 'gi'  => 'ぎ',                'gu' => 'ぐ',               'ge' => 'げ',   'go' => 'ご',
  'sa'  => 'さ', 'si'  => 'し', 'shi' => 'し', 'su' => 'す',               'se' => 'せ',   'so' => 'そ',
  'za'  => 'ざ', 'zi'  => 'じ', 'ji'  => 'じ', 'zu' => 'ず',               'ze' => 'ぜ',   'zo' => 'ぞ',
  'ta'  => 'た', 'ti'  => 'ち', 'chi' => 'ち', 'tu' => 'つ', 'tsu'=> 'つ', 'te' => 'て',   'to' => 'と',
  'da'  => 'だ', 'di'  => 'ぢ',                'du' => 'づ', 'dzu'=> 'づ', 'de' => 'で',   'do' => 'ど',
  'na'  => 'な', 'ni'  => 'に',                'nu' => 'ぬ',               'ne' => 'ね',   'no' => 'の',
  'ha'  => 'は', 'hi'  => 'ひ',                'hu' => 'ふ', 'fu' => 'ふ', 'he' => 'へ',   'ho' => 'ほ',
  'ba'  => 'ば', 'bi'  => 'び',                'bu' => 'ぶ',               'be' => 'べ',   'bo' => 'ぼ',
  'pa'  => 'ぱ', 'pi'  => 'ぴ',                'pu' => 'ぷ',               'pe' => 'ぺ',   'po' => 'ぽ',
  'ma'  => 'ま', 'mi'  => 'み',                'mu' => 'む',               'me' => 'め',   'mo' => 'も',
  'ya'  => 'や',                               'yu' => 'ゆ',                               'yo' => 'よ',
  'ra'  => 'ら', 'ri'  => 'り',                'ru' => 'る',               're' => 'れ',   'ro' => 'ろ',
  'la'  => 'ら', 'li'  => 'り',                'lu' => 'る',               'le' => 'れ',   'lo' => 'ろ',
  'wa'  => 'わ', 'wi'  => 'うぃ',                                          'we' => 'うぇ', 'wo' => 'を',
  'wye' => 'ゑ', 'wyi' => 'ゐ', '-' => 'ー',

  'n'   => 'ん', 'nn'  => 'ん', "n'"=> 'ん',

  'kya' => 'きゃ', 'kyu' => 'きゅ', 'kyo' => 'きょ', 'kye' => 'きぇ', 'kyi' => 'きぃ',
  'gya' => 'ぎゃ', 'gyu' => 'ぎゅ', 'gyo' => 'ぎょ', 'gye' => 'ぎぇ', 'gyi' => 'ぎぃ',
  'kwa' => 'くぁ', 'kwi' => 'くぃ', 'kwu' => 'くぅ', 'kwe' => 'くぇ', 'kwo' => 'くぉ',
  'gwa' => 'ぐぁ', 'gwi' => 'ぐぃ', 'gwu' => 'ぐぅ', 'gwe' => 'ぐぇ', 'gwo' => 'ぐぉ',
  # 'qwa' is kept as an accepted input spelling; the duplicated g-row keys
  # that used to follow it on this line were redundant and have been dropped.
  'qwa' => 'ぐぁ',

  'sya' => 'しゃ', 'syi' => 'しぃ', 'syu' => 'しゅ', 'sye' => 'しぇ', 'syo' => 'しょ',
  'sha' => 'しゃ',                  'shu' => 'しゅ', 'she' => 'しぇ', 'sho' => 'しょ',
  'ja'  => 'じゃ',                  'ju'  => 'じゅ', 'je'  => 'じぇ', 'jo'  => 'じょ',
  'jya' => 'じゃ', 'jyi' => 'じぃ', 'jyu' => 'じゅ', 'jye' => 'じぇ', 'jyo' => 'じょ',
  'zya' => 'じゃ', 'zyu' => 'じゅ', 'zyo' => 'じょ', 'zye' => 'じぇ', 'zyi' => 'じぃ',
  'swa' => 'すぁ', 'swi' => 'すぃ', 'swu' => 'すぅ', 'swe' => 'すぇ', 'swo' => 'すぉ',

  'cha' => 'ちゃ',                  'chu' => 'ちゅ', 'che' => 'ちぇ', 'cho' => 'ちょ',
  'cya' => 'ちゃ', 'cyi' => 'ちぃ', 'cyu' => 'ちゅ', 'cye' => 'ちぇ', 'cyo' => 'ちょ',
  'tya' => 'ちゃ', 'tyi' => 'ちぃ', 'tyu' => 'ちゅ', 'tye' => 'ちぇ', 'tyo' => 'ちょ',
  'dya' => 'ぢゃ', 'dyi' => 'ぢぃ', 'dyu' => 'ぢゅ', 'dye' => 'ぢぇ', 'dyo' => 'ぢょ',
  'tsa' => 'つぁ', 'tsi' => 'つぃ',                  'tse' => 'つぇ', 'tso' => 'つぉ',
  'tha' => 'てゃ', 'thi' => 'てぃ', 'thu' => 'てゅ', 'the' => 'てぇ', 'tho' => 'てょ',
  'twa' => 'とぁ', 'twi' => 'とぃ', 'twu' => 'とぅ', 'twe' => 'とぇ', 'two' => 'とぉ',
  'dha' => 'でゃ', 'dhi' => 'でぃ', 'dhu' => 'でゅ', 'dhe' => 'でぇ', 'dho' => 'でょ',
  'dwa' => 'どぁ', 'dwi' => 'どぃ', 'dwu' => 'どぅ', 'dwe' => 'どぇ', 'dwo' => 'どぉ',

  'nya' => 'にゃ', 'nyu' => 'にゅ', 'nyo' => 'にょ', 'nye' => 'にぇ', 'nyi' => 'にぃ',

  'hya' => 'ひゃ', 'hyi' => 'ひぃ', 'hyu' => 'ひゅ', 'hye' => 'ひぇ', 'hyo' => 'ひょ',
  'bya' => 'びゃ', 'byi' => 'びぃ', 'byu' => 'びゅ', 'bye' => 'びぇ', 'byo' => 'びょ',
  'pya' => 'ぴゃ', 'pyi' => 'ぴぃ', 'pyu' => 'ぴゅ', 'pye' => 'ぴぇ', 'pyo' => 'ぴょ',
  'fa'  => 'ふぁ', 'fi'  => 'ふぃ',                  'fe'  => 'ふぇ', 'fo'  => 'ふぉ',
  'fwa' => 'ふぁ', 'fwi' => 'ふぃ', 'fwu' => 'ふぅ', 'fwe' => 'ふぇ', 'fwo' => 'ふぉ',
  'fya' => 'ふゃ', 'fyi' => 'ふぃ', 'fyu' => 'ふゅ', 'fye' => 'ふぇ', 'fyo' => 'ふょ',

  'mya' => 'みゃ', 'myi' => 'みぃ', 'myu' => 'みゅ', 'mye' => 'みぇ', 'myo' => 'みょ',

  'rya' => 'りゃ', 'ryi' => 'りぃ', 'ryu' => 'りゅ', 'rye' => 'りぇ', 'ryo' => 'りょ',
  'lya' => 'りゃ', 'lyu' => 'りゅ', 'lyo' => 'りょ', 'lye' => 'りぇ', 'lyi' => 'りぃ',

  'va'  => 'ゔぁ', 'vi'  => 'ゔぃ', 'vu'  => 'ゔ',   've'  => 'ゔぇ',  'vo' => 'ゔぉ',
  'vya' => 'ゔゃ', 'vyi' => 'ゔぃ', 'vyu' => 'ゔゅ', 'vye' => 'ゔぇ', 'vyo' => 'ゔょ',
  'wha' => 'うぁ', 'whi' => 'うぃ', 'ye'  => 'いぇ', 'whe' => 'うぇ', 'who' => 'うぉ',

  'xa'  => 'ぁ', 'xi'   => 'ぃ', 'xu'  => 'ぅ', 'xe'  => 'ぇ', 'xo'   => 'ぉ',
  'xya' => 'ゃ', 'xyu'  => 'ゅ', 'xyo' => 'ょ',
  'xtu' => 'っ', 'xtsu' => 'っ',
  'xka' => 'ゕ', 'xke'  => 'ゖ', 'xwa' => 'ゎ',

  # Input shortcuts for a space and Japanese punctuation.
  '@@' => ' ', '#[' => '「', '#]' => '」', '#,' => '、', '#.' => '。', '#/' => '・',
}.freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Ve::Parse

#as_json

Constructor Details

#initialize(text) ⇒ JapaneseTransliterators

Returns a new instance of JapaneseTransliterators.



148
149
150
151
# File 'lib/providers/japanese_transliterators.rb', line 148

# Stores the text to be transliterated and starts with an empty token list.
#
# @param text the value kept in @text and read by the transliterate_* methods
def initialize(text)
  @text = text
  @tokens = []
end

Instance Attribute Details

#text ⇒ Object (readonly)

Returns the value of attribute text.



146
147
148
# File 'lib/providers/japanese_transliterators.rb', line 146

# Reader for @text, the value passed to the constructor.
def text
  @text
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



146
147
148
# File 'lib/providers/japanese_transliterators.rb', line 146

# Reader for @tokens, which the constructor initializes to an empty array.
def tokens
  @tokens
end

Instance Method Details

#transliterate_from_fullwidth_to_halfwidth ⇒ Object



262
263
264
265
# File 'lib/providers/japanese_transliterators.rb', line 262

# Maps full-width forms in @text to their half-width counterparts.
#
# Two passes through transpose_codepoints_in_range (defined elsewhere;
# presumably shifts every codepoint inside the range by the given offset):
#   1. U+FF01..U+FF5E moved down by 0xFEE0
#   2. U+3000 (ideographic space) moved down by 0x2FE0
def transliterate_from_fullwidth_to_halfwidth
  narrowed = transpose_codepoints_in_range(@text, -0xFEE0, 0xFF01..0xFF5E)
  transpose_codepoints_in_range(narrowed, -0x2FE0, 0x3000..0x3000)
end

#transliterate_from_halfwidth_to_fullwidth ⇒ Object



267
268
269
270
# File 'lib/providers/japanese_transliterators.rb', line 267

# Maps half-width ASCII in @text to the corresponding full-width forms.
#
# Two passes through transpose_codepoints_in_range (defined elsewhere;
# presumably shifts every codepoint inside the range by the given offset):
#   1. U+0021..U+007E moved up by 0xFEE0
#   2. U+0020 (space) moved up by 0x2FE0
def transliterate_from_halfwidth_to_fullwidth
  widened = transpose_codepoints_in_range(@text, 0xFEE0, 0x21..0x7E)
  transpose_codepoints_in_range(widened, 0x2FE0, 0x20..0x20)
end

#transliterate_from_hira_to_kana ⇒ Object



258
259
260
# File 'lib/providers/japanese_transliterators.rb', line 258

# Maps hiragana in @text to katakana.
#
# Passes the range U+3041..U+3096 with an offset of +0x60 to
# transpose_codepoints_in_range (defined elsewhere; presumably shifts every
# codepoint inside the range by that offset).
def transliterate_from_hira_to_kana
  hiragana = 0x3041..0x3096
  transpose_codepoints_in_range(@text, 0x60, hiragana)
end

#transliterate_from_hira_to_latn ⇒ Object



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# File 'lib/providers/japanese_transliterators.rb', line 158

# Converts the hiragana in @text to Hepburn-style romaji.
#
# Works on a copy of @text, consuming it from the front. At each position a
# two-character chunk is tried before a one-character chunk:
# * a small tsu is dropped and makes the next converted mora start with a
#   doubled first letter;
# * a syllabic N directly before ya/yu/yo is written "n'";
# * anything found in HIRA_TO_LATN is replaced by its romaji;
# * a single character that matches nothing is copied through unchanged.
#
# @return [String] the romanized text
def transliterate_from_hira_to_latn
  remaining = @text.dup
  output = ''
  double_next = false

  until remaining.empty?
    [2, 1].each do |width|
      chunk = remaining[0, width]

      # Sokuon: remember to double the next consonant, emit nothing now.
      if chunk == H_SMALL_TSU
        double_next = true
        remaining[0, width] = ''
        break
      end

      latin =
        if chunk == H_SYLLABIC_N && remaining[1, 1].match(/[やゆよ]/)
          # The apostrophe keeps "n" + "ya" distinct from "nya".
          "n'"
        else
          HIRA_TO_LATN[chunk] || ''
        end

      if latin.empty?
        # Only give up once the one-character attempt has failed as well.
        next unless width == 1

        output << chunk
        remaining[0, width] = ''
      else
        if double_next
          double_next = false
          output << latin[0, 1]
        end
        output << latin
        remaining[0, width] = ''
        break
      end
    end
  end

  output
end

#transliterate_from_hrkt_to_latn ⇒ Object



153
154
155
156
# File 'lib/providers/japanese_transliterators.rb', line 153

# Converts mixed hiragana/katakana text in @text to romaji.
#
# The text is first run through transliterate_from_kana_to_hira and the
# result is then romanized by transliterate_from_hira_to_latn. Because that
# second step reads @text, @text is swapped temporarily; it is restored
# afterwards (even on error) so the read-only `text` attribute keeps
# reporting the original input.
#
# @return the result of transliterate_from_hira_to_latn
def transliterate_from_hrkt_to_latn
  original_text = @text
  begin
    @text = transliterate_from_kana_to_hira
    transliterate_from_hira_to_latn
  ensure
    @text = original_text
  end
end

#transliterate_from_kana_to_hira ⇒ Object



254
255
256
# File 'lib/providers/japanese_transliterators.rb', line 254

# Maps katakana in @text to hiragana.
#
# Passes the range U+30A1..U+30F6 with an offset of -0x60 to
# transpose_codepoints_in_range (defined elsewhere; presumably shifts every
# codepoint inside the range by that offset).
def transliterate_from_kana_to_hira
  katakana = 0x30A1..0x30F6
  transpose_codepoints_in_range(@text, -0x60, katakana)
end

#transliterate_from_latn_to_hrkt ⇒ Object



200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
# File 'lib/providers/japanese_transliterators.rb', line 200

# Converts the romaji in @text to kana.
#
# Works on a copy of @text, consuming it from the front. At each position
# chunks of three, two and then one character are tried (case-insensitively):
# * "nn" followed by a vowel yields a syllabic N and consumes one letter, so
#   the remaining "n" + vowel forms the next mora;
# * a chunk found in LATN_TO_HIRA yields that kana;
# * "tch" or a doubled consonant yields a small tsu and consumes one letter;
# * a single character that matches nothing is copied through as written,
#   keeping its original case.
# A chunk that starts with an upper-case letter has its kana passed through
# transliterate_from_hira_to_kana instead of being emitted as hiragana.
#
# @return [String] the kana text
def transliterate_from_latn_to_hrkt
  romaji = @text.dup
  kana = ''

  # "m" directly before b/p is rewritten to "n" (case preserved) so that it
  # is picked up by the plain 'n' entry of the table.
  romaji.gsub!(/m([BbPp])/, 'n\1')
  romaji.gsub!(/M([BbPp])/, 'N\1')

  while romaji.length > 0
    [3, 2, 1].each do |length|
      mora = ''
      for_removal = length
      original = romaji[0, length]
      is_upper = !!(original =~ /\A\p{Upper}/)
      # Look up on a lower-cased copy; `original` is kept for pass-through.
      for_conversion = original.downcase

      if for_conversion.match(/nn[aiueo]/)
        # nna should kanafy to んな instead of んあ
        # This is what people expect for words like konna, anna, zannen
        mora = H_SYLLABIC_N
        for_removal = 1
      elsif LATN_TO_HIRA[for_conversion]
        # Generic cases
        mora = LATN_TO_HIRA[for_conversion]
      elsif for_conversion == 'tch' || (length == 2 && for_conversion.match(/([kgsztdnbpmyrlwc])\1/))
        # tch and double-consonants for small tsu
        mora = H_SMALL_TSU
        for_removal = 1
      end

      if mora.length > 0
        if is_upper
          # transliterate_from_hira_to_kana reads @text, so swap the mora in
          # for the duration of the call and always put the real text back.
          saved_text = @text
          begin
            @text = mora.dup
            kana << transliterate_from_hira_to_kana
          ensure
            @text = saved_text
          end
        else
          kana << mora
        end

        romaji[0, for_removal] = ''
        break
      elsif length == 1
        # Nothing found: pass the character through exactly as it was written.
        kana << original
        romaji[0, 1] = ''
      end
    end
  end

  kana
end