Module: HangulTools
- Defined in:
- lib/hangul_tools.rb,
lib/hangul_tools/version.rb
Overview
Courtesy of algorithms described at: gernot-katzers-spice-pages.com/var/korean_hangul_unicode.html
Defined Under Namespace
Modules: Version
Constant Summary collapse
- LEADS =
[ nil, 'g', 'gg', 'n', 'd', 'dd', 'r', 'm', 'b', 'bb' ,'s', 'ss', nil, 'j', 'jj', 'ch', 'k', 't', 'p', 'h' ]
- TAILS =
[ nil, 'g', 'gg', 'gs', 'n', 'nj', 'nh', 'd', 'l', 'lg', 'lm', 'lb', 'ls', 'lt', 'lp', 'lh', 'm', 'b', 'bs', 's', 'ss', 'ng', 'j', 'ch', 'k', 't', 'p', 'h' ]
- VOWELS =
{ revised: [ nil, 'a', 'ae', 'ya', 'yae', 'eo', 'e', 'yeo', 'ye', 'o', 'wa', 'wae', 'oe', 'yo', 'u', 'weo', 'we', 'wi', 'yu', 'eu', 'yi', 'i' ], mccune_reischauer: [ nil, 'a', 'ae', 'ya', 'yae', 'ŏ', 'e', 'yŏ', 'ye', 'o', 'wa', 'wae', 'oe', 'yo', 'u', 'wŏ', 'we', 'wi', 'yu', 'ŭ', 'ŭi', 'i' ] }
- BLENDS =
{ revised: {}, mccune_reischauer: { "si" => "shi", "sy" => "shy", "swi" => "shwi" } }
Class Method Summary collapse
-
.decompose(text) ⇒ Object
It is assumed that `text` contains nothing but hangul codepoints.
- .matrices ⇒ Object
- .parse_matrix(lines) ⇒ Object
- .romanize(text, system: :revised, initial: :initial) ⇒ Object
- .romanize_with_system(text, system, voiced) ⇒ Object
Class Method Details
.decompose(text) ⇒ Object
It is assumed that `text` contains nothing but hangul codepoints.
22 23 24 25 26 27 28 29 30 |
# File 'lib/hangul_tools.rb', line 22
# Breaks each codepoint of +text+ into a [lead, vowel, tail] index triple.
#
# The arithmetic assumes +text+ contains nothing but hangul codepoints;
# nothing is validated, so any other codepoint produces indices that do not
# correspond to a real jamo.
#
# @param text [String] a run of hangul syllables
# @return [Array<Array<Integer>>] one [lead, vowel, tail] triple per
#   codepoint; lead and vowel are offset by one, tail starts at zero
def self.decompose(text)
  text.codepoints.map do |codepoint|
    offset = codepoint - 44032 # 44032 == 0xAC00

    # 28 tail slots per lead/vowel pair, 21 vowels (588 / 28) per lead.
    pair, tail = offset.divmod(28)
    lead, vowel = pair.divmod(21)

    [lead + 1, vowel + 1, tail]
  end
end
.matrices ⇒ Object
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/hangul_tools.rb', line 71
# Returns the matrices embedded after the `__END__` line of this source
# file, parsing them on first use and memoizing the result in @matrices.
#
# Within the embedded data, a line of the form `name:` opens a new section
# and every following non-blank line belongs to it; each section's lines are
# passed to parse_matrix and the result is stored under the section name
# as a Symbol.
#
# @return [Hash{Symbol => Hash}] section name => parsed matrix
def self.matrices
  @matrices ||= begin
    source = File.read(__FILE__).lines
    marker = source.index("__END__\n")

    tables = {}
    name = nil
    rows = nil

    source.drop(marker + 1).each do |row|
      if (heading = row.match(/^(\w+):$/))
        # A new heading closes the section collected so far.
        tables[name.to_sym] = parse_matrix(rows) if rows
        name = heading[1]
        rows = []
      elsif row !~ /^$/
        rows << row
      end
    end

    # The final section has no heading after it to trigger the flush.
    tables[name.to_sym] = parse_matrix(rows) if rows
    tables
  end
end
.parse_matrix(lines) ⇒ Object
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# File 'lib/hangul_tools.rb', line 91
# Turns the whitespace-separated rows of one matrix into a nested Hash.
#
# The first row is the header: its first token is discarded and the
# remaining tokens become column keys, with '_' read as nil and 'final' as
# :final. Every later row starts with its row key ('initial' => :initial,
# 'voiced' => :voiced, '_' => nil, anything else kept as a String) followed
# by that row's cells, where '_' again stands for nil.
#
# @param lines [Array<String>] header row followed by the data rows
# @return [Hash] row key => { column key => cell }
def self.parse_matrix(lines)
  header, *rows = lines

  columns = header.split(/\s+/)[1..-1].map do |token|
    case token
    when '_' then nil
    when 'final' then :final
    else token
    end
  end

  rows.each_with_object({}) do |row, table|
    label, *cells = row.split(/\s+/)

    row_key =
      case label
      when 'initial' then :initial
      when 'voiced' then :voiced
      when '_' then nil
      else label
      end

    cells = cells.map { |cell| cell == '_' ? nil : cell }
    table[row_key] = Hash[columns.zip(cells)]
  end
end
.romanize(text, system: :revised, initial: :initial) ⇒ Object
5 6 7 8 9 10 11 12 13 14 15 16 |
# File 'lib/hangul_tools.rb', line 5
# Romanizes every run of hangul syllables (U+AC00..U+D7A3) in +text+ and
# passes all other characters through unchanged.
#
# @param text [String] text that may mix hangul with other characters
# @param system [Symbol] romanization system, forwarded to
#   romanize_with_system (:revised or :mccune_reischauer)
# @param initial [Symbol] row selector forwarded to romanize_with_system for
#   a hangul run that starts +text+; every later run is forwarded :voiced
# @return [String] the romanized text
def self.romanize(text, system: :revised, initial: :initial)
  # The scan splits text into alternating hangul / non-hangul chunks that
  # together cover the whole string, so idx > 0 means "not at the start".
  text.scan(/[\uAC00-\uD7a3]+|[^\uAC00-\uD7a3]+/).map.with_index do |chunk, idx|
    next chunk unless chunk =~ /[\uAC00-\uD7a3]/

    romanize_with_system(chunk, system, idx > 0 ? :voiced : initial)
  end.join
end
.romanize_with_system(text, system, voiced) ⇒ Object
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/hangul_tools.rb', line 32
# Romanizes +text+, a run of hangul syllables, using the matrix, vowel table
# and blend substitutions registered under +system+.
#
# @param text [String] hangul-only text (it is handed to decompose)
# @param system [Symbol] key into matrices, VOWELS and BLENDS
# @param voiced [Symbol] matrix row used for the first syllable's lead
#   consonant, in place of a preceding tail
# @return [String] the romanized text
def self.romanize_with_system(text, system, voiced)
  matrix = matrices[system]
  vowels = VOWELS[system]
  syllables = decompose(text)
  last_index = syllables.length - 1

  pieces = syllables.each_with_index.flat_map do |(lead, vowel, tail), idx|
    # The matrix row is the previous syllable's tail, so a non-final tail is
    # rendered together with the lead consonant that follows it.
    preceding = idx.zero? ? voiced : TAILS[syllables[idx - 1][2].to_i]

    sounds = [(matrix[preceding] || {})[LEADS[lead]], vowels[vowel]]
    # Only the last syllable's tail has no following lead to combine with.
    sounds << (matrix[TAILS[tail]] || {})[:final] if idx == last_index
    sounds
  end

  BLENDS[system].reduce(pieces.compact.join) do |romanized, (pattern, blend)|
    romanized.gsub(pattern, blend)
  end
end