Module: Neologdish::Normalizer
- Defined in:
- lib/neologdish/normalizer.rb,
lib/neologdish/normalizer/version.rb
Overview
A module that normalizes Japanese text according to the neologd convention.
Constant Summary collapse
- VERSION =
: String
'0.1.0'
Class Method Summary collapse
-
.normalize(str, override_conversion_map = {}) ⇒ Object
Normalize the given text.
Class Method Details
.normalize(str, override_conversion_map = {}) ⇒ Object
Normalize the given text.
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# File 'lib/neologdish/normalizer.rb', line 74

# Normalize the given text.
#
# Every character is first looked up in CONVERSION_MAP (merged with
# +override_conversion_map+, whose entries win). After that:
#
# * a character found in HALF_WIDTH_KANA_MAP is replaced by the mapped value;
#   the value is held back for one step so that a following sound mark can be
#   merged into it through DAKUON_KANA_MAP / HANDAKUON_KANA_MAP;
# * runs of ' ' or 'ー' are squeezed to a single character;
# * a space is re-emitted only when the characters on both sides of it are in
#   LATIN_MAP; every other space is dropped;
# * the result is stripped of leading and trailing whitespace.
#
# @param str [String] the text to normalize
# @param override_conversion_map [Hash{String => String}] per-call conversions
#   merged over CONVERSION_MAP
# @return [String] the normalized text
def normalize(str, override_conversion_map = {})
  conversion_map = CONVERSION_MAP.merge(override_conversion_map)
  squeezable = [' ', 'ー'].freeze # built once instead of once per character
  squeezee = ''
  prev_latin = false
  whitespace_encountered = false
  # Converted kana waiting to be emitted (alone or merged with a sound mark).
  pending_kana = nil

  normalized = str.each_char.map do |c|
    prefix = ''
    c = conversion_map[c] || c

    # normalize the half-width kana to full-width
    if pending_kana
      if (c == '゙' && (k = DAKUON_KANA_MAP[pending_kana])) ||
         (c == '゚' && (k = HANDAKUON_KANA_MAP[pending_kana]))
        # The sound mark is absorbed into the merged kana.
        c = ''
        prefix = k
      else
        prefix = pending_kana
      end
    end
    pending_kana = HALF_WIDTH_KANA_MAP[c]
    c = '' if pending_kana

    # squash consecutive special characters (space or long-vowel)
    if squeezable.include?(c)
      if squeezee == c
        c = ''
      else
        squeezee = c
      end
    else
      squeezee = ''
    end

    # remove the white space character in the middle of non-latin characters
    is_latin = LATIN_MAP[c] || false
    if c == ' '
      whitespace_encountered = prev_latin
      c = ''
    else
      prefix = ' ' if is_latin && whitespace_encountered
      # An empty `c` keeps the pending space alive only when it is a
      # squeezed-away space. A kana that was just held back is a real
      # (non-latin) character: it must cancel the pending space, otherwise the
      # space assignment above would overwrite the kana flushed via `prefix`
      # on the next latin character.
      whitespace_encountered &&= c.empty? && pending_kana.nil?
    end
    prev_latin = is_latin

    prefix + c
  end.join + (pending_kana || '') # flush a kana held back at end of input

  normalized.strip
end