Module: Neologdish::Normalizer

Defined in:
lib/neologdish/normalizer.rb,
lib/neologdish/normalizer/version.rb

Overview

A Japanese text normalizer module that follows the neologd normalization convention.

Constant Summary collapse

VERSION =

: String

'0.1.0'

Class Method Summary collapse

Class Method Details

.normalize(str, override_conversion_map = {}) ⇒ String

Normalize the given text.



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# File 'lib/neologdish/normalizer.rb', line 74

# Normalize the given text.
#
# Every character is first translated through CONVERSION_MAP (with
# +override_conversion_map+ merged on top of it). After that, half-width kana
# are replaced by their HALF_WIDTH_KANA_MAP value (fused with a following
# voiced / semi-voiced sound mark when DAKUON_KANA_MAP / HANDAKUON_KANA_MAP has
# an entry), runs of spaces and of long-vowel marks are collapsed to one, a
# space is kept only when the characters on both sides are Latin according to
# LATIN_MAP, and the result is stripped.
#
# @param str [String] the text to normalize
# @param override_conversion_map [Hash{String => String}] per-call entries
#   that take precedence over CONVERSION_MAP
# @return [String] the normalized text
def normalize(str, override_conversion_map = {})
  conversion_map = CONVERSION_MAP.merge(override_conversion_map)

  # The marks are spelled as escapes because they are visually ambiguous and
  # easily lost when the source passes through tooling.
  # NOTE(review): code points restored from context (the DAKUON/HANDAKUON map
  # names and the "long-vowel" comment) -- confirm against the repository.
  voiced_mark = "\uFF9E"      # half-width dakuten
  semi_voiced_mark = "\uFF9F" # half-width handakuten
  long_vowel_mark = "\u30FC"  # katakana-hiragana prolonged sound mark

  squeezee = ''
  prev_latin = false
  whitespace_encountered = false
  # Replacement for a half-width kana seen on the previous iteration. Emission
  # is deferred by one character so a following sound mark can be fused in.
  encountered_half_width_kana = nil
  normalized = str.chars.map do |c|
    prefix = ''
    c = conversion_map[c] || c

    # normalize the Half-width kana to full-width
    if encountered_half_width_kana
      if (c == voiced_mark && (k = DAKUON_KANA_MAP[encountered_half_width_kana])) ||
         (c == semi_voiced_mark && (k = HANDAKUON_KANA_MAP[encountered_half_width_kana]))
        # the mark is consumed; emit the fused kana instead
        c = ''
        prefix = k
      else
        # no fusable mark follows; flush the pending kana unchanged
        prefix = encountered_half_width_kana
      end
    end

    if (encountered_half_width_kana = HALF_WIDTH_KANA_MAP[c])
      c = ''
    end

    # squash consecutive special characters (space or long-vowel)
    if [' ', long_vowel_mark].include?(c)
      if squeezee == c
        c = ''
      else
        squeezee = c
      end
    else
      squeezee = ''
    end

    # remove the white space character in the middle of non-latin characters
    is_latin = LATIN_MAP[c] || false
    if c == ' '
      whitespace_encountered = prev_latin
      c = ''
    else
      prefix = ' ' if is_latin && whitespace_encountered
      # A removed character (squeezed space, deleted symbol) keeps the pending
      # space alive, but a deferred half-width kana is a real non-latin
      # character: it must cancel the space, otherwise the ' ' prefix above
      # would overwrite the kana when it is flushed on the next iteration.
      whitespace_encountered &&= (c == '' && !encountered_half_width_kana)
    end
    prev_latin = is_latin

    prefix + c
  end.join + (encountered_half_width_kana || '')

  normalized.strip
end