Class: Ve::Parse::MecabIpadic

Inherits:
Ve::Parse show all
Defined in:
lib/providers/mecab_ipadic.rb

Constant Summary collapse

PARSER =
%r{^ (.+?) \t (.+) }x
MEISHI =

PoS

'名詞'
KOYUUMEISHI =
'固有名詞'
DAIMEISHI =
'代名詞'
JODOUSHI =
'助動詞'
KAZU =
'数'
JOSHI =
'助詞'
SETTOUSHI =
'接頭詞'
DOUSHI =
'動詞'
KIGOU =
'記号'
FIRAA =
'フィラー'
SONOTA =
'その他'
KANDOUSHI =
'感動詞'
RENTAISHI =
'連体詞'
SETSUZOKUSHI =
'接続詞'
FUKUSHI =
'副詞'
SETSUZOKUJOSHI =
'接続助詞'
KEIYOUSHI =
'形容詞'
HIJIRITSU =

Pos2 and Inflection types

'非自立'
FUKUSHIKANOU =
'副詞可能'
SAHENSETSUZOKU =
'サ変接続'
KEIYOUDOUSHIGOKAN =
'形容動詞語幹'
NAIKEIYOUSHIGOKAN =
'ナイ形容詞語幹'
JODOUSHIGOKAN =
'助動詞語幹'
FUKUSHIKA =
'副詞化'
TAIGENSETSUZOKU =
'体言接続'
RENTAIKA =
'連体化'
TOKUSHU =
'特殊'
SETSUBI =
'接尾'
SETSUZOKUSHITEKI =
'接続詞的'
DOUSHIHIJIRITSUTEKI =
'動詞非自立的'
SAHEN_SURU =
'サ変・スル'
TOKUSHU_TA =
'特殊・タ'
TOKUSHU_NAI =
'特殊・ナイ'
TOKUSHU_TAI =
'特殊・タイ'
TOKUSHU_DESU =
'特殊・デス'
TOKUSHU_DA =
'特殊・ダ'
TOKUSHU_MASU =
'特殊・マス'
TOKUSHU_NU =
'特殊・ヌ'
FUHENKAGATA =
'不変化型'
JINMEI =
'人名'
MEIREI_I =
'命令i'
KAKARIJOSHI =
'係助詞'
NA =

Etc

'な'
NI =
'に'
TE =
'て'
DE =
'で'
BA =
'ば'
NN =
'ん'
SA =
'さ'

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods inherited from Ve::Parse

#as_json

Constructor Details

#initialize(text, output) ⇒ MecabIpadic

Returns a new instance of MecabIpadic.



72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/providers/mecab_ipadic.rb', line 72

# Builds @tokens for +text+ from the analyzer's line-oriented +output+.
#
# text   - the String that was analyzed.
# output - an indexable collection of output lines (must respond to
#          each_with_index and length). A line that is exactly "EOS" becomes
#          a :sentence_split token; a line matching PARSER
#          ("literal<TAB>comma,separated,features") becomes a :parsed token.
#
# Every token is a Hash carrying :raw (the right-stripped line), :type and
# :literal. Parsed tokens also carry the feature fields (:pos, :pos2, :pos3,
# :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon) and
# :characters, a Range of character offsets into +text+. Stretches of +text+
# that no parsed token accounts for are emitted as :unparsed tokens.
# Lines that match neither pattern are kept as tokens with only :raw set.
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |raw_line, index|
    # Strip a copy: the caller's strings stay untouched and may be frozen.
    line = raw_line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      # match returns nil when position lies beyond the end of text
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
      end
    end

    if line =~ %r{^ EOS $}x
      token[:type] = :sentence_split
      token[:literal] = ''
    elsif md = PARSER.match(line)
      # The parsed token
      token[:type] = :parsed
      token[:literal] = md[1]
      info = md[2].split(',')
      [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
        token[attr] = info[i]
      end

      # Anything unparsed preceding this token.
      # The match is nil when the literal cannot be found in the remaining
      # text; in that case no :unparsed token is emitted instead of crashing.
      unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
        position += unparsed_token[:literal].length
      end

      token[:characters] = (position..(position+token[:literal].length-1))
      position += token[:literal].length
    else
      # Unrecognized line: the token is kept with only :raw set
    end

    @tokens << token
  end
end

Instance Attribute Details

#text ⇒ Object (readonly)

Returns the value of attribute text.



70
71
72
# File 'lib/providers/mecab_ipadic.rb', line 70

# Reader for @text, the text string handed to the constructor.
def text
  @text
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



70
71
72
# File 'lib/providers/mecab_ipadic.rb', line 70

# Reader for @tokens, the array of token hashes built by the constructor.
def tokens
  @tokens
end

Instance Method Details

#sentences ⇒ Object



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# File 'lib/providers/mecab_ipadic.rb', line 353

# Reassembles the token literals into sentence strings.
#
# A sentence is closed either by a :sentence_split token or right after a
# token whose literal is the full stop '。' (the full stop stays in the
# sentence). Text left over after the last boundary is returned as a final
# sentence when it is non-empty.
#
# Returns an Array of Strings.
def sentences
  # TODO: Sentence objects that keep track of the sentence's tokens
  sentences = []
  current = ''

  @tokens.each do |token|
    if token[:type] == :sentence_split
      # NOTE(review): this also pushes an empty string when the split directly
      # follows a full stop; kept as-is because callers may rely on it.
      sentences << current
      current = ''
    elsif token[:literal] == '。'
      current << token[:literal]
      sentences << current
      current = ''
    else
      current << token[:literal]
    end
  end

  # In case there is no :sentence_split at the end
  sentences << current if current.length > 0

  sentences
end

#words ⇒ Object



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# File 'lib/providers/mecab_ipadic.rb', line 177

# Groups the :parsed tokens into Ve::Word objects.
#
# Tokens are visited in order. Depending on the token's part-of-speech
# fields (:pos, :pos2, :pos3, :inflection_type, :inflection_form, :lemma,
# :literal) each token does one of three things:
#
# * starts a new word,
# * is appended to the most recently built word (attach_to_previous), or
# * starts a new word that also swallows the following token (eat_next).
#
# Tokens whose :pos matches none of the known values become words with
# Ve::PartOfSpeech::TBD.
#
# Returns an Array of Ve::Word.
def words
  words = []
  tokens = @tokens.find_all { |t| t[:type] == :parsed }.to_enum
  previous = nil

  # This is becoming very big
  # Kernel#loop finishes quietly once tokens.next raises StopIteration.
  loop do
    token = tokens.next

    pos = nil
    grammar = nil
    eat_next = false             # merge the following token into the new word
    eat_lemma = true             # when eating, also append the following lemma
    attach_to_previous = false   # merge this token into the last word
    also_attach_to_lemma = false # when attaching, also append this lemma
    update_pos = false           # when attaching, overwrite the last word's pos

    case token[:pos]
    when MEISHI
      pos = Ve::PartOfSpeech::Noun

      case token[:pos2]
      when KOYUUMEISHI
        pos = Ve::PartOfSpeech::ProperNoun
      when DAIMEISHI
        pos = Ve::PartOfSpeech::Pronoun
      when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
        if tokens.more?
          following = tokens.peek
          if following[:inflection_type] == SAHEN_SURU
            pos = Ve::PartOfSpeech::Verb
            eat_next = true
          elsif following[:inflection_type] == TOKUSHU_DA
            pos = Ve::PartOfSpeech::Adjective
            if following[:inflection_form] == TAIGENSETSUZOKU
              eat_next = true
              eat_lemma = false
            end
          elsif following[:inflection_type] == TOKUSHU_NAI
            pos = Ve::PartOfSpeech::Adjective
            eat_next = true
          elsif following[:pos] == JOSHI && following[:literal] == NI
            pos = Ve::PartOfSpeech::Adverb
            eat_next = false
          end
        end
      when HIJIRITSU, TOKUSHU
        if tokens.more?
          following = tokens.peek
          case token[:pos3]
          when FUKUSHIKANOU
            if following[:pos] == JOSHI && following[:literal] == NI
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when JODOUSHIGOKAN
            if following[:inflection_type] == TOKUSHU_DA
              pos = Ve::PartOfSpeech::Verb
              grammar = :auxillary
              if following[:inflection_form] == TAIGENSETSUZOKU
                eat_next = true
              end
            elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when KEIYOUDOUSHIGOKAN
            pos = Ve::PartOfSpeech::Adjective
            if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
              eat_next = true
            end
          end
        end
      when KAZU
        # TODO: recurse and find following numbers and add to this word. Except non-numbers like 幾
        pos = Ve::PartOfSpeech::Number
        if !words.empty? && words[-1].part_of_speech == Ve::PartOfSpeech::Number
          attach_to_previous = true
          also_attach_to_lemma = true
        end
      when SETSUBI
        if token[:pos3] == TOKUSHU && token[:lemma] == SA
          attach_to_previous = true
          update_pos = true
          pos = Ve::PartOfSpeech::Noun
        else
          pos = Ve::PartOfSpeech::Suffix
        end
      when SETSUZOKUSHITEKI
        pos = Ve::PartOfSpeech::Conjunction
      when DOUSHIHIJIRITSUTEKI
        pos = Ve::PartOfSpeech::Verb
        grammar = :nominal
      end
    when SETTOUSHI
      # TODO: elaborate this when we have the "main part" feature for words?
      pos = Ve::PartOfSpeech::Prefix
    when JODOUSHI
      pos = Ve::PartOfSpeech::Postposition

      if (previous.nil? || previous[:pos2] != KAKARIJOSHI) &&
         [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU, TOKUSHU_NU].include?(token[:inflection_type])
        attach_to_previous = true
      elsif token[:inflection_type] == FUHENKAGATA && token[:lemma] == NN
        attach_to_previous = true
      elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
        pos = Ve::PartOfSpeech::Verb
      end
    when DOUSHI
      pos = Ve::PartOfSpeech::Verb
      if token[:pos2] == SETSUBI
        attach_to_previous = true
      elsif token[:pos2] == HIJIRITSU && token[:inflection_form] != MEIREI_I
        attach_to_previous = true
      end
    when KEIYOUSHI
      pos = Ve::PartOfSpeech::Adjective
    when JOSHI
      pos = Ve::PartOfSpeech::Postposition
      if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
        attach_to_previous = true
      end
    when RENTAISHI
      pos = Ve::PartOfSpeech::Determiner
    when SETSUZOKUSHI
      pos = Ve::PartOfSpeech::Conjunction
    when FUKUSHI
      pos = Ve::PartOfSpeech::Adverb
    when KIGOU
      pos = Ve::PartOfSpeech::Symbol
    when FIRAA, KANDOUSHI
      pos = Ve::PartOfSpeech::Interjection
    when SONOTA
      pos = Ve::PartOfSpeech::Other
    else
      # Unknown part of speech: pos stays nil and becomes TBD below
    end

    if attach_to_previous && !words.empty?
      words[-1].tokens << token
      words[-1].word << token[:literal]
      words[-1].extra[:reading] << (token[:reading] || '')
      words[-1].extra[:transcription] << (token[:hatsuon] || '')
      words[-1].lemma << token[:lemma] if also_attach_to_lemma
      words[-1].part_of_speech = pos if update_pos
    else
      pos = Ve::PartOfSpeech::TBD if pos.nil?
      word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
        :reading => token[:reading] || '',
        :transcription => token[:hatsuon] || '',
        :grammar => grammar
      }, {
        :reading_script => :kata,
        :transcription_script => :kata
      })

      if eat_next
        following = tokens.next
        word.tokens << following
        word.word << following[:literal]
        # The reading and pronunciation fields can be absent on a token, so
        # fall back to '' exactly as the attach_to_previous branch does.
        word.extra[:reading] << (following[:reading] || '')
        word.extra[:transcription] << (following[:hatsuon] || '')
        word.lemma << following[:lemma] if eat_lemma
      end

      words << word
    end

    previous = token
  end

  words
end