Class: Ve::Parse::MecabIpadic
- Defined in:
- lib/providers/mecab_ipadic.rb
Constant Summary collapse
- PARSER =
%r{^ (.+?) \t (.+) }x
- MEISHI =
# PoS
'名詞'
- KOYUUMEISHI =
'固有名詞'
- DAIMEISHI =
'代名詞'
- JODOUSHI =
'助動詞'
- KAZU =
'数'
- JOSHI =
'助詞'
- SETTOUSHI =
'接頭詞'
- DOUSHI =
'動詞'
- KIGOU =
'記号'
- FIRAA =
'フィラー'
- SONOTA =
'その他'
- KANDOUSHI =
'感動詞'
- RENTAISHI =
'連体詞'
- SETSUZOKUSHI =
'接続詞'
- FUKUSHI =
'副詞'
- SETSUZOKUJOSHI =
'接続助詞'
- KEIYOUSHI =
'形容詞'
- HIJIRITSU =
# Pos2 and Inflection types
'非自立'
- FUKUSHIKANOU =
'副詞可能'
- SAHENSETSUZOKU =
'サ変接続'
- KEIYOUDOUSHIGOKAN =
'形容動詞語幹'
- NAIKEIYOUSHIGOKAN =
'ナイ形容詞語幹'
- JODOUSHIGOKAN =
'助動詞語幹'
- FUKUSHIKA =
'副詞化'
- TAIGENSETSUZOKU =
'体言接続'
- RENTAIKA =
'連体化'
- TOKUSHU =
'特殊'
- SETSUBI =
'接尾'
- SETSUZOKUSHITEKI =
'接続詞的'
- DOUSHIHIJIRITSUTEKI =
'動詞非自立的'
- SAHEN_SURU =
'サ変・スル'
- TOKUSHU_TA =
'特殊・タ'
- TOKUSHU_NAI =
'特殊・ナイ'
- TOKUSHU_TAI =
'特殊・タイ'
- TOKUSHU_DESU =
'特殊・デス'
- TOKUSHU_DA =
'特殊・ダ'
- TOKUSHU_MASU =
'特殊・マス'
- NA =
# Etc
'な'
- NI =
'に'
- TE =
'て'
- DE =
'で'
- BA =
'ば'
Instance Attribute Summary collapse
-
#text ⇒ Object
readonly
Returns the value of attribute text.
-
#tokens ⇒ Object
readonly
Returns the value of attribute tokens.
Instance Method Summary collapse
-
#initialize(text, output) ⇒ MecabIpadic
constructor
A new instance of MecabIpadic.
- #sentences ⇒ Object
- #words ⇒ Object
Methods inherited from Ve::Parse
Constructor Details
#initialize(text, output) ⇒ MecabIpadic
Returns a new instance of MecabIpadic.
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/providers/mecab_ipadic.rb', line 72
#
# Builds the token list for +text+ from the analyser's line-based +output+.
#
# Every line of +output+ becomes one token hash (always carrying :raw):
# * a line consisting of "EOS" becomes a :sentence_split token with an empty
#   :literal;
# * a line matching PARSER becomes a :parsed token whose :literal is the
#   first capture and whose comma-separated second capture is spread over
#   :pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma,
#   :reading and :hatsuon (missing trailing fields stay nil);
# * any other line is appended with only :raw set.
#
# Stretches of +text+ that lie between parsed literals (or after the last
# one) are inserted as :unparsed tokens. Parsed and unparsed tokens get a
# :characters range of character offsets into +text+.
#
# The lines of +output+ are not modified; each one is right-stripped into a
# fresh string, so frozen lines are accepted.
#
# @param text [String] the original text that was analysed
# @param output [Array<String>] analyser output, one entry per line
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |raw_line, index|
    # Non-destructive strip: the caller's strings stay untouched.
    line = raw_line.rstrip
    token = {:raw => line}

    # Anything unparsed at the end of the text.
    # This must happen before sentence splits are detected to avoid funny ordering.
    # Only done on the last line, and only when there is more than one line.
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      if unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position + unparsed_token[:literal].length - 1))
        @tokens << unparsed_token
      end
    end

    if line =~ %r{^ EOS $}x
      token[:type] = :sentence_split
      token[:literal] = ''
    elsif md = PARSER.match(line)
      # The parsed token
      token[:type] = :parsed
      token[:literal] = md[1]
      info = md[2].split(',')
      [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
        token[attr] = info[i]
      end

      # Anything unparsed preceding this token: lazily capture whatever sits
      # between the current position and the next occurrence of the literal.
      unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
      if unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
        unparsed_token[:characters] = (position..(position + unparsed_token[:literal].length - 1))
        @tokens << unparsed_token
        position += unparsed_token[:literal].length
      end

      token[:characters] = (position..(position + token[:literal].length - 1))
      position += token[:literal].length
    else
      # C'est une catastrophe
      # (line is neither EOS nor parseable; the token keeps only :raw)
    end

    @tokens << token
  end
end
Instance Attribute Details
#text ⇒ Object (readonly)
Returns the value of attribute text.
70 71 72 |
# File 'lib/providers/mecab_ipadic.rb', line 70
#
# Reader for the @text instance variable.
#
# @return [Object] the current value of @text
def text
  @text
end
#tokens ⇒ Object (readonly)
Returns the value of attribute tokens.
70 71 72 |
# File 'lib/providers/mecab_ipadic.rb', line 70
#
# Reader for the @tokens instance variable.
#
# @return [Object] the current value of @tokens
def tokens
  @tokens
end
Instance Method Details
#sentences ⇒ Object
333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 |
# File 'lib/providers/mecab_ipadic.rb', line 333
#
# Reassembles the token literals in @tokens into sentence strings.
#
# A sentence is closed either by a :sentence_split token (whose literal is
# not included) or by a token whose literal is '。' (which is included).
# Whatever has accumulated after the last boundary is returned as a final
# sentence when it is non-empty.
#
# Tokens without a :literal (the constructor appends such tokens for lines it
# could not parse) contribute nothing instead of raising.
#
# @return [Array<String>] the sentences, in order
def sentences
  # TODO: Sentence objects that keep track of the sentence's tokens
  sentences = []
  current = ''

  @tokens.each do |token|
    # nil.to_s is '', so literal-less tokens are appended as nothing.
    literal = token[:literal].to_s

    if token[:type] == :sentence_split
      sentences << current
      current = ''
    elsif literal == '。'
      current << literal
      sentences << current
      current = ''
    else
      current << literal
    end
  end

  # In case there is no :sentence_split at the end
  sentences << current if current.length > 0

  sentences
end
#words ⇒ Object
170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 |
# File 'lib/providers/mecab_ipadic.rb', line 170
#
# Groups the :parsed tokens in @tokens into Ve::Word objects.
#
# For each token a part of speech (and optionally a grammar tag) is chosen
# from its :pos / :pos2 / :pos3 / :inflection_type values. Depending on the
# token and on the token that follows it, the token is then either
# * appended to the previously built word (attach_to_previous),
# * turned into a new word that also swallows the following token (eat_next), or
# * turned into a new word on its own.
# Tokens whose :pos matches none of the known values get Ve::PartOfSpeech::TBD.
#
# @return [Array] the Ve::Word objects that were built, in token order
def words
  words = []
  tokens = @tokens.find_all { |t| t[:type] == :parsed }
  tokens = tokens.to_enum

  # This is becoming very big
  #
  # The loop ends when tokens.next raises StopIteration, which is rescued below.
  # NOTE(review): Enumerator#more? is not a core Ruby method; presumably it is
  # defined elsewhere in the project - verify.
  begin
    while token = tokens.next
      pos = nil
      grammar = nil
      eat_next = false             # merge the following token into this word
      eat_lemma = true             # when merging, also append the following lemma
      attach_to_previous = false   # append this token to the last word instead
      also_attach_to_lemma = false # when attaching, also extend the last word's lemma

      case token[:pos]
      when MEISHI
        pos = Ve::PartOfSpeech::Noun

        case token[:pos2]
        when KOYUUMEISHI
          pos = Ve::PartOfSpeech::ProperNoun
        when DAIMEISHI
          pos = Ve::PartOfSpeech::Pronoun
        when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
          if tokens.more?
            following = tokens.peek
            if following[:inflection_type] == SAHEN_SURU
              pos = Ve::PartOfSpeech::Verb
              eat_next = true
            elsif following[:inflection_type] == TOKUSHU_DA
              pos = Ve::PartOfSpeech::Adjective
              if following[:inflection_form] == TAIGENSETSUZOKU
                eat_next = true
                eat_lemma = false
              end
            elsif following[:inflection_type] == TOKUSHU_NAI
              pos = Ve::PartOfSpeech::Adjective
              eat_next = true
            elsif following[:pos] == JOSHI && following[:literal] == NI
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          end
        when HIJIRITSU, TOKUSHU
          if tokens.more?
            following = tokens.peek
            case token[:pos3]
            when FUKUSHIKANOU
              if following[:pos] == JOSHI && following[:literal] == NI
                pos = Ve::PartOfSpeech::Adverb
                eat_next = true
              end
            when JODOUSHIGOKAN
              if following[:inflection_type] == TOKUSHU_DA
                pos = Ve::PartOfSpeech::Verb
                grammar = :auxillary
                if following[:inflection_form] == TAIGENSETSUZOKU
                  eat_next = true
                end
              elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
                pos = Ve::PartOfSpeech::Adverb
                eat_next = true
              end
            when KEIYOUDOUSHIGOKAN
              pos = Ve::PartOfSpeech::Adjective
              if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
                eat_next = true
              end
            end
          end
        when KAZU
          # TODO: recurse and find following numbers and add to this word. Except non-numbers like 幾
          pos = Ve::PartOfSpeech::Number
          # Consecutive numbers are merged into the previous Number word.
          if words.length > 0 && words[-1].part_of_speech == Ve::PartOfSpeech::Number
            attach_to_previous = true
            also_attach_to_lemma = true
          end
        when SETSUBI
          # TODO: elaborate a bit?
          pos = Ve::PartOfSpeech::Suffix
        when SETSUZOKUSHITEKI
          pos = Ve::PartOfSpeech::Conjunction
        when DOUSHIHIJIRITSUTEKI
          pos = Ve::PartOfSpeech::Verb
          grammar = :nominal
        end
      when SETTOUSHI
        # TODO: elaborate this when we have the "main part" feature for words?
        pos = Ve::PartOfSpeech::Prefix
      when JODOUSHI
        pos = Ve::PartOfSpeech::Postposition

        if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
          attach_to_previous = true
        elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
          pos = Ve::PartOfSpeech::Verb
        end
      when DOUSHI
        pos = Ve::PartOfSpeech::Verb
        if token[:pos2] == SETSUBI
          attach_to_previous = true
        elsif token[:pos2] == HIJIRITSU
          grammar = :auxillary
        end
      when KEIYOUSHI
        pos = Ve::PartOfSpeech::Adjective
      when JOSHI
        pos = Ve::PartOfSpeech::Postposition
        if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
          attach_to_previous = true
        end
      when RENTAISHI
        pos = Ve::PartOfSpeech::Determiner
      when SETSUZOKUSHI
        pos = Ve::PartOfSpeech::Conjunction
      when FUKUSHI
        pos = Ve::PartOfSpeech::Adverb
      when KIGOU
        pos = Ve::PartOfSpeech::Symbol
      when FIRAA, KANDOUSHI
        pos = Ve::PartOfSpeech::Interjection
      when SONOTA
        pos = Ve::PartOfSpeech::Other
      else
        # C'est une catastrophe
      end

      if attach_to_previous && words.length > 0
        words[-1].tokens << token
        words[-1].word << token[:literal]
        words[-1].extra[:reading] << (token[:reading] || '')
        words[-1].extra[:transcription] << (token[:hatsuon] || '')
        words[-1].lemma << token[:lemma] if also_attach_to_lemma
      else
        pos = Ve::PartOfSpeech::TBD if pos.nil?
        word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
          :reading => token[:reading] || '',
          :transcription => token[:hatsuon] || '',
          :grammar => grammar
        }, {
          :reading_script => :kata,
          :transcription_script => :kata
        })

        if eat_next
          following = tokens.next
          word.tokens << following
          word.word << following[:literal]
          # Same nil guard as the attach_to_previous branch: a token with
          # fewer feature fields has no :reading / :hatsuon, and String#<<
          # raises on nil.
          word.extra[:reading] << (following[:reading] || '')
          word.extra[:transcription] << (following[:hatsuon] || '')
          word.lemma << following[:lemma] if eat_lemma
        end

        words << word
      end
    end
  rescue StopIteration
  end

  words
end