Class: Ve::Parse::MecabIpadic

Inherits:

Ve::Parse

Object
Ve::Parse
Ve::Parse::MecabIpadic

Defined in: lib/providers/mecab_ipadic.rb

Constant Summary collapse

PARSER =

%r{^ (.+?) \t (.+) }x

MEISHI = (section comment: PoS, i.e. part-of-speech values)

'名詞'

KOYUUMEISHI =

'固有名詞'

DAIMEISHI =

'代名詞'

JODOUSHI =

'助動詞'

KAZU =

'数'

JOSHI =

'助詞'

SETTOUSHI =

'接頭詞'

DOUSHI =

'動詞'

KIGOU =

'記号'

FIRAA =

'フィラー'

SONOTA =

'その他'

KANDOUSHI =

'感動詞'

RENTAISHI =

'連体詞'

SETSUZOKUSHI =

'接続詞'

FUKUSHI =

'副詞'

SETSUZOKUJOSHI =

'接続助詞'

KEIYOUSHI =

'形容詞'

HIJIRITSU = (section comment: Pos2 and inflection types)

'非自立'

FUKUSHIKANOU =

'副詞可能'

SAHENSETSUZOKU =

'サ変接続'

KEIYOUDOUSHIGOKAN =

'形容動詞語幹'

NAIKEIYOUSHIGOKAN =

'ナイ形容詞語幹'

JODOUSHIGOKAN =

'助動詞語幹'

FUKUSHIKA =

'副詞化'

TAIGENSETSUZOKU =

'体言接続'

RENTAIKA =

'連体化'

TOKUSHU =

'特殊'

SETSUBI =

'接尾'

SETSUZOKUSHITEKI =

'接続詞的'

DOUSHIHIJIRITSUTEKI =

'動詞非自立的'

SAHEN_SURU =

'サ変・スル'

TOKUSHU_TA =

'特殊・タ'

TOKUSHU_NAI =

'特殊・ナイ'

TOKUSHU_TAI =

'特殊・タイ'

TOKUSHU_DESU =

'特殊・デス'

TOKUSHU_DA =

'特殊・ダ'

TOKUSHU_MASU =

'特殊・マス'

NA = (section comment: Etc., miscellaneous literals)

'な'

NI =

'に'

TE =

'て'

DE =

'で'

BA =

'ば'

Instance Attribute Summary collapse

#text ⇒ Object readonly

Returns the value of attribute text.
#tokens ⇒ Object readonly

Returns the value of attribute tokens.

Instance Method Summary collapse

#initialize(text, output) ⇒ MecabIpadic constructor

A new instance of MecabIpadic.
#sentences ⇒ Object
#words ⇒ Object

Methods inherited from Ve::Parse

#as_json

Constructor Details

#initialize(text, output) ⇒ `MecabIpadic`

Returns a new instance of MecabIpadic.

# File 'lib/providers/mecab_ipadic.rb', line 72

# Builds the token list for +text+ from the analyser's line-oriented +output+.
#
# Every element of +output+ yields one token hash appended to @tokens:
# * a line consisting of "EOS" becomes a :sentence_split token with an
#   empty :literal;
# * a line matching PARSER becomes a :parsed token holding the surface form
#   (:literal), the comma-separated feature columns (:pos ... :hatsuon) and
#   the character range (:characters) it occupies in +text+;
# * any other line is kept as a bare {:raw => line} token.
#
# Stretches of +text+ that lie between parsed tokens, or that remain after
# the last one, are inserted as :unparsed tokens so that the token literals
# cover the text in order.
#
# @param text [String] the text that was analysed
# @param output [#each_with_index, #length] the output lines, one per element
def initialize(text, output)
  @tokens = []
  @text = text
  position = 0

  output.each_with_index do |line, index|
    # Work on a stripped copy: rstrip! would modify the caller's strings and
    # raise on frozen ones.
    line = line.rstrip
    token = {:raw => line}
    # Anything unparsed at the end of the text
    # This must happen before sentence splits are detected to avoid funny ordering
    if output.length > 1 && output.length == index + 1
      unparsed_md = %r{(.*? \Z\n?)}mx.match(text, position)
      # Regexp#match returns nil when nothing matches (e.g. position lies
      # past the end of text), so check before indexing into the result.
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1], :raw => ''}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
      end
    end

    if line =~ %r{^ EOS $}x
      token[:type] = :sentence_split
      token[:literal] = ''
    elsif md = PARSER.match(line)
      # The parsed token
      token[:type] = :parsed
      token[:literal] = md[1]
      info = md[2].split(',')
      [:pos, :pos2, :pos3, :pos4, :inflection_type, :inflection_form, :lemma, :reading, :hatsuon].each_with_index do |attr, i|
        # Columns missing from the line simply stay nil
        token[attr] = info[i]
      end

      # Anything unparsed preceding this token
      unparsed_md = %r{(.*?) #{Regexp.quote(token[:literal])}}mx.match(text, position)
      # nil means the literal does not occur in text from position onwards;
      # in that case no gap is recorded and the token is assumed to start
      # at position instead of failing on nil.
      if unparsed_md && unparsed_md[1].length > 0
        unparsed_token = {:type => :unparsed, :literal => unparsed_md[1]}
        unparsed_token[:characters] = (position..(position+unparsed_token[:literal].length-1))
        @tokens << unparsed_token
        position += unparsed_token[:literal].length
      end

      token[:characters] = (position..(position+token[:literal].length-1))
      position += token[:literal].length
    else
      # Unrecognised line: the token keeps only its :raw text
    end

    @tokens << token
  end
end

Instance Attribute Details

#text ⇒ `Object` (readonly)

Returns the value of attribute text.



70
71
72

# File 'lib/providers/mecab_ipadic.rb', line 70

# Reader for @text, which #initialize sets to its +text+ argument unchanged.
def text
  @text
end

#tokens ⇒ `Object` (readonly)

Returns the value of attribute tokens.



70
71
72

# File 'lib/providers/mecab_ipadic.rb', line 70

# Reader for @tokens, the array of token hashes built by #initialize.
# Each hash has a :raw entry; recognised tokens also have :type
# (:parsed, :unparsed or :sentence_split) and :literal.
def tokens
  @tokens
end

Instance Method Details

#sentences ⇒ `Object`

# File 'lib/providers/mecab_ipadic.rb', line 333

# Joins the token literals into sentence strings.
#
# A sentence is closed by a :sentence_split token (which contributes no
# text) or directly after a '。' token (which is kept as the last character
# of the sentence). Text remaining after the last boundary is returned as a
# final sentence. Tokens without a :literal are ignored.
#
# @return [Array<String>] the sentences in order of appearance
def sentences
  # TODO: Sentence objects that keep track of the sentence's tokens
  sentences = []
  current = ''

  @tokens.each do |token|
    if token[:type] == :sentence_split
      sentences << current
      current = ''
      next
    end

    # #initialize stores lines it could not recognise as {:raw => line}
    # tokens without a :literal; appending nil would raise a TypeError, so
    # such tokens are skipped.
    literal = token[:literal]
    next if literal.nil?

    current << literal
    if literal == '。'
      sentences << current
      current = ''
    end
  end

  # In case there is no :sentence_split at the end
  sentences << current if current.length > 0

  sentences
end

#words ⇒ `Object`

# File 'lib/providers/mecab_ipadic.rb', line 170

# Combines the :parsed tokens into Ve::Word objects.
#
# For every token the :pos, :pos2, :pos3 and :inflection_type columns select
# a Ve::PartOfSpeech and decide whether the token
# * starts a new word,
# * is appended to the previously built word (attach_to_previous), or
# * starts a new word that also absorbs the token after it (eat_next).
# Tokens whose :pos matches none of the known constants become
# Ve::PartOfSpeech::TBD words.
#
# @return [Array<Ve::Word>] the words in order of appearance
def words
  words = []
  parsed = @tokens.select { |t| t[:type] == :parsed }
  index = 0

  # This is becoming very big
  while index < parsed.length
    token = parsed[index]
    # Lookahead: the token after the current one, nil when this is the last
    following = parsed[index + 1]
    index += 1

    pos = nil
    grammar = nil
    eat_next = false
    eat_lemma = true
    attach_to_previous = false
    also_attach_to_lemma = false

    case token[:pos]
    when MEISHI
      pos = Ve::PartOfSpeech::Noun

      case token[:pos2]
      when KOYUUMEISHI
        pos = Ve::PartOfSpeech::ProperNoun
      when DAIMEISHI
        pos = Ve::PartOfSpeech::Pronoun
      when FUKUSHIKANOU, SAHENSETSUZOKU, KEIYOUDOUSHIGOKAN, NAIKEIYOUSHIGOKAN
        # What the noun turns into depends on the token that follows it
        if following
          if following[:inflection_type] == SAHEN_SURU
            pos = Ve::PartOfSpeech::Verb
            eat_next = true
          elsif following[:inflection_type] == TOKUSHU_DA
            pos = Ve::PartOfSpeech::Adjective
            if following[:inflection_form] == TAIGENSETSUZOKU
              eat_next = true
              eat_lemma = false
            end
          elsif following[:inflection_type] == TOKUSHU_NAI
            pos = Ve::PartOfSpeech::Adjective
            eat_next = true
          elsif following[:pos] == JOSHI && following[:literal] == NI
            pos = Ve::PartOfSpeech::Adverb
            eat_next = true
          end
        end
      when HIJIRITSU, TOKUSHU
        if following
          case token[:pos3]
          when FUKUSHIKANOU
            if following[:pos] == JOSHI && following[:literal] == NI
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when JODOUSHIGOKAN
            if following[:inflection_type] == TOKUSHU_DA
              pos = Ve::PartOfSpeech::Verb
              # NOTE: the symbol is spelled :auxillary; it is a runtime value
              # and therefore left exactly as it was
              grammar = :auxillary
              if following[:inflection_form] == TAIGENSETSUZOKU
                eat_next = true
              end
            elsif following[:pos] == JOSHI && following[:pos2] == FUKUSHIKA
              pos = Ve::PartOfSpeech::Adverb
              eat_next = true
            end
          when KEIYOUDOUSHIGOKAN
            pos = Ve::PartOfSpeech::Adjective
            if (following[:inflection_type] == TOKUSHU_DA && following[:inflection_form] == TAIGENSETSUZOKU) || following[:pos2] == RENTAIKA
              eat_next = true
            end
          end
        end
      when KAZU
        # TODO: recurse and find following numbers and add to this word. Except non-numbers like 幾
        pos = Ve::PartOfSpeech::Number
        # Consecutive number tokens are merged into one word
        if words.length > 0 && words[-1].part_of_speech == Ve::PartOfSpeech::Number
          attach_to_previous = true
          also_attach_to_lemma = true
        end
      when SETSUBI
        # TODO: elaborate a bit?
        pos = Ve::PartOfSpeech::Suffix
      when SETSUZOKUSHITEKI
        pos = Ve::PartOfSpeech::Conjunction
      when DOUSHIHIJIRITSUTEKI
        pos = Ve::PartOfSpeech::Verb
        grammar = :nominal
      end
    when SETTOUSHI
      # TODO: elaborate this when we have the "main part" feature for words?
      pos = Ve::PartOfSpeech::Prefix
    when JODOUSHI
      pos = Ve::PartOfSpeech::Postposition

      if [TOKUSHU_TA, TOKUSHU_NAI, TOKUSHU_TAI, TOKUSHU_MASU].include?(token[:inflection_type])
        attach_to_previous = true
      elsif (token[:inflection_type] == TOKUSHU_DA || token[:inflection_type] == TOKUSHU_DESU) && token[:literal] != NA
        pos = Ve::PartOfSpeech::Verb
      end
    when DOUSHI
      pos = Ve::PartOfSpeech::Verb
      if token[:pos2] == SETSUBI
        attach_to_previous = true
      elsif token[:pos2] == HIJIRITSU
        grammar = :auxillary
      end
    when KEIYOUSHI
      pos = Ve::PartOfSpeech::Adjective
    when JOSHI
      pos = Ve::PartOfSpeech::Postposition
      if token[:pos2] == SETSUZOKUJOSHI && [TE, DE, BA].include?(token[:literal])
        attach_to_previous = true
      end
    when RENTAISHI
      pos = Ve::PartOfSpeech::Determiner
    when SETSUZOKUSHI
      pos = Ve::PartOfSpeech::Conjunction
    when FUKUSHI
      pos = Ve::PartOfSpeech::Adverb
    when KIGOU
      pos = Ve::PartOfSpeech::Symbol
    when FIRAA, KANDOUSHI
      pos = Ve::PartOfSpeech::Interjection
    when SONOTA
      pos = Ve::PartOfSpeech::Other
    else
      # Unknown part of speech: pos stays nil and becomes TBD below
    end

    if attach_to_previous && words.length > 0
      words[-1].tokens << token
      words[-1].word << token[:literal]
      words[-1].extra[:reading] << (token[:reading] || '')
      words[-1].extra[:transcription] << (token[:hatsuon] || '')
      words[-1].lemma << token[:lemma] if also_attach_to_lemma
    else
      pos = Ve::PartOfSpeech::TBD if pos.nil?
      # The reading/transcription strings are appended to in place further
      # down and on later iterations. Ve::Word is not visible here; if it
      # keeps the hash as given, appending to the token's own strings would
      # modify @tokens, so copies are handed over instead.
      word = Ve::Word.new(token[:literal], token[:lemma], pos, [token], {
        :reading => (token[:reading] || '').dup,
        :transcription => (token[:hatsuon] || '').dup,
        :grammar => grammar
      }, {
        :reading_script => :kata,
        :transcription_script => :kata
      })

      if eat_next
        # eat_next is only ever set when a following token exists; consume it
        index += 1
        word.tokens << following
        word.word << following[:literal]
        # :reading and :hatsuon may be missing from a token, same guard as in
        # the attach_to_previous branch
        word.extra[:reading] << (following[:reading] || '')
        word.extra[:transcription] << (following[:hatsuon] || '')
        word.lemma << following[:lemma] if eat_lemma
      end

      words << word
    end
  end

  return words
end