Class: Suika::Tagger

Inherits:
Object
  • Object
show all
Defined in:
lib/suika/tagger.rb

Overview

Tagger is a class that tokenizes Japanese text.

Examples:

require 'suika'

tagger = Suika::Tagger.new
tagger.parse('すもももももももものうち').each { |token| puts token }

# すもも  名詞,一般,*,*,*,*,すもも,スモモ,スモモ
# も      助詞,係助詞,*,*,*,*,も,モ,モ
# もも    名詞,一般,*,*,*,*,もも,モモ,モモ
# も      助詞,係助詞,*,*,*,*,も,モ,モ
# もも    名詞,一般,*,*,*,*,もも,モモ,モモ
# の      助詞,連体化,*,*,*,*,の,ノ,ノ
# うち    名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Tagger

Create a new tagger by loading the built-in binary dictionary.

Raises:

  • (IOError)


26
27
28
29
30
31
32
# File 'lib/suika/tagger.rb', line 26

# Create a new tagger by loading the built-in binary dictionary.
#
# The dictionary file's SHA1 digest is compared with DICTIONARY_KEY before the
# file is unmarshalled, so a file that does not match is never deserialized.
#
# @raise [IOError] if the SHA1 digest of the dictionary file does not match.
def initialize
  actual_digest = Digest::SHA1.file(DICTIONARY_PATH).to_s
  raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == actual_digest

  # The dictionary is a gzip-compressed Marshal dump; the block form of
  # GzipReader.open closes the file once it has been read.
  serialized = Zlib::GzipReader.open(DICTIONARY_PATH, &:read)
  @sysdic = Marshal.load(serialized)

  # Populate the double-array trie from the array stored under :trie.
  @trie = DartsClone::DoubleArray.new
  @trie.set_array(@sysdic[:trie])
end

Instance Method Details

#inspect ⇒ Object



84
85
86
# File 'lib/suika/tagger.rb', line 84

# Returns the same value as #to_s instead of Object's default inspect output.
#
# NOTE(review): presumably this keeps the loaded dictionary held in @sysdic
# from being dumped when a tagger is inspected (e.g. in irb) — confirm.
#
# @return [Object] whatever #to_s returns.
def inspect
  to_s
end

#parse(sentence) ⇒ Array<String>

Parse the given sentence.

Parameters:

  • sentence (String)

    Japanese text to be parsed.

Returns:

  • (Array<String>)


37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# File 'lib/suika/tagger.rb', line 37

# Parse the given sentence.
#
# Scans the sentence from left to right and, at each position the scan stops
# on, inserts two kinds of candidates into a lattice: dictionary words found by
# a common-prefix search of the trie, and unknown-word candidates built from
# the character type at that position. The scan then advances by the length of
# the shortest candidate found there, and the finished lattice is handed to
# #viterbi.
#
# @param sentence [String] Japanese text to be parsed.
# @return [Array<String>] the value returned by #viterbi for the lattice.
def parse(sentence) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  total = sentence.length
  lattice = Lattice.new(total)
  cursor = 0

  until cursor >= total
    # Default stride is the rest of the sentence; candidates below narrow it.
    advance = total - cursor

    # Dictionary candidates: entries that are a prefix of the remaining text.
    hits = trie.common_prefix_search(sentence[cursor..-1] || '')
    unless hits.empty?
      surfaces, ids = hits
      unless surfaces.empty?
        advance = INT_MAX
        surfaces.zip(ids) do |surface, id|
          span = surface.length
          # The first three fields of each entry are passed as integers and the
          # rest as-is (presumably ids/cost and the feature columns — confirm
          # against Lattice#insert).
          features[id].each do |entry|
            lattice.insert(cursor, cursor + span, surface, false,
                           entry[0].to_i, entry[1].to_i, entry[2].to_i, entry[3..-1])
          end
          advance = [advance, span].min
        end
      end
    end

    # Unknown-word candidate, seeded with the character under the cursor.
    head = sentence[cursor] || ''
    category = CharDef.char_category(head)
    kind = CharDef.char_type(head)
    unknown = sentence[cursor] || ''
    if category[:invoke]
      # Extend over following characters of the same type, bounded by the
      # category's reach and by the end of the sentence.
      reach = category[:group] ? CharDef::MAX_GROUPING_SIZE : category[:length]
      limit = [cursor + reach, total].min
      ((cursor + 1)...limit).each do |pos|
        following = sentence[pos] || ''
        break unless kind == CharDef.char_type(following)

        unknown << following
      end
    end

    unknown_span = unknown.length
    unknowns[kind].each do |entry|
      lattice.insert(cursor, cursor + unknown_span, unknown, true,
                     entry[0].to_i, entry[1].to_i, entry[2].to_i, entry[3..-1])
    end

    # Move forward by the shortest candidate inserted at this position.
    cursor += [advance, unknown_span].min
  end

  viterbi(lattice)
end