Class: Suika::Tagger
- Inherits:
-
Object
- Object
- Suika::Tagger
- Defined in:
- lib/suika/tagger.rb
Overview
Tagger is a class that tokenizes Japanese text.
Instance Method Summary
-
#initialize ⇒ Tagger
constructor
Create a new tagger by loading the built-in binary dictionary.
- #inspect ⇒ Object
-
#parse(sentence) ⇒ Array<String>
Parse the given sentence.
Constructor Details
#initialize ⇒ Tagger
Create a new tagger by loading the built-in binary dictionary.
26 27 28 29 30 31 32 |
# File 'lib/suika/tagger.rb', line 26
# Create a new tagger by loading the built-in binary dictionary.
#
# The dictionary file's SHA1 digest is compared against DICTIONARY_KEY before
# the file is decompressed and unmarshalled.
#
# @raise [IOError] if the SHA1 digest of the dictionary file does not match.
def initialize
  actual_digest = Digest::SHA1.file(DICTIONARY_PATH).to_s
  raise IOError, 'SHA1 digest of dictionary file does not match.' unless DICTIONARY_KEY == actual_digest

  # Marshal.load is only reached once the digest check above has passed.
  @sysdic = Zlib::GzipReader.open(DICTIONARY_PATH) { |gz| Marshal.load(gz.read) }
  @trie = DartsClone::DoubleArray.new
  @trie.set_array(@sysdic[:trie])
end
Instance Method Details
#inspect ⇒ Object
84 85 86 |
# File 'lib/suika/tagger.rb', line 84
# Return the same text as #to_s, so inspecting a tagger does not print its
# instance variables.
#
# @return [String] the result of #to_s.
def inspect
  to_s
end
#parse(sentence) ⇒ Array<String>
Parse the given sentence.
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
# File 'lib/suika/tagger.rb', line 37
# Parse the given sentence.
#
# Walks the sentence from left to right. At each position it inserts into the
# lattice every dictionary word returned by the trie's common-prefix search,
# then the unknown-word candidates for the character type at that position,
# and advances by the shortest candidate length. The filled lattice is handed
# to #viterbi, whose result is returned.
#
# @param sentence [String] the text to be parsed.
# @return [Array<String>] the value returned by #viterbi for the built lattice.
def parse(sentence) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  lattice = Lattice.new(sentence.length)
  sentence_size = sentence.length
  cursor = 0

  # Inserts one lattice node per dictionary entry for a surface starting at `from`.
  add_nodes = lambda do |entries, from, surface, unknown|
    entries.each do |el|
      lattice.insert(from, from + surface.length, surface, unknown,
                     el[0].to_i, el[1].to_i, el[2].to_i, el[3..-1])
    end
  end

  while cursor < sentence_size
    advance = sentence_size - cursor

    # Known words: every dictionary entry that is a prefix of the remaining text.
    found = trie.common_prefix_search(sentence[cursor..-1] || '')
    unless found.empty?
      surfaces, feature_ids = found
      unless surfaces.empty?
        advance = INT_MAX
        surfaces.each_with_index do |surface, idx|
          add_nodes.call(features[feature_ids[idx]], cursor, surface, false)
          advance = [advance, surface.length].min
        end
      end
    end

    # Unknown word: starts as the current character and may be extended below.
    word = sentence[cursor] || ''
    head_char = sentence[cursor] || ''
    char_cate = CharDef.char_category(head_char)
    char_type = CharDef.char_type(head_char)
    if char_cate[:invoke]
      span = char_cate[:group] ? CharDef::MAX_GROUPING_SIZE : char_cate[:length]
      limit = [cursor + span, sentence_size].min
      # Extend over following characters while they share the same character type.
      ((cursor + 1)...limit).each do |pos|
        follower = sentence[pos] || ''
        break unless char_type == CharDef.char_type(follower)

        word << follower
      end
    end
    add_nodes.call(unknowns[char_type], cursor, word, true)
    advance = [advance, word.length].min

    cursor += advance
  end

  viterbi(lattice)
end