Class: Analects::Tokenizer
- Inherits:
-
Object
- Object
- Analects::Tokenizer
- Defined in:
- lib/analects/tokenizer.rb
Constant Summary collapse
- ALGO =
ALGO = RMMSeg::Algorithm
RMMSeg::SimpleAlgorithm
Instance Method Summary collapse
- #cedict(fn = '/tmp/cedict.json') ⇒ Object
- #create_dict_from_cedict(chars_dic, words_dic) ⇒ Object
-
#initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic') ⇒ Tokenizer
constructor
A new instance of Tokenizer.
- #library ⇒ Object
- #tokenize(str) ⇒ Object (also: #call)
Constructor Details
#initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic') ⇒ Tokenizer
Returns a new instance of Tokenizer.
6 7 8 9 10 11 12 |
# File 'lib/analects/tokenizer.rb', line 6

# Prepares the dictionary files used for segmentation and registers them
# with RMMSeg.
#
# When either dictionary file is missing, both are (re)generated through
# #create_dict_from_cedict before being registered.
#
# @param chars_dic [String] path of the character dictionary file
# @param words_dic [String] path of the word dictionary file
# @return [Tokenizer] a new instance of Tokenizer
def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
  dictionary_missing = [chars_dic, words_dic].any? { |path| !File.exist?(path) }
  create_dict_from_cedict(chars_dic, words_dic) if dictionary_missing

  # Earlier registration call, kept for reference:
  # RMMSeg::Dictionary.dictionaries = [[:chars, chars_dic], [:words, words_dic]]

  # NOTE(review): the boolean in each pair presumably tells RMMSeg whether the
  # file carries a per-entry frequency column (chars.dic is written as
  # "char count" lines, words.dic as bare words) -- confirm against RMMSeg.
  RMMSeg::Config.dictionaries = [[chars_dic, true], [words_dic, false]]
end
Instance Method Details
#cedict(fn = '/tmp/cedict.json') ⇒ Object
18 19 20 21 22 23 24 25 |
# File 'lib/analects/tokenizer.rb', line 18

# Returns the CEDICT entries, parsed from a JSON cache file.
#
# If the cache file does not exist yet, the CEDICT data is retrieved through
# `library.cedict` and written to +fn+ as JSON first. The parsed result is
# memoised in @cedict, so after the first successful call the contents of
# +fn+ are no longer re-read (and a different +fn+ does not change the
# returned data).
#
# @param fn [String] path of the JSON cache file
# @return [Object] the parsed JSON document (the cache is written from
#   `library.cedict.to_a`, so normally an Array)
def cedict(fn = '/tmp/cedict.json')
  # Loaded lazily, as in the original: only this method needs JSON.
  require 'json'

  unless File.exist?(fn)
    library.cedict.retrieve
    File.write(fn, library.cedict.to_a.to_json)
  end

  # File.read instead of IO.read: IO.read interprets a path that starts with
  # "|" as a command to run, File.read only ever opens a file.
  @cedict ||= JSON.parse(File.read(fn))
end
#create_dict_from_cedict(chars_dic, words_dic) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/analects/tokenizer.rb', line 27

# Builds the two dictionary files from the CEDICT entries returned by #cedict.
#
# For every entry, the first two fields are collected as words and each of
# their characters is counted.
# NOTE(review): the first two fields are presumably the traditional and
# simplified headwords -- confirm against the CEDICT loader.
#
# Output formats:
# * +words_dic+: the unique words, sorted, one per line (no trailing newline)
# * +chars_dic+: one "<char> <count>" line per distinct character, in order
#   of first appearance
#
# @param chars_dic [String] path the character/frequency file is written to
# @param words_dic [String] path the word list file is written to
# @return [Integer] number of bytes written to +chars_dic+
def create_dict_from_cedict(chars_dic, words_dic)
  words = Set.new
  histo = Hash.new(0)

  cedict.each do |entry|
    first, second = entry[0], entry[1]
    words << first << second
    # Distinct names for entry and character: the original reused `c` for
    # both the outer and the inner block parameter, shadowing the entry.
    (first + second).each_char { |char| histo[char] += 1 }
  end

  File.write(words_dic, words.sort.join("\n"))
  File.write(chars_dic, histo.map { |ch, cnt| "%s %d\n" % [ch, cnt] }.join)
end