Class: Analects::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/analects/tokenizer.rb

Constant Summary collapse

ALGO =

ALGO = RMMSeg::Algorithm

RMMSeg::SimpleAlgorithm

Instance Method Summary collapse

Constructor Details

#initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic') ⇒ Tokenizer

Returns a new instance of Tokenizer.



6
7
8
9
10
11
12
# File 'lib/analects/tokenizer.rb', line 6

# Sets up the tokenizer's dictionaries.
#
# If either dictionary file is missing, both are (re)generated through
# create_dict_from_cedict. RMMSeg::Config.dictionaries is then assigned the
# two paths.
#
# @param chars_dic [String] path of the character dictionary file
# @param words_dic [String] path of the word dictionary file
def initialize(chars_dic = '/tmp/chars.dic', words_dic = '/tmp/words.dic')
  dictionaries_present = [chars_dic, words_dic].all? { |path| File.exist?(path) }
  create_dict_from_cedict(chars_dic, words_dic) unless dictionaries_present

  # Alternative configuration call, left disabled:
  # RMMSeg::Dictionary.dictionaries = [[:chars, chars_dic], [:words, words_dic]]
  RMMSeg::Config.dictionaries = [[chars_dic, true], [words_dic, false]]
end

Instance Method Details

#cedict(fn = '/tmp/cedict.json') ⇒ Object



18
19
20
21
22
23
24
25
# File 'lib/analects/tokenizer.rb', line 18

# Loads the CEDICT entries from a JSON cache file, creating the cache if missing.
#
# When +fn+ does not exist, the data is retrieved through +library.cedict+ and
# written to +fn+ as JSON. The parsed result is memoized in @cedict, so once it
# is set, later calls return it without re-reading the file (whatever +fn+ is).
#
# @param fn [String] path of the JSON cache file
# @return [Object] the parsed JSON content of the cache file
def cedict(fn = '/tmp/cedict.json')
  require 'json' # loaded lazily, only when the dictionary is actually needed
  unless File.exist?(fn)
    library.cedict.retrieve
    File.write(fn, library.cedict.to_a.to_json)
  end
  # File.read, not IO.read: IO.read runs a path starting with "|" as a command.
  @cedict ||= JSON.parse(File.read(fn))
end

#create_dict_from_cedict(chars_dic, words_dic) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/analects/tokenizer.rb', line 27

# Builds the two dictionary files from the entries returned by #cedict.
#
# Each entry's first two elements are collected as words (presumably the
# traditional and simplified forms — confirm against the CEDICT loader), and
# every character occurring in them is counted.
#
# The words file holds the unique words, sorted, one per line (no trailing
# newline). The chars file holds one "<char> <count>" line per character, in
# first-seen order.
#
# @param chars_dic [String] path the character/count file is written to
# @param words_dic [String] path the word list file is written to
def create_dict_from_cedict(chars_dic, words_dic)
  require 'set' # Set is only autoloaded on newer Rubies; make the dependency explicit

  words = Set.new
  histo = Hash.new(0)

  cedict.each do |entry|
    first_form = entry[0]
    second_form = entry[1]
    words << first_form << second_form
    # Distinct block variable: the original reused the outer name and shadowed it.
    (first_form + second_form).each_char { |ch| histo[ch] += 1 }
  end

  File.write(words_dic, words.sort.join("\n"))
  File.write(chars_dic, histo.map { |ch, cnt| format("%s %d\n", ch, cnt) }.join)
end

#library ⇒ Object



14
15
16
# File 'lib/analects/tokenizer.rb', line 14

# Returns the Analects::Library instance used by this object, creating it on
# first use and reusing it afterwards.
#
# @return [Analects::Library]
def library
  @library || (@library = Analects::Library.new)
end

#tokenize(str) ⇒ Object Also known as: call



43
44
45
46
47
48
49
50
51
# File 'lib/analects/tokenizer.rb', line 43

# Splits +str+ into tokens using the ALGO segmenter.
#
# Tokens are pulled from the segmenter until it returns nil; each token's text
# is re-tagged as UTF-8 in place (force_encoding) and collected in order.
#
# @param str [String] the text to segment
# @return [Array<String>] the token texts
def tokenize(str)
  segmenter = ALGO.new(str)
  texts = []
  until (token = segmenter.next_token).nil?
    texts << token.text.force_encoding('UTF-8')
  end
  texts
end