Class: Tokenizer::Tokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/tokenizer/tokenizer.rb

Constant Summary collapse

FS =
Regexp.new('[[:blank:]]+')
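POST =

Docstring (appears to be an alternative definition kept as a source comment, not the effective value):
POST = %w| ] } ‘ ` “ ) , ; : \ ! \ ? \ % ‚ „ … † ‡ ‰ ‹ ‘ ’ “ ” • – — › |

Value:
%w{! ? , : ; . )}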
PRE =
%w{(}

Instance Method Summary collapse

Constructor Details

#initialize(lang = :de) ⇒ Tokenizer

Returns a new instance of Tokenizer.



16
17
18
# File 'lib/tokenizer/tokenizer.rb', line 16

# Creates a tokenizer and remembers the requested language.
#
# @param lang [Object] language identifier stored in @lang
#   (defaults to the symbol :de)
def initialize(lang = :de)
  @lang = lang
end

Instance Method Details

#tokenize(str) ⇒ Object



20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/tokenizer/tokenizer.rb', line 20

# Splits +str+ into tokens.
#
# The input is first cut into fields on runs of blank characters (FS).
# Within each field, every character listed in POST starts a new token and
# every character listed in PRE ends the current token. Line breaks already
# present in +str+ also act as token boundaries.
#
# @param str [String] the text to tokenize
# @return [Array<String>] the tokens in order of appearance; never contains
#   empty strings
def tokenize(str)
  # `dup` keeps the buffer mutable even under `frozen_string_literal: true`.
  output = ''.dup

  str.split(FS).each do |field|
    field.each_char do |ch|
      if POST.include?(ch)
        # Break before the character; it stays attached to what follows.
        output << "\n#{ch}"
      elsif PRE.include?(ch)
        # Break after the character.
        output << "#{ch}\n"
      else
        output << ch
      end
    end
    output << "\n"
  end

  # Leading whitespace, a field starting with a POST character, or a PRE
  # character at the end of a field all produce consecutive newlines, which
  # would otherwise surface as empty-string tokens.
  output.split("\n").reject(&:empty?)
end