Class: Punkt::Base

Inherits:
Object
Defined in:
lib/punkt-segmenter/punkt/base.rb

Direct Known Subclasses

SentenceTokenizer, Trainer

Instance Method Summary

Constructor Details

#initialize(language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token, parameters = Punkt::Parameters.new) ⇒ Base

Returns a new instance of Base.



3
4
5
6
7
8
9
10
# File 'lib/punkt-segmenter/punkt/base.rb', line 3

# Stores the collaborators that the tokenizing methods read from instance
# state.
#
# @param language_vars [Object] kept in @language_vars; tokenize_words calls
#   #word_tokenize on it
# @param token_class [Class] kept in @token_class; tokenize_words calls
#   .new on it to wrap each word
# @param parameters [Object] kept in @parameters (not read by the code
#   visible in this section)
def initialize(language_vars = Punkt::LanguageVars.new,
               token_class   = Punkt::Token,
               parameters    = Punkt::Parameters.new)
  @language_vars = language_vars
  @token_class   = token_class
  @parameters    = parameters
end

Instance Method Details

#tokenize_words(plain_text, options = {}) ⇒ Object



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# File 'lib/punkt-segmenter/punkt/base.rb', line 12

# Splits +plain_text+ into word tokens, one line at a time.
#
# With :output => :string the whole text is handed to
# @language_vars.word_tokenize and that result is returned untouched.
# Otherwise every word is wrapped in an instance of @token_class. The first
# word of each non-blank line is built with :line_start => true and with
# :paragraph_start set to whether at least one blank line came right before
# it. Note that the very first line of the text is therefore only flagged as
# a paragraph start when blank lines precede it.
#
# @param plain_text [String] text to tokenize; it is split on "\n"
# @param options [Hash] only the :output key is consulted
# @return [Array] token objects in reading order, or the raw result of
#   word_tokenize when options[:output] == :string
def tokenize_words(plain_text, options = {})
  return @language_vars.word_tokenize(plain_text) if options[:output] == :string

  result = []
  paragraph_start = false
  plain_text.split("\n").each do |line|
    # A blank line produces no tokens; it only marks the next non-blank line
    # as the beginning of a paragraph.
    if line.strip.empty?
      paragraph_start = true
      next
    end

    first_word, *other_words = @language_vars.word_tokenize(line)
    result << @token_class.new(first_word,
                               :paragraph_start => paragraph_start,
                               :line_start      => true)
    paragraph_start = false

    # Append in place rather than rebuilding the accumulator for every line.
    other_words.each { |word| result << @token_class.new(word) }
  end
  result
end