Class: Punkt::SentenceTokenizer

Inherits:
Base
  • Object
show all
Defined in:
lib/punkt-segmenter/punkt/sentence_tokenizer.rb

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Base

#tokenize_words

Constructor Details

#initialize(train_text_or_parameters, language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token) ⇒ SentenceTokenizer

Returns a new instance of SentenceTokenizer.



3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 3

# Builds a sentence tokenizer from either raw training text or parameters
# that were trained beforehand.
#
# train_text_or_parameters - a String (passed to #train, whose result becomes
#                            the tokenizer parameters) or a Punkt::Parameters
#                            instance (stored as-is).
# language_vars            - forwarded to the superclass constructor.
# token_class              - forwarded to the superclass constructor.
#
# Raises RuntimeError when the first argument is neither a String nor a
# Punkt::Parameters.
def initialize(train_text_or_parameters,
               language_vars = Punkt::LanguageVars.new,
               token_class   = Punkt::Token)
  super(language_vars, token_class)

  @trainer = nil

  @parameters =
    case train_text_or_parameters
    when String
      train(train_text_or_parameters)
    when Punkt::Parameters
      train_text_or_parameters
    else
      raise "You need to pass trainer parameters or a text to train."
    end
end

Class Method Details

.sentences_text(text, sentences_indexes) ⇒ Object



47
48
49
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 47

# Converts sentence index pairs into the corresponding pieces of +text+.
#
# text              - the object the indexes refer to (sliced with #[]).
# sentences_indexes - a collection of pairs; element 0 is the first position
#                     and element 1 the last position, both inclusive.
#
# Returns an Array with one slice of +text+ per index pair.
def sentences_text(text, sentences_indexes)
  sentences_indexes.map do |bounds|
    first_position = bounds[0]
    last_position  = bounds[1]
    text[first_position..last_position]
  end
end

.tokenized_sentences(text, sentences_indexes) ⇒ Object



51
52
53
54
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 51

# Extracts each sentence from +text+ (via sentences_text) and splits it into
# words with a Punkt::Base tokenizer, requesting :string output.
#
# text              - the original text the indexes refer to.
# sentences_indexes - index pairs as accepted by sentences_text.
#
# Returns an Array holding the result of tokenize_words for each sentence.
def tokenized_sentences(text, sentences_indexes)
  word_tokenizer = Punkt::Base.new

  # The block variable is deliberately not called `text`, so it cannot be
  # confused with the method parameter of the same name.
  sentences_text(text, sentences_indexes).map do |sentence|
    word_tokenizer.tokenize_words(sentence, :output => :string)
  end
end

Instance Method Details

#sentences_from_text(text, options = {}) ⇒ Object Also known as: tokenize



20
21
22
23
24
25
26
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 20

# Splits +text+ into sentences.
#
# text    - the text to segment.
# options - Hash of optional settings:
#           :realign_boundaries - when truthy, the result of
#                                 split_in_sentences is passed through
#                                 realign_boundaries.
#           :output             - name (Symbol or String) of a class-level
#                                 method of this tokenizer's class; it is
#                                 called with (text, sentences) and its
#                                 result is returned instead.
#
# Returns whatever split_in_sentences / realign_boundaries produce, or the
# formatter's return value when :output is given.
# Raises ArgumentError when :output does not name a public class-level
# method of the tokenizer class.
def sentences_from_text(text, options = {})
  sentences = split_in_sentences(text)
  sentences = realign_boundaries(text, sentences) if options[:realign_boundaries]

  formatter = options[:output]
  return sentences unless formatter

  # Dispatch only to methods defined as class-level (singleton) methods of
  # the tokenizer class hierarchy. An unrestricted `send` ignores visibility
  # and would also reach private Kernel methods (system, exec, eval, ...) as
  # well as generic Module methods such as class_eval.
  allowed = self.class.singleton_methods.map(&:to_s)
  unless allowed.include?(formatter.to_s)
    raise ArgumentError, "Unknown output format: #{formatter.inspect}"
  end

  self.class.public_send(formatter, text, sentences)
end

#sentences_from_tokens(tokens) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 29

# Groups a flat list of tokens into sentences.
#
# Every element of +tokens+ is wrapped in @token_class and the wrapped list
# is run through annotate_tokens. A sentence is closed right after each
# annotated token whose sentence_break is truthy.
#
# tokens - a collection of raw tokens accepted by @token_class.new.
#
# Returns an Array of sentences, each an Array of the tokens' +token+ values.
# Tokens left over after the last break form a final sentence.
def sentences_from_tokens(tokens)
  wrapped   = tokens.map { |raw| @token_class.new(raw) }
  annotated = annotate_tokens(wrapped)

  # Always keep one open group at the end; a break starts the next one.
  groups = [[]]
  annotated.each do |item|
    groups.last << item.token
    groups << [] if item.sentence_break
  end

  # Drop the open group when nothing followed the final break (or when
  # there were no tokens at all).
  groups.pop if groups.last.empty?
  groups
end