Class: Punkt::SentenceTokenizer
- Defined in:
- lib/punkt-segmenter/punkt/sentence_tokenizer.rb
Class Method Summary
- .sentences_text(text, sentences_indexes) ⇒ Object
- .tokenized_sentences(text, sentences_indexes) ⇒ Object
Instance Method Summary
- #initialize(train_text_or_parameters, language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token) ⇒ SentenceTokenizer (constructor): a new instance of SentenceTokenizer.
- #sentences_from_text(text, options = {}) ⇒ Object (also: #tokenize)
- #sentences_from_tokens(tokens) ⇒ Object
Methods inherited from Base
Constructor Details
#initialize(train_text_or_parameters, language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token) ⇒ SentenceTokenizer
Returns a new instance of SentenceTokenizer.
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 |
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 3

# Builds a sentence tokenizer from either raw training text or ready-made
# parameters.
#
# @param train_text_or_parameters [String, Punkt::Parameters] a String is
#   handed to +train+ and the result is stored in +@parameters+; a
#   Punkt::Parameters instance is stored in +@parameters+ as-is.
# @param language_vars [Object] forwarded unchanged to the superclass
#   constructor (defaults to a fresh Punkt::LanguageVars).
# @param token_class [Class] forwarded unchanged to the superclass
#   constructor (defaults to Punkt::Token).
# @raise [RuntimeError] when the first argument is neither a String nor a
#   Punkt::Parameters.
def initialize(train_text_or_parameters, language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token)
  super(language_vars, token_class)

  # Reset before the String branch below calls +train+.
  # NOTE(review): presumably +train+ populates @trainer -- confirm in Base.
  @trainer = nil

  if train_text_or_parameters.kind_of?(String)
    @parameters = train(train_text_or_parameters)
  elsif train_text_or_parameters.kind_of?(Punkt::Parameters)
    @parameters = train_text_or_parameters
  else
    raise "You need to pass trainer parameters or a text to train."
  end
end
Class Method Details
.sentences_text(text, sentences_indexes) ⇒ Object
47 48 49 |
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 47

# Extracts the substring of +text+ covered by each index pair.
#
# @param text [String] the text the indexes refer to.
# @param sentences_indexes [Array] a list whose elements answer +[0]+ and
#   +[1]+ with the first and last (inclusive) position of a sentence.
# @return [Array<String>] one slice of +text+ per index pair, in order.
def sentences_text(text, sentences_indexes)
  sentences_indexes.map do |bounds|
    # Inclusive range: the element at bounds[1] belongs to the sentence.
    text[bounds[0]..bounds[1]]
  end
end
.tokenized_sentences(text, sentences_indexes) ⇒ Object
51 52 53 54 |
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 51

# Cuts +text+ into sentences with +sentences_text+ and word-tokenizes each
# sentence with a fresh Punkt::Base instance.
#
# @param text [String] the text the indexes refer to.
# @param sentences_indexes [Array] index pairs, passed straight through to
#   +sentences_text+.
# @return [Array] for every sentence, whatever
#   +Punkt::Base#tokenize_words(sentence, :output => :string)+ returns.
def tokenized_sentences(text, sentences_indexes)
  tokenizer = Punkt::Base.new

  # The block variable is deliberately not called +text+, so it no longer
  # shadows the method parameter.
  sentences_text(text, sentences_indexes).map do |sentence|
    tokenizer.tokenize_words(sentence, :output => :string)
  end
end
Instance Method Details
#sentences_from_text(text, options = {}) ⇒ Object Also known as: tokenize
20 21 22 23 24 25 26 |
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 20

# Splits +text+ into sentences, with optional post-processing steps.
#
# @param text [String] the text to segment.
# @param options [Hash] optional switches:
#   * +:realign_boundaries+ - when truthy, the result of
#     +split_in_sentences+ is passed through +realign_boundaries+.
#   * +:output+ - when present, names a class method that is invoked as
#     +self.class.send(name, text, sentences)+ and whose return value
#     becomes the result (the class methods +sentences_text+ and
#     +tokenized_sentences+ accept these arguments).
# @return [Object] the value of +split_in_sentences+, possibly transformed
#   by the steps above.
def sentences_from_text(text, options = {})
  sentences = split_in_sentences(text)
  sentences = realign_boundaries(text, sentences) if options[:realign_boundaries]

  # NOTE(review): +send+ dispatches to whatever name the caller supplies;
  # do not feed +options[:output]+ from untrusted input.
  sentences = self.class.send(options[:output], text, sentences) if options[:output]

  sentences
end
#sentences_from_tokens(tokens) ⇒ Object
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# File 'lib/punkt-segmenter/punkt/sentence_tokenizer.rb', line 29

# Groups a flat list of tokens into sentences.
#
# Each element of +tokens+ is wrapped with +@token_class.new+, the wrapped
# list is run through +annotate_tokens+, and the annotated tokens are then
# cut into groups after every token whose +sentence_break+ is truthy.
#
# @param tokens [Array] raw tokens accepted by +@token_class.new+.
# @return [Array<Array>] one array per sentence holding each token's
#   +token+ value; trailing tokens without a final break form a last
#   sentence, and no empty sentence is ever appended at the end.
def sentences_from_tokens(tokens)
  annotated = annotate_tokens(tokens.map { |raw| @token_class.new(raw) })

  sentences = []
  current = []

  annotated.each do |tok|
    current << tok.token
    next unless tok.sentence_break

    # A break closes the running sentence; start a fresh array so the one
    # just stored is not mutated afterwards.
    sentences << current
    current = []
  end

  # Keep any tokens left over after the last break.
  sentences << current unless current.empty?
  sentences
end