Class: Punkt::Base
- Inherits:
-
Object
- Object
- Punkt::Base
- Defined in:
- lib/punkt-segmenter/punkt/base.rb
Direct Known Subclasses
Instance Method Summary
-
#initialize(language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token, parameters = Punkt::Parameters.new) ⇒ Base
constructor
A new instance of Base.
- #tokenize_words(plain_text, options = {}) ⇒ Object
Constructor Details
#initialize(language_vars = Punkt::LanguageVars.new, token_class = Punkt::Token, parameters = Punkt::Parameters.new) ⇒ Base
Returns a new instance of Base.
3 4 5 6 7 8 9 10 |
# File 'lib/punkt-segmenter/punkt/base.rb', line 3

# Builds a new Base by storing its three collaborators in instance variables.
# Nothing else happens here; the arguments are kept exactly as passed.
#
# @param language_vars [Object] stored in @language_vars; defaults to a fresh
#   Punkt::LanguageVars instance
# @param token_class [Class] stored in @token_class; defaults to Punkt::Token
# @param parameters [Object] stored in @parameters; defaults to a fresh
#   Punkt::Parameters instance
# @return [Base] a new instance of Base
def initialize(language_vars = Punkt::LanguageVars.new,
               token_class = Punkt::Token,
               parameters = Punkt::Parameters.new)
  @language_vars = language_vars
  @token_class   = token_class
  @parameters    = parameters
end
Instance Method Details
#tokenize_words(plain_text, options = {}) ⇒ Object
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 |
# File 'lib/punkt-segmenter/punkt/base.rb', line 12

# Splits +plain_text+ into word tokens.
#
# When <tt>options[:output] == :string</tt>, the whole text is passed to
# <tt>@language_vars.word_tokenize</tt> and its result is returned untouched.
#
# Otherwise the text is split on "\n" and each non-blank line is tokenized
# separately. Every word is wrapped in <tt>@token_class</tt>. The first token
# of each line is created with <tt>:line_start => true</tt>, and with
# <tt>:paragraph_start => true</tt> when at least one blank line came before it.
#
# @param plain_text [String] the text to tokenize
# @param options [Hash] only the +:output+ key is read
# @return [Array, Object] an array of <tt>@token_class</tt> instances, or
#   whatever <tt>word_tokenize</tt> returns in +:string+ mode
def tokenize_words(plain_text, options = {})
  return @language_vars.word_tokenize(plain_text) if options[:output] == :string

  result = []
  paragraph_start = false

  plain_text.split("\n").each do |line|
    if line.strip.empty?
      # A blank line flags the next non-blank line as opening a paragraph.
      paragraph_start = true
      next
    end

    words = @language_vars.word_tokenize(line)

    # Only the first token of a line carries the positional flags.
    result << @token_class.new(words.first,
                               :paragraph_start => paragraph_start,
                               :line_start => true)
    paragraph_start = false

    # Append in place rather than rebuilding the accumulator for every line.
    words.drop(1).each { |word| result << @token_class.new(word) }
  end

  result
end