Class: Punkt::Token
- Inherits:
-
Object
- Object
- Punkt::Token
- Defined in:
- lib/punkt-segmenter/punkt/token.rb
Instance Attribute Summary collapse
-
#abbr ⇒ Object
Returns the value of attribute abbr.
-
#ellipsis ⇒ Object
Returns the value of attribute ellipsis.
-
#line_start ⇒ Object
Returns the value of attribute line_start.
-
#paragraph_start ⇒ Object
Returns the value of attribute paragraph_start.
-
#period_final ⇒ Object
Returns the value of attribute period_final.
-
#sentence_break ⇒ Object
Returns the value of attribute sentence_break.
-
#token ⇒ Object
Returns the value of attribute token.
-
#type ⇒ Object
Returns the value of attribute type.
Instance Method Summary collapse
- #ends_with_period? ⇒ Boolean
- #first_case ⇒ Object
- #first_lower? ⇒ Boolean
- #first_upper? ⇒ Boolean
-
#initialize(token, options = {}) ⇒ Token
constructor
A new instance of Token.
- #inspect ⇒ Object
- #is_alpha? ⇒ Boolean
- #is_ellipsis? ⇒ Boolean
- #is_initial? ⇒ Boolean
- #is_non_punctuation? ⇒ Boolean
- #is_number? ⇒ Boolean
- #to_s ⇒ Object
- #type_without_period ⇒ Object
- #type_without_sentence_period ⇒ Object
Constructor Details
#initialize(token, options = {}) ⇒ Token
Returns a new instance of Token.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 8 def initialize(token, options = {}) valid_options = [:paragraph_start, :line_start, :sentence_break, :abbr, :ellipsis] @token = token @type = UnicodeUtils.downcase(token).gsub(/^-?[\.,]?\d[\d,\.-]*\.?$/, '##number##') # numeric @period_final = token.end_with?('.') valid_options.each do |item| self.instance_variable_set(("@"+item.to_s).to_sym, nil) end options.each do |key, value| self.instance_variable_set(("@"+key.to_s).to_sym, value) if valid_options.include?(key) end end |
Instance Attribute Details
#abbr ⇒ Object
Returns the value of attribute abbr.
6 7 8 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 6 def abbr @abbr end |
#ellipsis ⇒ Object
Returns the value of attribute ellipsis.
6 7 8 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 6 def ellipsis @ellipsis end |
#line_start ⇒ Object
Returns the value of attribute line_start.
5 6 7 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 5 def line_start @line_start end |
#paragraph_start ⇒ Object
Returns the value of attribute paragraph_start.
5 6 7 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 5 def paragraph_start @paragraph_start end |
#period_final ⇒ Object
Returns the value of attribute period_final.
4 5 6 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 4 def period_final @period_final end |
#sentence_break ⇒ Object
Returns the value of attribute sentence_break.
6 7 8 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 6 def sentence_break @sentence_break end |
#token ⇒ Object
Returns the value of attribute token.
4 5 6 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 4 def token @token end |
#type ⇒ Object
Returns the value of attribute type.
4 5 6 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 4 def type @type end |
Instance Method Details
#ends_with_period? ⇒ Boolean
45 46 47 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 45 def ends_with_period? @period_final end |
#first_case ⇒ Object
39 40 41 42 43 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 39 def first_case return :lower if first_lower? return :upper if first_upper? return :none end |
#first_lower? ⇒ Boolean
35 36 37 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 35 def first_lower? UnicodeUtils.lowercase_char?(@token[0]) end |
#first_upper? ⇒ Boolean
31 32 33 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 31 def first_upper? UnicodeUtils.uppercase_char?(@token[0]) end |
#inspect ⇒ Object
77 78 79 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 77 def inspect "<#{to_s}>" end |
#is_alpha? ⇒ Boolean
61 62 63 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 61 def is_alpha? !(@token =~ /^[^\W\d]+$/).nil? end |
#is_ellipsis? ⇒ Boolean
49 50 51 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 49 def is_ellipsis? !(@token =~ /^\.\.+$/).nil? end |
#is_initial? ⇒ Boolean
57 58 59 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 57 def is_initial? !(@token =~ /^[^\W\d]\.$/).nil? end |
#is_non_punctuation? ⇒ Boolean
65 66 67 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 65 def is_non_punctuation? !(@type =~ /[^\W\d]/).nil? end |
#is_number? ⇒ Boolean
53 54 55 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 53 def is_number? @type.start_with?("##number##") end |
#to_s ⇒ Object
69 70 71 72 73 74 75 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 69 def to_s result = @token result += '<A>' if @abbr result += '<E>' if @ellipsis result += '<S>' if @sentence_break result end |
#type_without_period ⇒ Object
23 24 25 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 23 def type_without_period @type.size > 1 && @type.end_with?('.') ? @type.chop : @type end |
#type_without_sentence_period ⇒ Object
27 28 29 |
# File 'lib/punkt-segmenter/punkt/token.rb', line 27 def type_without_sentence_period @sentence_break ? type_without_period : @type end |