Class: Punkt::Token

Inherits: Object

Inheritance chain: Object → Punkt::Token

Defined in: lib/punkt-segmenter/punkt/token.rb

Instance Attribute Summary collapse

#abbr ⇒ Object

Returns the value of attribute abbr.
#ellipsis ⇒ Object

Returns the value of attribute ellipsis.
#line_start ⇒ Object

Returns the value of attribute line_start.
#paragraph_start ⇒ Object

Returns the value of attribute paragraph_start.
#period_final ⇒ Object

Returns the value of attribute period_final.
#sentence_break ⇒ Object

Returns the value of attribute sentence_break.
#token ⇒ Object

Returns the value of attribute token.
#type ⇒ Object

Returns the value of attribute type.

Instance Method Summary collapse

Constructor Details

#initialize(token, options = {}) ⇒ `Token`

Returns a new instance of Token.

# File 'lib/punkt-segmenter/punkt/token.rb', line 8

# Builds a token and derives its type and trailing-period flag from the text.
#
# The type is the downcased token, except that a token which is numeric as a
# whole (optional leading '-', digits with ',', '.' or '-' separators and an
# optional final period) is replaced by the placeholder '##number##'.
#
# @param token [String] the raw token text
# @param options [Hash] optional flags; only the keys :paragraph_start,
#   :line_start, :sentence_break, :abbr and :ellipsis are honoured, any other
#   key is silently ignored
def initialize(token, options = {})
  valid_options = [:paragraph_start, :line_start, :sentence_break, :abbr, :ellipsis]

  @token        = token
  # \A and \z anchor the whole string; ^ and $ would anchor every line, so a
  # multi-line string could be rewritten only in part.
  @type         = UnicodeUtils.downcase(token).sub(/\A-?[\.,]?\d[\d,\.-]*\.?\z/, '##number##')
  @period_final = token.end_with?('.')

  # Every supported flag exists on the instance, defaulting to nil.
  valid_options.each { |name| instance_variable_set("@#{name}", nil) }

  # Copy over only the recognised flags.
  options.each do |key, value|
    instance_variable_set("@#{key}", value) if valid_options.include?(key)
  end
end

Instance Attribute Details

#abbr ⇒ `Object`

Returns the value of attribute abbr.



6
7
8

# File 'lib/punkt-segmenter/punkt/token.rb', line 6

# Reader for the +abbr+ flag.
#
# The constructor sets @abbr to nil and replaces it with the value of the
# :abbr option when that key is passed. #to_s appends '<A>' when it is truthy.
#
# @return [Object] the current value of @abbr
def abbr
  @abbr
end

#ellipsis ⇒ `Object`

Returns the value of attribute ellipsis.



6
7
8

# File 'lib/punkt-segmenter/punkt/token.rb', line 6

# Reader for the +ellipsis+ flag.
#
# The constructor sets @ellipsis to nil and replaces it with the value of the
# :ellipsis option when that key is passed. #to_s appends '<E>' when it is
# truthy.
#
# @return [Object] the current value of @ellipsis
def ellipsis
  @ellipsis
end

#line_start ⇒ `Object`

Returns the value of attribute line_start.



5
6
7

# File 'lib/punkt-segmenter/punkt/token.rb', line 5

# Reader for the +line_start+ flag.
#
# The constructor sets @line_start to nil and replaces it with the value of
# the :line_start option when that key is passed.
#
# @return [Object] the current value of @line_start
def line_start
  @line_start
end

#paragraph_start ⇒ `Object`

Returns the value of attribute paragraph_start.



5
6
7

# File 'lib/punkt-segmenter/punkt/token.rb', line 5

# Reader for the +paragraph_start+ flag.
#
# The constructor sets @paragraph_start to nil and replaces it with the value
# of the :paragraph_start option when that key is passed.
#
# @return [Object] the current value of @paragraph_start
def paragraph_start
  @paragraph_start
end

#period_final ⇒ `Object`

Returns the value of attribute period_final.



4
5
6

# File 'lib/punkt-segmenter/punkt/token.rb', line 4

# Reader for the +period_final+ flag.
#
# The constructor computes it as token.end_with?('.'), i.e. whether the raw
# token text ends in a period.
#
# @return [Object] the current value of @period_final
def period_final
  @period_final
end

#sentence_break ⇒ `Object`

Returns the value of attribute sentence_break.



6
7
8

# File 'lib/punkt-segmenter/punkt/token.rb', line 6

# Reader for the +sentence_break+ flag.
#
# The constructor sets @sentence_break to nil and replaces it with the value
# of the :sentence_break option when that key is passed. #to_s appends '<S>'
# when it is truthy.
#
# @return [Object] the current value of @sentence_break
def sentence_break
  @sentence_break
end

#token ⇒ `Object`

Returns the value of attribute token.



4
5
6

# File 'lib/punkt-segmenter/punkt/token.rb', line 4

# Reader for the raw token text, stored unchanged by the constructor from its
# +token+ argument.
#
# @return [Object] the current value of @token
def token
  @token
end

#type ⇒ `Object`

Returns the value of attribute type.



4
5
6

# File 'lib/punkt-segmenter/punkt/token.rb', line 4

# Reader for the token type.
#
# The constructor derives it from the downcased token text, substituting the
# placeholder '##number##' where the numeric pattern matches.
#
# @return [Object] the current value of @type
def type
  @type
end

Instance Method Details

#ends_with_period? ⇒ `Boolean`

Returns:

(Boolean)



45
46
47

# File 'lib/punkt-segmenter/punkt/token.rb', line 45

# Predicate form of the +period_final+ flag, which the constructor computes
# as token.end_with?('.').
#
# @return [Boolean] the value stored in @period_final
def ends_with_period?
  @period_final
end

#first_case ⇒ `Object`

# File 'lib/punkt-segmenter/punkt/token.rb', line 39

# Classifies the case of the token's first character.
#
# The lower-case check is made first, so it wins if both predicates hold.
#
# @return [Symbol] :lower, :upper, or :none when neither predicate holds
def first_case
  if first_lower?
    :lower
  elsif first_upper?
    :upper
  else
    :none
  end
end

#first_lower? ⇒ `Boolean`

Returns:

(Boolean)



35
36
37

# File 'lib/punkt-segmenter/punkt/token.rb', line 35

# Asks UnicodeUtils whether the first character of the token is lower case.
#
# @return [Boolean] the result of UnicodeUtils.lowercase_char? for @token[0]
def first_lower?
  leading_char = @token[0]
  UnicodeUtils.lowercase_char?(leading_char)
end

#first_upper? ⇒ `Boolean`

Returns:

(Boolean)



31
32
33

# File 'lib/punkt-segmenter/punkt/token.rb', line 31

# Asks UnicodeUtils whether the first character of the token is upper case.
#
# @return [Boolean] the result of UnicodeUtils.uppercase_char? for @token[0]
def first_upper?
  leading_char = @token[0]
  UnicodeUtils.uppercase_char?(leading_char)
end

#inspect ⇒ `Object`



77
78
79

# File 'lib/punkt-segmenter/punkt/token.rb', line 77

# Debug representation: the #to_s rendering wrapped in angle brackets.
#
# @return [String] "<" + to_s + ">"
def inspect
  format('<%s>', to_s)
end

#is_alpha? ⇒ `Boolean`

Returns:

(Boolean)



61
62
63

# File 'lib/punkt-segmenter/punkt/token.rb', line 61

# Tells whether the whole token consists only of word characters that are
# not digits (the character class [^\W\d]).
#
# The pattern is anchored with \A and \z so the entire string has to match;
# with ^ and $ a multi-line string matched as soon as any one line did.
#
# @return [Boolean] true when every character of @token is in [^\W\d]
def is_alpha?
  !(@token =~ /\A[^\W\d]+\z/).nil?
end

#is_ellipsis? ⇒ `Boolean`

Returns:

(Boolean)



49
50
51

# File 'lib/punkt-segmenter/punkt/token.rb', line 49

# Tells whether the whole token is a run of two or more periods.
#
# The pattern is anchored with \A and \z so the entire string has to match;
# with ^ and $ a multi-line string matched as soon as any one line did.
#
# @return [Boolean] true when @token is "..", "...", and so on
def is_ellipsis?
  !(@token =~ /\A\.\.+\z/).nil?
end

#is_initial? ⇒ `Boolean`

Returns:

(Boolean)



57
58
59

# File 'lib/punkt-segmenter/punkt/token.rb', line 57

# Tells whether the whole token is exactly one non-digit word character
# followed by a period, e.g. "A.".
#
# The pattern is anchored with \A and \z so the entire string has to match;
# with ^ and $ a multi-line string matched as soon as any one line did.
#
# @return [Boolean] true when @token matches [^\W\d] followed by '.'
def is_initial?
  !(@token =~ /\A[^\W\d]\.\z/).nil?
end

#is_non_punctuation? ⇒ `Boolean`

Returns:

(Boolean)



65
66
67

# File 'lib/punkt-segmenter/punkt/token.rb', line 65

# Reports whether the token's type contains at least one character that is a
# word character but not a digit.
#
# @return [Boolean] true if /[^\W\d]/ matches anywhere in @type
def is_non_punctuation?
  first_match_at = (@type =~ /[^\W\d]/)
  !first_match_at.nil?
end

#is_number? ⇒ `Boolean`

Returns:

(Boolean)



53
54
55

# File 'lib/punkt-segmenter/punkt/token.rb', line 53

# Reports whether the token's type begins with the numeric placeholder that
# the constructor substitutes for numeric tokens.
#
# @return [Boolean] true when @type starts with "##number##"
def is_number?
  number_marker = "##number##"
  @type.start_with?(number_marker)
end

#to_s ⇒ `Object`

# File 'lib/punkt-segmenter/punkt/token.rb', line 69

# Renders the token text followed by one marker per truthy flag, always in
# the order '<A>' (abbr), '<E>' (ellipsis), '<S>' (sentence_break).
#
# @return [String] @token itself when no flag is set, otherwise a new string
#   made of @token plus the markers
def to_s
  flagged = [[@abbr, '<A>'], [@ellipsis, '<E>'], [@sentence_break, '<S>']]
  suffix  = flagged.select { |flag, _| flag }.map { |_, marker| marker }.join

  suffix.empty? ? @token : @token + suffix
end

#type_without_period ⇒ `Object`



23
24
25

# File 'lib/punkt-segmenter/punkt/token.rb', line 23

# Returns the type with a trailing period removed.
#
# A type that is only one character long is returned unchanged, so a lone
# "." is kept as is.
#
# @return [String] @type minus its final '.' when it is longer than one
#   character and ends in '.', otherwise @type itself
def type_without_period
  if @type.size > 1 && @type.end_with?('.')
    @type.chop
  else
    @type
  end
end

#type_without_sentence_period ⇒ `Object`



27
28
29

# File 'lib/punkt-segmenter/punkt/token.rb', line 27

# Returns the type, dropping a trailing period only when the token is flagged
# as a sentence break.
#
# @return [String] type_without_period when @sentence_break is truthy,
#   otherwise @type unchanged
def type_without_sentence_period
  return @type unless @sentence_break

  type_without_period
end

Class: Punkt::Token

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(token, options = {}) ⇒ Token

Instance Attribute Details

#abbr ⇒ Object

#ellipsis ⇒ Object

#line_start ⇒ Object

#paragraph_start ⇒ Object

#period_final ⇒ Object

#sentence_break ⇒ Object

#token ⇒ Object

#type ⇒ Object

Instance Method Details

#ends_with_period? ⇒ Boolean

#first_case ⇒ Object

#first_lower? ⇒ Boolean

#first_upper? ⇒ Boolean

#inspect ⇒ Object

#is_alpha? ⇒ Boolean

#is_ellipsis? ⇒ Boolean

#is_initial? ⇒ Boolean

#is_non_punctuation? ⇒ Boolean

#is_number? ⇒ Boolean

#to_s ⇒ Object

#type_without_period ⇒ Object

#type_without_sentence_period ⇒ Object

#initialize(token, options = {}) ⇒ `Token`

#abbr ⇒ `Object`

#ellipsis ⇒ `Object`

#line_start ⇒ `Object`

#paragraph_start ⇒ `Object`

#period_final ⇒ `Object`

#sentence_break ⇒ `Object`

#token ⇒ `Object`

#type ⇒ `Object`

#ends_with_period? ⇒ `Boolean`

#first_case ⇒ `Object`

#first_lower? ⇒ `Boolean`

#first_upper? ⇒ `Boolean`

#inspect ⇒ `Object`

#is_alpha? ⇒ `Boolean`

#is_ellipsis? ⇒ `Boolean`

#is_initial? ⇒ `Boolean`

#is_non_punctuation? ⇒ `Boolean`

#is_number? ⇒ `Boolean`

#to_s ⇒ `Object`

#type_without_period ⇒ `Object`

#type_without_sentence_period ⇒ `Object`