Class: Punkt::Token

Inherits:
Object
  • Object
show all
Defined in:
lib/punkt-segmenter/punkt/token.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(token, options = {}) ⇒ Token

Returns a new instance of Token.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
# File 'lib/punkt-segmenter/punkt/token.rb', line 8

# Builds a token and derives its normalized type.
#
# @param token [String] raw token text; stored unchanged in @token
# @param options [Hash] optional flags. Only the keys :paragraph_start,
#   :line_start, :sentence_break, :abbr and :ellipsis are honoured; any
#   other key is silently ignored.
# @return [Token] a new instance of Token
def initialize(token, options = {})
  valid_options = [:paragraph_start, :line_start, :sentence_break, :abbr, :ellipsis]

  @token = token
  # Lowercase the token and collapse a purely numeric token into a
  # placeholder. \A and \z anchor the match to the whole token; ^ and $
  # match at every line boundary in Ruby and would rewrite a single numeric
  # line inside a multi-line token.
  @type = UnicodeUtils.downcase(token).sub(/\A-?[\.,]?\d[\d,\.-]*\.?\z/, '##number##')
  @period_final = token.end_with?('.')

  # Give every recognised flag a defined (nil) value before applying the
  # caller's options, so all flag instance variables always exist.
  valid_options.each { |name| instance_variable_set("@#{name}", nil) }
  options.each do |key, value|
    instance_variable_set("@#{key}", value) if valid_options.include?(key)
  end
end

Instance Attribute Details

#abbr ⇒ Object

Returns the value of attribute abbr.



6
7
8
# File 'lib/punkt-segmenter/punkt/token.rb', line 6

# Reader for the abbr flag.
#
# @return [Object, nil] the current value of @abbr; the constructor seeds it
#   from the :abbr option, or nil when that option is absent
def abbr; @abbr; end

#ellipsis ⇒ Object

Returns the value of attribute ellipsis.



6
7
8
# File 'lib/punkt-segmenter/punkt/token.rb', line 6

# Reader for the ellipsis flag.
#
# @return [Object, nil] the current value of @ellipsis; the constructor seeds
#   it from the :ellipsis option, or nil when that option is absent
def ellipsis; @ellipsis; end

#line_start ⇒ Object

Returns the value of attribute line_start.



5
6
7
# File 'lib/punkt-segmenter/punkt/token.rb', line 5

# Reader for the line_start flag.
#
# @return [Object, nil] the current value of @line_start; the constructor
#   seeds it from the :line_start option, or nil when that option is absent
def line_start; @line_start; end

#paragraph_start ⇒ Object

Returns the value of attribute paragraph_start.



5
6
7
# File 'lib/punkt-segmenter/punkt/token.rb', line 5

# Reader for the paragraph_start flag.
#
# @return [Object, nil] the current value of @paragraph_start; the
#   constructor seeds it from the :paragraph_start option, or nil when that
#   option is absent
def paragraph_start; @paragraph_start; end

#period_final ⇒ Object

Returns the value of attribute period_final.



4
5
6
# File 'lib/punkt-segmenter/punkt/token.rb', line 4

# Reader for the period_final flag.
#
# @return [Boolean, nil] the current value of @period_final; the constructor
#   sets it to whether the raw token ends with '.'
def period_final; @period_final; end

#sentence_break ⇒ Object

Returns the value of attribute sentence_break.



6
7
8
# File 'lib/punkt-segmenter/punkt/token.rb', line 6

# Reader for the sentence_break flag.
#
# @return [Object, nil] the current value of @sentence_break; the constructor
#   seeds it from the :sentence_break option, or nil when that option is
#   absent
def sentence_break; @sentence_break; end

#token ⇒ Object

Returns the value of attribute token.



4
5
6
# File 'lib/punkt-segmenter/punkt/token.rb', line 4

# Reader for the raw token text.
#
# @return [Object] the current value of @token; the constructor stores its
#   token argument here unchanged
def token; @token; end

#type ⇒ Object

Returns the value of attribute type.



4
5
6
# File 'lib/punkt-segmenter/punkt/token.rb', line 4

# Reader for the normalized token type.
#
# @return [Object] the current value of @type; the constructor derives it by
#   lowercasing the token and replacing a numeric token with '##number##'
def type; @type; end

Instance Method Details

#ends_with_period? ⇒ Boolean

Returns:

  • (Boolean)


45
46
47
# File 'lib/punkt-segmenter/punkt/token.rb', line 45

# Predicate form of the period_final flag.
#
# @return [Boolean, nil] the value held in @period_final, which the
#   constructor sets to whether the raw token ends with '.'
def ends_with_period?; @period_final; end

#first_case ⇒ Object



39
40
41
42
43
# File 'lib/punkt-segmenter/punkt/token.rb', line 39

# Classifies the case of the token's first character.
#
# The lowercase check is consulted first; the uppercase check is only made
# when the lowercase check fails.
#
# @return [Symbol] :lower, :upper, or :none when neither predicate holds
def first_case
  if first_lower?
    :lower
  elsif first_upper?
    :upper
  else
    :none
  end
end

#first_lower? ⇒ Boolean

Returns:

  • (Boolean)


35
36
37
# File 'lib/punkt-segmenter/punkt/token.rb', line 35

# Asks UnicodeUtils whether the token's first character is lowercase.
#
# @return [Boolean] the result of UnicodeUtils.lowercase_char? for the first
#   character of @token
def first_lower?
  leading_char = @token[0]
  UnicodeUtils.lowercase_char?(leading_char)
end

#first_upper? ⇒ Boolean

Returns:

  • (Boolean)


31
32
33
# File 'lib/punkt-segmenter/punkt/token.rb', line 31

# Asks UnicodeUtils whether the token's first character is uppercase.
#
# @return [Boolean] the result of UnicodeUtils.uppercase_char? for the first
#   character of @token
def first_upper?
  leading_char = @token[0]
  UnicodeUtils.uppercase_char?(leading_char)
end

#inspect ⇒ Object



77
78
79
# File 'lib/punkt-segmenter/punkt/token.rb', line 77

# Debug representation: the #to_s rendering wrapped in angle brackets.
#
# @return [String] "<" + the result of #to_s + ">"
def inspect
  rendered = to_s
  "<#{rendered}>"
end

#is_alpha? ⇒ Boolean

Returns:

  • (Boolean)


61
62
63
# File 'lib/punkt-segmenter/punkt/token.rb', line 61

# Reports whether the entire token consists only of letters (or
# underscores).
#
# Ruby's \w / \W / \d are ASCII-only, so the previous [^\W\d] class rejected
# accented and non-Latin letters. The POSIX bracket [[:alpha:]] is
# Unicode-aware; the underscore is kept because [^\W\d] also accepted it.
# \A and \z anchor the whole token, whereas ^ and $ match per line and would
# accept a token if any single line of it was alphabetic.
#
# @return [Boolean] true when @token is one or more letters/underscores and
#   nothing else
def is_alpha?
  !(@token =~ /\A[[:alpha:]_]+\z/).nil?
end

#is_ellipsis? ⇒ Boolean

Returns:

  • (Boolean)


49
50
51
# File 'lib/punkt-segmenter/punkt/token.rb', line 49

# Reports whether the entire token is a run of two or more periods.
#
# \A and \z anchor the whole token; ^ and $ match per line in Ruby, so a
# multi-line token with one line of dots would previously have qualified.
#
# @return [Boolean] true when @token is nothing but two or more '.'
def is_ellipsis?
  !(@token =~ /\A\.\.+\z/).nil?
end

#is_initial? ⇒ Boolean

Returns:

  • (Boolean)


57
58
59
# File 'lib/punkt-segmenter/punkt/token.rb', line 57

# Reports whether the entire token is a single letter (or underscore)
# followed by a period, e.g. "A.".
#
# Ruby's \W / \d are ASCII-only, so the previous [^\W\d] class rejected
# initials such as an accented capital; the POSIX bracket [[:alpha:]] is
# Unicode-aware. The underscore is kept because [^\W\d] also accepted it.
# \A and \z anchor the whole token, whereas ^ and $ match per line.
#
# @return [Boolean] true when @token is exactly one letter/underscore plus '.'
def is_initial?
  !(@token =~ /\A[[:alpha:]_]\.\z/).nil?
end

#is_non_punctuation? ⇒ Boolean

Returns:

  • (Boolean)


65
66
67
# File 'lib/punkt-segmenter/punkt/token.rb', line 65

# Reports whether the token's type contains at least one letter (or
# underscore) anywhere.
#
# Ruby's \W / \d are ASCII-only, so the previous [^\W\d] class found no
# match in words made solely of non-ASCII letters and treated them as
# punctuation. The POSIX bracket [[:alpha:]] is Unicode-aware; the
# underscore is kept because [^\W\d] also accepted it.
#
# @return [Boolean] true when @type includes a letter or underscore
def is_non_punctuation?
  !(@type =~ /[[:alpha:]_]/).nil?
end

#is_number? ⇒ Boolean

Returns:

  • (Boolean)


53
54
55
# File 'lib/punkt-segmenter/punkt/token.rb', line 53

# Reports whether the token was normalized to the numeric placeholder.
#
# @return [Boolean] true when @type begins with "##number##"
def is_number?
  numeric_marker = "##number##"
  @type.start_with?(numeric_marker)
end

#to_s ⇒ Object



69
70
71
72
73
74
75
# File 'lib/punkt-segmenter/punkt/token.rb', line 69

# Renders the token text followed by a marker for each flag that is set:
# '<A>' for @abbr, '<E>' for @ellipsis, '<S>' for @sentence_break, always in
# that order.
#
# @return [String] @token itself when no flag is set, otherwise a new string
#   with the markers appended (@token is never modified)
def to_s
  markers = [[@abbr, '<A>'], [@ellipsis, '<E>'], [@sentence_break, '<S>']]
  markers.reduce(@token) do |text, (flag, marker)|
    flag ? text + marker : text
  end
end

#type_without_period ⇒ Object



23
24
25
# File 'lib/punkt-segmenter/punkt/token.rb', line 23

# Returns the type with one trailing period removed.
#
# A type that is a single character (such as ".") or that does not end with
# a period is returned as is.
#
# @return [String] @type itself, or a copy without its final '.'
def type_without_period
  return @type unless @type.size > 1 && @type.end_with?('.')

  @type.chop
end

#type_without_sentence_period ⇒ Object



27
28
29
# File 'lib/punkt-segmenter/punkt/token.rb', line 27

# Returns the type, dropping its trailing period only when the token is
# flagged as a sentence break.
#
# @return [String] the result of #type_without_period when @sentence_break
#   is truthy, otherwise @type unchanged
def type_without_sentence_period
  if @sentence_break
    type_without_period
  else
    @type
  end
end