Class: Punkt::LanguageVars

Inherits:
Object
  • Object
show all
Defined in:
lib/punkt-segmenter/punkt/language_vars.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ LanguageVars

Returns a new instance of LanguageVars.



9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 9

# Populates the punctuation lists and the regular expressions held by this
# object.
#
# The small building-block patterns are created first as locals and are then
# interpolated into the two composite patterns:
#
# * the word tokenizer, which matches either a multi-character punctuation
#   run, a run of non-whitespace that begins with a permitted word-start
#   character and stops before whitespace / end of line / a non-word
#   character / multi-character punctuation / a trailing comma, or, failing
#   those, any single non-whitespace character;
# * the period context, which matches an optional run of non-whitespace
#   followed by one of `.`, `?`, `!`, provided it is followed by either a
#   non-word character or whitespace plus another token. The lookahead
#   exposes the named groups +after_tok+ and +next_tok+.
def initialize
  # Building blocks (each is also stored in its own instance variable below).
  sentence_enders  = /[.?!]/
  word_opening     = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
  non_word         = /(?:[?!)\";}\]\*:@\'\({\[])/
  multi_char_punct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/

  # Composite patterns assembled from the building blocks above.
  word_tokenizer = /#{multi_char_punct}|(?=#{word_opening})\S+?(?=\s|$|#{non_word}|#{multi_char_punct}|,(?=$|\s|#{non_word}|#{multi_char_punct}))|\S/
  period_context = /\S*#{sentence_enders}(?=(?<after_tok>#{non_word}|\s+(?<next_tok>\S+)))/

  # Assignment order is kept stable so that #instance_variables / #inspect
  # list the variables in the same sequence as before.
  @sent_end_chars          = ['.', '?', '!']
  @re_sent_end_chars       = sentence_enders
  @internal_punctuation    = [',', ':', ';']
  # Leading run of closing quotes/brackets, followed by whitespace, a
  # lookahead for "--", or end of line.
  @re_boundary_realignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
  @re_word_start           = word_opening
  @re_non_word_chars       = non_word
  @re_multi_char_punct     = multi_char_punct
  @re_word_tokenizer       = word_tokenizer
  @re_period_context       = period_context
end

Instance Attribute Details

#internal_punctuation ⇒ Object (readonly)

Returns the value of attribute internal_punctuation.



6
7
8
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 6

# Read-only accessor for the +@internal_punctuation+ instance variable.
#
# Returns the stored object itself, not a copy.
#
# @return [Object] whatever is currently held in +@internal_punctuation+
def internal_punctuation
  @internal_punctuation
end

#re_boundary_realignment ⇒ Object (readonly)

Returns the value of attribute re_boundary_realignment.



7
8
9
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 7

# Read-only accessor for the +@re_boundary_realignment+ instance variable.
#
# Returns the stored object itself, not a copy.
#
# @return [Object] whatever is currently held in +@re_boundary_realignment+
def re_boundary_realignment
  @re_boundary_realignment
end

#re_period_context ⇒ Object (readonly)

Returns the value of attribute re_period_context.



4
5
6
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 4

# Read-only accessor for the +@re_period_context+ instance variable.
#
# Returns the stored object itself, not a copy.
#
# @return [Object] whatever is currently held in +@re_period_context+
def re_period_context
  @re_period_context
end

#sent_end_chars ⇒ Object (readonly)

Returns the value of attribute sent_end_chars.



5
6
7
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 5

# Read-only accessor for the +@sent_end_chars+ instance variable.
#
# Returns the stored object itself, not a copy.
#
# @return [Object] whatever is currently held in +@sent_end_chars+
def sent_end_chars
  @sent_end_chars
end

Instance Method Details

#word_tokenize(text) ⇒ Object



29
30
31
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 29

# Splits +text+ into tokens by scanning it with the pattern held in
# +@re_word_tokenizer+.
#
# @param text [#scan] the object to scan; it only needs to respond to
#   +scan+ (e.g. a String)
# @return [Object] whatever +text.scan+ returns for that pattern
def word_tokenize(text)
  tokenizer = @re_word_tokenizer
  text.scan(tokenizer)
end