Class: Punkt::LanguageVars
- Inherits: Object
- Inheritance chain: Object → Punkt::LanguageVars
- Defined in:
- lib/punkt-segmenter/punkt/language_vars.rb
Instance Attribute Summary
-
#internal_punctuation ⇒ Object
readonly
Returns the value of attribute internal_punctuation.
-
#re_boundary_realignment ⇒ Object
readonly
Returns the value of attribute re_boundary_realignment.
-
#re_period_context ⇒ Object
readonly
Returns the value of attribute re_period_context.
-
#sent_end_chars ⇒ Object
readonly
Returns the value of attribute sent_end_chars.
Instance Method Summary
-
#initialize ⇒ LanguageVars
constructor
A new instance of LanguageVars.
- #word_tokenize(text) ⇒ Object
Constructor Details
#initialize ⇒ LanguageVars
Returns a new instance of LanguageVars.
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 9
# Builds the punctuation lists and regular expressions held by this object.
#
# Each value is stored in an instance variable. The later patterns
# (@re_word_tokenizer, @re_period_context) are assembled by interpolating
# the earlier ones, so the assignment order below matters.
#
# @return [LanguageVars] a new instance of LanguageVars
def initialize
  # Sentence-ending characters, as a list and as the equivalent character class.
  @sent_end_chars    = ['.', '?', '!']
  @re_sent_end_chars = /[.?!]/

  @internal_punctuation = [',', ':', ';']

  # One or more closing quotes/brackets at the very start of the string,
  # followed by whitespace, by "--" (looked at, not consumed) or by an end
  # of line. Anchored with \A instead of ^: in Ruby, ^ matches after every
  # newline, so Regexp#match could otherwise succeed in the middle of a
  # multi-line string and leave unrelated text in pre_match.
  # (The /m option only changes how "." behaves; it is kept as-is.)
  @re_boundary_realignment = /\A["\')\]}]+?(?:\s+|(?=--)|$)/m

  # Any single character that is NOT one of the listed punctuation marks.
  @re_word_start = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/

  # Any single character from the listed set of punctuation marks.
  @re_non_word_chars = /(?:[?!)\";}\]\*:@\'\({\[])/

  # Two or more hyphens, two or more periods, or a spaced ellipsis (". . .").
  @re_multi_char_punct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/

  # Three alternatives, tried in order:
  #   1. a multi-character punctuation run;
  #   2. the shortest run of non-space characters that begins with a
  #      word-start character and is followed by whitespace, end of line,
  #      a non-word character, multi-character punctuation, or a comma
  #      that is itself followed by one of those;
  #   3. any single non-space character.
  @re_word_tokenizer = /#{@re_multi_char_punct}|(?=#{@re_word_start})\S+?(?=\s|$|#{@re_non_word_chars}|#{@re_multi_char_punct}|,(?=$|\s|#{@re_non_word_chars}|#{@re_multi_char_punct}))|\S/

  # A run of non-space characters ending in a sentence-end character. The
  # lookahead captures what follows as `after_tok` (a non-word character,
  # or whitespace plus the next token); the next token alone is `next_tok`.
  @re_period_context = /\S*#{@re_sent_end_chars}(?=(?<after_tok>#{@re_non_word_chars}|\s+(?<next_tok>\S+)))/
end
Instance Attribute Details
#internal_punctuation ⇒ Object (readonly)
Returns the value of attribute internal_punctuation.
6 7 8 |
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 6
# Read-only accessor for the internal_punctuation attribute.
#
# @return [Object] the object currently stored in @internal_punctuation
#   (returned as-is, no copy is made)
def internal_punctuation
  @internal_punctuation
end
#re_boundary_realignment ⇒ Object (readonly)
Returns the value of attribute re_boundary_realignment.
7 8 9 |
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 7
# Read-only accessor for the re_boundary_realignment attribute.
#
# @return [Object] the object currently stored in @re_boundary_realignment
#   (returned as-is, no copy is made)
def re_boundary_realignment
  @re_boundary_realignment
end
#re_period_context ⇒ Object (readonly)
Returns the value of attribute re_period_context.
4 5 6 |
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 4
# Read-only accessor for the re_period_context attribute.
#
# @return [Object] the object currently stored in @re_period_context
#   (returned as-is, no copy is made)
def re_period_context
  @re_period_context
end
#sent_end_chars ⇒ Object (readonly)
Returns the value of attribute sent_end_chars.
5 6 7 |
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 5
# Read-only accessor for the sent_end_chars attribute.
#
# @return [Object] the object currently stored in @sent_end_chars
#   (returned as-is, no copy is made)
def sent_end_chars
  @sent_end_chars
end
Instance Method Details
#word_tokenize(text) ⇒ Object
29 30 31 |
# File 'lib/punkt-segmenter/punkt/language_vars.rb', line 29
# Splits +text+ into tokens by scanning it with @re_word_tokenizer.
#
# @param text [#scan] the text to tokenize; only +scan+ is called on it
# @return [Object] whatever +text.scan+ returns for that pattern
#   (for a String and a pattern without capture groups, an Array of the
#   matched substrings in order of appearance)
def word_tokenize(text)
  text.scan(@re_word_tokenizer)
end