Module: Splitta::WordTokenizer

Included in:
Frag
Defined in:
lib/splitta/word_tokenizer.rb

Constant Summary collapse

# Ordered tokenization rules, stored flat as alternating
# pattern, replacement, pattern, replacement, ... entries. The list is walked
# two entries at a time, so every pattern must be directly followed by its
# replacement string. Order matters: several later rules depend on the
# spacing introduced by earlier ones.
TOKENIZE_REGEXPS =
[
  # Normalize doubled ASCII quotes and curly double quotes to a plain ".
  /
    '' |
    `` |
    “  |
    ”
  /ux,                         '"',

  # An apostrophe at the start of a line or after whitespace gets a space
  # after it.
  /(^|\s)(')/,                 '\1\2 ',

  # Pad punctuation with a trailing space. The branches are, in order:
  #   * an opening/symbol character itself,
  #   * any character standing right before closing punctuation,
  #   * a closing bracket itself,
  #   * any character standing right before an opening bracket,
  #   * a single dash at the start of a line or after whitespace.
  # The whole alternation sits inside ONE capture group so that \1 is the
  # matched text whichever branch fired. With one group per branch, \1 was
  # empty for every branch but the first and the matched character was
  # replaced by a bare space (e.g. "Hello!" became "Hell !").
  /
    (
      (?=[\("`{\[:;&#*@]). |
      .(?=[?!\)";}\]*:@']) |
      (?=[\)}\]]). |
      .(?=[({\[]) |
      (?:^|\s)-(?=[^-])
    )
  /ux,                         '\1 ',

  # Put spaces around a run of two or more hyphens.
  /([^-])(--+)([^-])/,         '\1 \2 \3',
  # Comma after whitespace/line start and glued to the next token: add a
  # space after it. The lookahead must be "non-whitespace" (\S); the previous
  # `(^\s)` required a line start right after the comma and never matched.
  /(\s|^)(,)(?=\S)/u,          '\1\2 ',
  # Comma followed by whitespace or line end: add a space before it.
  /(.)(,)(\s|$)/u,              '\1 \2\3',
  # Collapse a spaced-out ellipsis ". . ." into "...".
  /\.\s\.\s\./u,               '...',
  # NOTE(review): this pattern has no capture group, so '\1' expands to an
  # empty string and the three matched characters are removed. The class
  # [d|D] also accepts a literal "|", and the trailing "." matches any
  # character. Intent is unclear; left unchanged -- confirm before editing.
  /M[d|D]./,                   '\1',
  # Letter + period immediately followed by digits: separate the digits.
  /([A-Za-z]\.)(\d+)/,         '\1 \2',
  # Put spaces around a run of two or more dots.
  /([^\.]|^)(\.{2,})(.?)/,     '\1 \2 \3',
  # Dot run after whitespace/line start and glued to the next token: add a
  # space after the dots.
  /(^|\s)(\.{2,})([^\.\s])/u,  '\1\2 \3',
  # Mirror case: dot run glued to the previous token and followed by
  # whitespace/line end: add a space before the dots. (This entry used to
  # repeat the previous pattern, which can no longer match once the previous
  # rule has run.)
  /([^\.\s])(\.{2,})(\s|$)/u,  '\1 \2\3',
  # Separate a percent sign from the digit before it.
  /(\d)%/,                     '\1 %',
  # Separate a dollar sign from the amount after it.
  /\$(\.?\d)/,                 '$ \1',
  # Re-join "x& y" into "x&y" (the padding rule above adds a space after &).
  /(\w)& (\w)/,                '\1&\2',
  # Split an ampersand standing between two words of 2+ word characters.
  /(\w\w+)&(\w\w+)/,           '\1 & \2',
  # Move the "n" onto a detached "'t" so that "n 't" becomes " n't".
  /n \'t( |$)/,                ' n\'t\1',
  /N \'T( |$)/,                ' N\'T\1',
  # Split "cannot"/"Cannot" into two words.
  /([Cc])annot/,               '\1an not',
  # Finally collapse every whitespace run into a single space.
  /\s+/,                       ' ',
].freeze

Instance Method Summary collapse

Instance Method Details

#tokenize(text) ⇒ Object

Tokenize a string using the rules above


49
50
51
52
53
54
55
# File 'lib/splitta/word_tokenizer.rb', line 49

# Applies every TOKENIZE_REGEXPS rule, in order, to a copy of +text+.
#
# The rule list is read two entries at a time: a pattern followed by its
# replacement string. The caller's string is never modified; all
# substitutions happen in place on a duplicate.
#
# @param text [String] the text to tokenize
# @return [String] a new string with all rules applied
def tokenize(text)
  TOKENIZE_REGEXPS.each_slice(2).each_with_object(text.dup) do |(pattern, replacement), buffer|
    buffer.gsub!(pattern, replacement)
  end
end