Module: Spellr::TokenRegexps

Included in:
LineTokenizer
Defined in:
lib/spellr/token_regexps.rb

Constant Summary collapse

TITLE_CASE_RE =
Matches the bracketed part of each example: [Word], [Word]Word, [Word]’s, [Wordn’t]
/[[:upper:]][[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
UPPER_CASE_RE =
Matches the bracketed part of each example: [WORD], [WORD]Word, [WORDN’T], [WORD]’S, [WORD]’s, [WORD]s

/[[:upper:]]+(?:['’][[:upper:]]+(?<!['’][Ss]))*(?:(?![[:lower:]])|(?=s(?![[:lower:]])))/.freeze
LOWER_CASE_RE =
Matches the bracketed part of each example: [word], [word]’s, [wordn’t]
/[[:lower:]]+(?:['’][[:lower:]]+(?<!['’]s))*/.freeze
OTHER_CASE_RE =

for characters in [:alpha:] that aren’t in [:lower:] or [:upper:] e.g. Arabic

/(?:[[:alpha:]](?<![[:lower:][:upper:]]))+/.freeze
# Matches one term in any of the four casing styles. Regexp.union tries the
# alternatives in the order given: title case, upper case, lower case, other.
# Frozen for consistency with the other union constants (SKIPS, URL_RE,
# THREE_CHUNK_RE).
TERM_RE =
Regexp.union(TITLE_CASE_RE, UPPER_CASE_RE, LOWER_CASE_RE, OTHER_CASE_RE).freeze
NOT_EVEN_NON_WORDS_RE =

NON WORDS ####

%r{[^[:alpha:]/%#0-9\\]+}.freeze
LEFTOVER_NON_WORD_BITS_RE =

e.g. a / not starting //a-url.com

%r{[/%#\\]|\d+}.freeze
# Hex literal: "#" plus exactly six or exactly three hex digits (e.g. #c0ffee,
# #fff), or "0x" plus one or more hex digits. The match must not be
# immediately followed by a letter.
HEX_RE =
/(?:#(?:\h{6}|\h{3})|0x\h+)(?![[:alpha:]])/.freeze
# A shell colour escape as literally written in text: a backslash, then "e",
# "33" or "033", then "[", one or more semicolon-separated numbers, and a
# closing "m" (e.g. \e[1;31m or \033[0m).
SHELL_COLOR_ESCAPE_RE =
/\\(?:e|0?33)\[\d+(;\d+)*m/.freeze
BACKSLASH_ESCAPE_RE =

TODO: hex escapes, e.g. \xAA. TODO: language-aware escapes.

/\\[a-zA-Z]/.freeze
REPEATED_SINGLE_LETTERS_RE =

e.g. xxxxxxxx

/(?:([[:alpha:]])\1+)(?![[:alpha:]])/.freeze
# A percent-encoded byte: "%" followed by two uppercase hexadecimal digits
# (e.g. %20, %2F, %9A). The digit range is the full 0-9; the earlier 0-8 range
# silently failed to match any escape containing a 9, such as %29 or %9A.
URL_ENCODED_ENTITIES_RE =
/%[0-9A-F]{2}/.freeze
SEQUENTIAL_LETTERS_RE =

There’s got to be a better way of writing this

/a(?:b(?:c(?:d(?:e(?:f(?:g(?:h(?:i(?:j(?:k(?:l(?:m(?:n(?:o(?:p(?:q(?:r(?:s(?:t(?:u(?:v(?:w(?:x(?:yz?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?)?(?![[:alpha:]])/i.freeze
URL_SCHEME =

I didn’t want to do this myself BUT i need something to heuristically match on, and it’s difficult

%r{(?://|https?://|s?ftp://|mailto:)}.freeze
# One or more alphanumerics, optionally followed by ":" and more
# alphanumerics, then "@" (e.g. user@ or user:pass@).
URL_USERINFO =
/[[:alnum:]]+(?::[[:alnum:]]+)?@/.freeze
# Four dot-separated groups of one to three digits. The groups are not
# range-checked, so 999.999.999.999 also matches.
URL_IP_ADDRESS =
/\d{1,3}(?:\.\d{1,3}){3}/.freeze
URL_HOSTNAME =

literal \ so that i can match on domains in regexps. no-one cares but me.

/(?:[[:alnum:]\-\\]+(?:\.[[:alnum:]\-\\]+)+|localhost|#{URL_IP_ADDRESS})/.freeze
# ":" followed by one or more digits.
URL_PORT =
/:\d+/.freeze
# "/" followed by zero or more path characters: alphanumerics, any of
# = @ ! $ & ~ - / . _ or a literal backslash, or a %XX escape (two hex digits).
URL_PATH =
%r{/(?:[[:alnum:]=@!$&~\-/._\\]|%\h{2})*}.freeze
# One or more query characters: alphanumerics, any of = ! $ - / . _ or a
# literal backslash, or a %XX escape. "&" is not in the class; URL_QUERY uses
# it as the separator between parts.
URL_QUERY_PART =
%r{(?:[[:alnum:]=!$\-/._\\]|%\h{2})+}.freeze
# "?" followed by one query part, then any number of "&"-prefixed parts.
URL_QUERY =
/\?#{URL_QUERY_PART}(?:&#{URL_QUERY_PART})*/.freeze
# "#" followed by one or more fragment characters: alphanumerics, any of
# = ! $ & - / . or a literal backslash, or a %XX escape.
# NOTE(review): unlike URL_PATH and URL_QUERY_PART, "_" is not in this
# character class, so a fragment stops matching at an underscore. Confirm
# whether that is intentional.
URL_FRAGMENT =
%r{#(?:[[:alnum:]=!$&\-/.\\]|%\h{2})+}.freeze
URL_REST =

A URL can be any valid hostname, but it must have at least one of a scheme, userinfo, or a path. It may have any combination of those, and may also have a port, a query, or a fragment.

/#{URL_QUERY}?#{URL_FRAGMENT}?/.freeze
# A URL is any of the following shapes, tried in this order:
#   1. scheme, optional userinfo, hostname, optional port, optional path
#   2. userinfo, hostname, optional port, optional path
#   3. hostname, optional port, mandatory path
# Each shape is followed by URL_REST (an optional query, then an optional
# fragment). A bare hostname with no scheme, userinfo or path does not match.
URL_RE =
Regexp.union(
  /#{URL_SCHEME}#{URL_USERINFO}?#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
  /#{URL_USERINFO}#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}?#{URL_REST}/,
  /#{URL_HOSTNAME}#{URL_PORT}?#{URL_PATH}#{URL_REST}/
).freeze
# Key-like tokens with specific, recognisable formats. The service each one
# belongs to is inferred from the constant name only.

# "SG." + 22 word-or-hyphen characters + "." + 43 word-or-hyphen characters.
KEY_SENDGRID_RE =
/SG\.[\w\-]{22}\.[\w\-]{43}/.freeze
# "prg-" followed by hex digits grouped 8-4-4-4-12.
KEY_HYPERWALLET_RE =
/prg-\h{8}-\h{4}-\h{4}-\h{4}-\h{12}/.freeze
# "GTM-" followed by seven uppercase ASCII letters or digits.
KEY_GTM_RE =
/GTM-[A-Z0-9]{7}/.freeze
# "sha1-" followed by 28 characters from A-Z a-z 0-9 = + /.
KEY_SHA1 =
%r{sha1-[A-Za-z0-9=+/]{28}}.freeze
# "sha512-" followed by 88 characters from A-Z a-z 0-9 = ; + /.
KEY_SHA512 =
%r{sha512-[A-Za-z0-9=;+/]{88}}.freeze
# "data:", a run of [a-z / ; 0-9 -], ";base64,", a base64 payload, optional
# "=" padding, and no alphanumeric immediately after.
KEY_DATA_URL =
%r{data:[a-z/;0-9\-]+;base64,[A-Za-z0-9+/]+=*(?![[:alnum:]])}.freeze
# Union of every key format above, tried in the order listed. Frozen so this
# shared constant is immutable like the other union constants (SKIPS, URL_RE,
# THREE_CHUNK_RE).
KEY_PATTERNS_RE =
Regexp.union(
  KEY_SENDGRID_RE, KEY_HYPERWALLET_RE, KEY_GTM_RE, KEY_SHA1, KEY_SHA512, KEY_DATA_URL
).freeze
# Union of the patterns below, tried in the order listed: runs of characters
# that are not letters, digits, or / % # \, then shell colour escapes,
# backslash escapes, URL-encoded bytes, hex literals, URLs, and key formats.
# NOTE(review): presumably text matching these is skipped rather than
# spellchecked (per the constant name). Confirm against LineTokenizer.
SKIPS =
Regexp.union(
  NOT_EVEN_NON_WORDS_RE,
  SHELL_COLOR_ESCAPE_RE,
  BACKSLASH_ESCAPE_RE,
  URL_ENCODED_ENTITIES_RE,
  HEX_RE,
  URL_RE, # 2%
  KEY_PATTERNS_RE
).freeze
# Union of the leftover non-word bits, repeated-single-letter, and
# sequential-letter patterns, tried in that order.
# NOTE(review): presumably applied after key detection (per the constant
# name). Confirm against LineTokenizer.
# Frozen for consistency with SKIPS and the other union constants.
AFTER_KEY_SKIPS =
Regexp.union(
  LEFTOVER_NON_WORD_BITS_RE,
  REPEATED_SINGLE_LETTERS_RE,
  SEQUENTIAL_LETTERS_RE
).freeze
# An ASCII letter followed by any run of ASCII letters and the separators
# - _ / +.
ALPHA_SEP_RE =
%r{[A-Za-z][A-Za-z\-_/+]*}.freeze
# A digit followed by any run of digits and the separators - _ / +.
NUM_SEP_RE =
%r{\d[\d\-_/+]*}.freeze
# Three alternating chunks anchored at the start of the string (\A): either
# letters-digits-letters or digits-letters-digits.
THREE_CHUNK_RE =
Regexp.union(
  /\A#{ALPHA_SEP_RE}#{NUM_SEP_RE}#{ALPHA_SEP_RE}/,
  /\A#{NUM_SEP_RE}#{ALPHA_SEP_RE}#{NUM_SEP_RE}/
).freeze
# A three-chunk prefix, then any run of ASCII letters, digits, + / - _, then
# optional "=" padding, with no alphanumeric immediately after. It inherits
# the \A anchor from THREE_CHUNK_RE.
POSSIBLE_KEY_RE =
%r{#{THREE_CHUNK_RE}[A-Za-z0-9+/\-_]*=*(?![[:alnum:]])}.freeze
# The literal text "spellr:disable". It is unanchored, so it also matches the
# leading part of "spellr:disable-line" and "spellr:disable:line".
SPELLR_DISABLE_RE =
/spellr:disable/.freeze
# The literal text "spellr:enable".
SPELLR_ENABLE_RE =
/spellr:enable/.freeze
# "spellr:disable" followed by "-" or ":" and then "line".
SPELLR_LINE_DISABLE_RE =
/spellr:disable[-:]line/.freeze

Instance Method Summary collapse

Instance Method Details

#min_alpha_re ⇒ Object

This is in a method because the minimum word length stuff was throwing it off. TODO: move to config maybe?



85
86
87
88
89
90
91
# File 'lib/spellr/token_regexps.rb', line 85

# Returns a frozen, memoized Regexp matching an ASCII word of the configured
# minimum length in one of three shapes: one capital followed by
# (minimum - 1) lowercase letters, all lowercase, or all uppercase.
#
# Built on first call rather than as a constant because the pattern
# interpolates Spellr.config.word_minimum_length. Later changes to that
# setting do not affect the already-memoized value.
def min_alpha_re
  return @min_alpha_re if @min_alpha_re

  min_length = Spellr.config.word_minimum_length
  shapes = [
    /[A-Z][a-z]{#{min_length - 1}}/,
    /[a-z]{#{min_length}}/,
    /[A-Z]{#{min_length}}/
  ]
  @min_alpha_re = Regexp.union(shapes).freeze
end