Module: SRX::English

Defined in:
lib/srx/english/word_splitter.rb,
lib/srx/english/sentence_splitter.rb

Defined Under Namespace

Classes: SentenceSplitter, WordSplitter

Constant Summary

# Segmentation rule table.
#
# Each entry is a three-element array:
#
#   [before_pattern, after_pattern, flag]
#
# * before_pattern - regexp source (String); REGEXPS compiles it anchored at
#   the END of a string (`\Z`).
# * after_pattern  - regexp source (String) or nil; REGEXPS compiles it
#   anchored at the START of a string (`\A`). nil interpolates to an empty
#   group, which always matches.
# * flag           - boolean carried through unchanged into REGEXPS.
#   NOTE(review): presumably true marks a "break here" rule and false a
#   "do not break" exception — confirm against the splitter that consumes it.
#
# The order of the entries is preserved as given; do not reorder without
# checking how the consumer iterates over them.
RULES = [
  [
    "(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )",
    nil,
    false
  ],
  ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false],
  ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false],
  # NOTE(review): `[Vvol]` is a ONE-character class (V, v, o or l) and
  # `[Rr]col` matches "Rcol"/"rcol"; these look like typos (perhaps for
  # `[Vv]ol` and `[Cc]ol`) but are left unchanged — confirm the intent.
  [
    "(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)",
    "[^\\p{Lu}]|I",
    false
  ],
  # The after-pattern must be "anything but an uppercase letter". It was
  # previously written without the backslash (`[^p{Lu}]`), which is a negated
  # class of the literal characters p, {, L, u and }.
  ["(?:\\b[Ee]tc\\.\\s)", "[^\\p{Lu}]", false],
  ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false],
  ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
  ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
  ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false],
  ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
  [
    "(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)",
    nil,
    true
  ],
  [
    "(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)",
    "\\p{Lu}[^\\p{Lu}]",
    true
  ],
  ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]
].freeze
# Single regexp that matches when ANY rule's before-pattern matches at the
# end of a string. Each rule's first element is wrapped in its own capture
# group, the groups are joined with "|", and the whole alternation is
# anchored with `\Z` and compiled with the /m flag.
BEFORE_RE = begin
  grouped_sources = RULES.map { |rule| "(#{rule.first})" }
  alternation = grouped_sources.join("|")
  /(?:#{alternation})\Z/m
end
# Compiled form of RULES: one [before_regexp, after_regexp, flag] triple per
# rule, in the same order.
#
# * before_regexp - the rule's first element in a capture group, anchored at
#   the end of a string (`\Z`), /m flag.
# * after_regexp  - the rule's second element in a capture group, anchored at
#   the start of a string (`\A`), /m flag. A nil second element interpolates
#   to an empty group, which matches any string.
# * flag          - the rule's third element, passed through untouched.
REGEXPS = RULES.map do |before_src, after_src, flag|
  before_regexp = /(#{before_src})\Z/m
  after_regexp = /\A(#{after_src})/m
  [before_regexp, after_regexp, flag]
end
# Matches exactly one character at the very start of a string. The /m flag
# lets `.` match a newline too, so a leading "\n" is also matched.
FIRST_CHAR = %r{\A.}m