Module: SRX::English
- Defined in:
  - lib/srx/english/word_splitter.rb
  - lib/srx/english/sentence_splitter.rb
Defined Under Namespace
Classes: SentenceSplitter, WordSplitter
Constant Summary
- RULES =
[["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )", nil, false], ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false], ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false], ["(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)", "[^\\p{Lu}]|I", false], ["(?:\\b[Ee]tc\\.\\s)", "[^p{Lu}]", false], ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false], ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false], ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false], ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false], ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false], ["(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)", nil, true], ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)", "\\p{Lu}[^\\p{Lu}]", true], ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
- BEFORE_RE =
/(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
- REGEXPS =
RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
- FIRST_CHAR =
/\A./m