Module: PragmaticSegmenter::Cleaner::Rules

Included in:
PragmaticSegmenter::Cleaner
Defined in:
lib/pragmatic_segmenter/cleaner/rules.rb

Defined Under Namespace

Modules: HTML, PDF

Constant Summary

NewLineInMiddleOfWordRule =
Rule.new(/\n(?=[a-zA-Z]{1,2}\n)/, '')
DoubleNewLineWithSpaceRule =
Rule.new(/\n \n/, "\r")
DoubleNewLineRule =
Rule.new(/\n\n/, "\r")
NewLineFollowedByPeriodRule =
Rule.new(/\n(?=\.(\s|\n))/, '')
ReplaceNewlineWithCarriageReturnRule =
Rule.new(/\n/, "\r")
EscapedNewLineRule =
Rule.new(/\\n/, "\n")
EscapedCarriageReturnRule =
Rule.new(/\\r/, "\r")
TypoEscapedNewLineRule =
Rule.new(/\\\ n/, "\n")
TypoEscapedCarriageReturnRule =
Rule.new(/\\\ r/, "\r")
InlineFormattingRule =
Rule.new(/\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}/, '')
TableOfContentsRule =
Rule.new(/\.{5,}\s*\d+-*\d*/, "\r")
ConsecutivePeriodsRule =
Rule.new(/\.{5,}/, ' ')
ConsecutiveForwardSlashRule =
Rule.new(/\/{3}/, '')
NO_SPACE_BETWEEN_SENTENCES_REGEX =
/(?<=[a-z])\.(?=[A-Z])/
NoSpaceBetweenSentencesRule =
Rule.new(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')
NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX =
/(?<=\d)\.(?=[A-Z])/
NoSpaceBetweenSentencesDigitRule =
Rule.new(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')
URL_EMAIL_KEYWORDS =
['@', 'http', '.com', 'net', 'www', '//']
NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX =
/(?<=\s)\n(?=([a-z]|\())/
NewLineFollowedByBulletRule =
Rule.new(/\n(?=•)/, "\r")
QuotationsFirstRule =
Rule.new(/''/, '"')
QuotationsSecondRule =
Rule.new(/``/, '"')