Module: Spellchecker::DetectDuplicate

Defined in:
lib/spellchecker/detect_duplicate.rb

Constant Summary collapse

# Minimum length of a token's text for .call to examine it at all.
MIN_LENGTH = 2

# Downcased words that .call never reports when they start a candidate.
SKIP_WORDS =
Set.new(
  %w[very many truly yeah much far yada yare blah
     bla etc win toco really super peri long
     had have happened good goody ever dub bye
     mommy wild that right well huge large dan tan
     yum yummy agar kori lai please mumble extremely
     highly root whoa knock check woof bounce bouncy
     million tut wow mola paw hubba histrio cha nom
     chop same extra more bang big go no pom la ah
     ha oh ew hey]
).freeze

# Downcased corrections for which .call does not report a duplicate.
SKIP_PHRASES =
Set.new(['try and', 'and try', 'and again', 'again and',
'hand in', 'over and', 'and over', 'more and',
'and more', 'test and', 'and test', 'after month',
'bigger and', 'and bigger', 'hours and', 'and hours',
'month after', 'and deeper', 'deeper and', 'step by',
'by step', 'and purred', 'pages of', 'and lots',
'and on', 'face to', 'louder and', 'and louder',
'and thousands', 'day by', 'years and', 'such and',
'and so', 'and such', 'one by', 'side to',
'thousands of', 'back to', 'bit by', 'years of',
'days of', 'weeks of']).freeze

# Words that suppress a report when they recur at alternating positions
# (t1/t3 or t2/t4). Frozen like the other sets so it cannot be mutated
# at runtime.
SKIP_PHRASE_WORDS =
Set.new(%w[and]).freeze

Class Method Summary collapse

Class Method Details

.call(token) ⇒ Spellchecker::Mistake?

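Parameters:

  • token — the first token of the candidate duplicate; the three following tokens are reached through successive `next` calls

Returns:

  • (Spellchecker::Mistake, nil) — a mistake of type DUPLICATE, or nil when nothing is reported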



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/spellchecker/detect_duplicate.rb', line 37

# Checks whether +token+ starts a duplicated word or word pair.
#
# @param token the first token of the candidate; the next three tokens are
#   read through successive +next+ calls
# @return [Mistake, nil] a DUPLICATE mistake positioned at +token+, or nil
#   when no duplicate is found or one of the skip rules applies
def call(token)
  first = token

  return if first.text.length < MIN_LENGTH || SKIP_WORDS.include?(first.downcased)

  second = first.next
  third = second.next
  fourth = third.next
  window = [first, second, third, fourth]

  text, correction = find_duplicate(*window)
  return unless text

  return if SKIP_PHRASES.include?(correction.downcase)
  return unless Dictionaries::EnglishWords.include?(second.text)

  # Same checks, in the same order, as the individual guard clauses.
  suppressed = skip_phrase?(*window) || repetition?(*window) ||
               from_to_phrase?(first, second, third) || quoted?(*window)
  return if suppressed

  Mistake.new(text: text, correction: correction,
              position: token.position, type: MistakeTypes::DUPLICATE)
end

.find_duplicate(t1, t2, t3, t4) ⇒ Array(String, String)?



67
68
69
70
71
72
73
# File 'lib/spellchecker/detect_duplicate.rb', line 67

# Looks for an immediately repeated word (t1 t2) or an immediately repeated
# word pair (t1 t2 t3 t4), comparing tokens through +downcased+.
#
# @return [Array(String, String), nil] the duplicated text and the suggested
#   correction, or nil when neither pattern matches
def find_duplicate(t1, t2, t3, t4)
  same_word = t1.downcased == t2.downcased && !t2.capital? && !t2.digit?
  return [[t1.text, t2.text].join(' '), t1.text] if same_word

  same_pair = [t1, t2].map(&:downcased) == [t3, t4].map(&:downcased) &&
              !t3.capital? && !t3.digit?
  return unless same_pair

  [[t1, t2, t3, t4].map(&:text).join(' '), [t1.text, t2.text].join(' ')]
end

.from_to_phrase?(t1, t2, t3) ⇒ Boolean

Returns:

  • (Boolean)


99
100
101
# File 'lib/spellchecker/detect_duplicate.rb', line 99

# Tells whether the tokens read "from X to X".
#
# @return [Boolean] true when the token before t1 is "from", t2 is "to" and
#   t1 and t3 have the same +downcased+ value
def from_to_phrase?(t1, t2, t3)
  return false unless t1.prev.downcased == 'from'
  return false unless t2.downcased == 'to'

  t1.downcased == t3.downcased
end

.quoted?(t1, _t2, t3, t4) ⇒ Boolean

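Tells whether the candidate is wrapped in double quotes: the token before t1 is `"` and either t3 or t4 is `"`. (The source carries the linter pragma `rubocop:enable Metrics/AbcSize` at this point; it is not part of the method's documentation.)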

Returns:

  • (Boolean)


95
96
97
# File 'lib/spellchecker/detect_duplicate.rb', line 95

# Tells whether the candidate is enclosed in double quotes.
#
# @return [Boolean] true when the token before t1 is a double quote and
#   t3 or t4 is a double quote as well
def quoted?(t1, _t2, t3, t4)
  quote = '"'
  return false unless t1.prev.text == quote

  [t3, t4].any? { |tok| tok.text == quote }
end

.repetition?(t1, t2, t3, t4) ⇒ Boolean

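Tells whether the same word occurs at least three times in the window running from the token before t1 to the token after t4, in one of four checked patterns (for example t1 = t3 = the token after t4, or t1 = t2 = t3). (The source carries the linter pragma `rubocop:disable Metrics/AbcSize` at this point; it is not part of the method's documentation.)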

Returns:

  • (Boolean)


83
84
85
86
87
88
89
90
91
92
# File 'lib/spellchecker/detect_duplicate.rb', line 83

# Tells whether one word shows up three times around the candidate, looking
# from the token before t1 up to the token after t4.
#
# @return [Boolean] true when any of the four patterns below matches
def repetition?(t1, t2, t3, t4)
  word = t1.downcased

  # t1, t3 and the token after t4 are the same word.
  (word == t3.downcased && word == t4.next.downcased) ||
    # The token before t1, t2 and t4 are the same word.
    (t1.prev.downcased == t2.downcased && t2.downcased == t4.downcased) ||
    # The token before t1, t1 and t3 are the same word.
    (t1.prev.downcased == word && word == t3.downcased) ||
    # t1 and t2 match, and so does t3, the token before t1, or t4.
    (word == t2.downcased && [t3, t1.prev, t4].any? { |tok| word == tok.downcased })
end

.skip_phrase?(t1, t2, t3, t4) ⇒ Boolean

Returns:

  • (Boolean)


75
76
77
78
79
80
# File 'lib/spellchecker/detect_duplicate.rb', line 75

# Tells whether a word from SKIP_PHRASE_WORDS recurs at alternating
# positions, i.e. as both t1 and t3 or as both t2 and t4.
#
# @return [Boolean]
def skip_phrase?(t1, t2, t3, t4)
  [[t1, t3], [t2, t4]].any? do |left, right|
    left.downcased == right.downcased && SKIP_PHRASE_WORDS.include?(left.downcased)
  end
end