Module: Awesome::Definitions::Stopwords

Included in:
Search
Defined in:
lib/awesome/definitions/stopwords.rb

Defined Under Namespace

Modules: ClassMethods

Constant Summary collapse

# Matches a double-quoted phrase and captures it together with its quotes.
QUOTED_REGEX =
/("[^"]*")/
# Matches a double-quoted phrase and captures only the text between the quotes.
UNQUOTED_REGEX =
/"([^"]*)"/
# Matches a double-quoted phrase without capturing anything.
RM_QUOTED_REGEX =
/"[^"]*"/
# Matches a single + or - at the start of a line.
BEG_OPERATORS =
/^[+-]/
# Matches a single comma, + or - at the end of a line.
END_OPERATORS =
/[,+-]$/
# Matches a single - at the start of a line.
EXCLUSION_OPERATORS =
/^[-]/
# Matches a single + at the start of a line.
INCLUSION_OPERATORS =
/^[+]/

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.included(base) ⇒ Object



13
14
15
16
17
# File 'lib/awesome/definitions/stopwords.rb', line 13

# Inclusion hook: extends +base+ with ClassMethods and declares the
# search_stopwords and verbose_stopwords class-level accessors on it.
def self.included(base)
  base.extend(ClassMethods)
  [:search_stopwords, :verbose_stopwords].each do |accessor_name|
    base.cattr_accessor accessor_name
  end
end

Instance Method Details

#array_with_stopwords(txt) ⇒ Object

“+hair”, “on”, “the”, “grapes,”, “wrath”, “-end”


105
106
107
108
109
# File 'lib/awesome/definitions/stopwords.rb', line 105

# Splits the query, with its quoted exact phrases already removed, on
# whitespace and drops every token that is just a lone comma.
# Example result: ["+hair", "on", "the", "grapes,", "wrath", "-end"]
def array_with_stopwords(txt)
  tokens = query_wo_exact_phrases(txt).split
  tokens.reject { |token| token == ',' }
end

#clean_token(token) ⇒ Object



119
120
121
# File 'lib/awesome/definitions/stopwords.rb', line 119

# Returns a copy of +token+ with a leading + or - and a trailing comma,
# + or - removed (at most one character is stripped from each end).
def clean_token(token)
  patterns = Awesome::Definitions::Stopwords
  without_prefix = token.gsub(patterns::BEG_OPERATORS, '')
  without_prefix.gsub(patterns::END_OPERATORS, '')
end

#gowords_array(txt) ⇒ Object

“+hair”, “grapes,”, “wrath”, “-end”


112
113
114
115
116
117
# File 'lib/awesome/definitions/stopwords.rb', line 112

# Memoized (in gowords) list of the tokens from array_with_stopwords whose
# cleaned form is neither a stopword nor blank. The surviving tokens are
# returned as they were, operators and trailing punctuation included.
# Example result: ["+hair", "grapes,", "wrath", "-end"]
def gowords_array(txt)
  self.gowords ||= array_with_stopwords(txt).reject do |token|
    bare = clean_token(token)
    stopwords.include?(bare) || bare.blank?
  end
end

#highlight_token_array(txt) ⇒ Object



53
54
55
56
57
58
# File 'lib/awesome/definitions/stopwords.rb', line 53

# Memoized (in highlight_tokens) tokens for highlighting: the union of the
# unquoted exact phrases and the go words, ordered longest first and then
# passed through remove_exclusions.
def highlight_token_array(txt)
  cached = highlight_tokens
  return cached if cached

  candidates = unquoted_exact_phrases_array(txt) | gowords_array(txt)
  longest_first = candidates.sort { |left, right| right.length <=> left.length }
  self.highlight_tokens = remove_exclusions(longest_first)
end

#process_stopwords(txt = self.search_text) ⇒ Object

Removes the stopwords from regular search terms, but NOT from exact-phrase (quoted) searches. Example: txt = “+hair "in the" on the grapes, "middle fork" wrath "age of man" -end”



42
43
44
45
46
47
# File 'lib/awesome/definitions/stopwords.rb', line 42

# Returns the search string for +txt+ (search_text by default): the tokens
# from search_token_array joined with single spaces. Stopwords are dropped
# from ordinary terms but quoted exact phrases are left intact.
# Example input: +hair "in the" on the grapes, "middle fork" wrath "age of man" -end
def process_stopwords(txt = self.search_text)
  # Populate the highlight tokens first; highlighting needs them because it
  # can't match quotes.
  highlight_token_array(txt)
  # Reassemble the query from the length-sorted, stopword-free tokens.
  tokens = search_token_array(txt)
  tokens.join(' ')
end

#query_wo_exact_phrases(txt) ⇒ Object

“+hair on the grapes, wrath -end”



100
101
102
# File 'lib/awesome/definitions/stopwords.rb', line 100

# Memoized (in query_without_exact_phrases) copy of +txt+ with every
# double-quoted phrase deleted. Whitespace around a removed phrase is left
# in place. Example result: "+hair  on the grapes,  wrath  -end"
def query_wo_exact_phrases(txt)
  cached = query_without_exact_phrases
  return cached if cached

  quoted_phrase = Awesome::Definitions::Stopwords::QUOTED_REGEX
  self.query_without_exact_phrases = txt.gsub(quoted_phrase, '')
end

#quoted_exact_phrases_array(txt) ⇒ Object

“"in the"”, “"middle fork"”, “"age of man"”


90
91
92
# File 'lib/awesome/definitions/stopwords.rb', line 90

# Memoized (in quoted_exact_phrases) list of the quoted phrases in +txt+,
# quotes included: the segments from tokenize_quot minus the segments from
# tokenize_without_quot, with empty strings removed.
# Example result: ["\"in the\"", "\"middle fork\"", "\"age of man\""]
def quoted_exact_phrases_array(txt)
  self.quoted_exact_phrases ||= begin
    all_segments = tokenize_quot(txt)
    unquoted_segments = tokenize_without_quot(txt)
    all_segments - unquoted_segments - ['']
  end
end

#remove_exclusions(array) ⇒ Object



64
65
66
67
68
69
70
71
72
# File 'lib/awesome/definitions/stopwords.rb', line 64

# Returns a new array built from +array+ in which
# * tokens starting with - (exclusions) are dropped,
# * tokens containing a double-quoted phrase are kept unchanged,
# * every other token has a leading + removed.
def remove_exclusions(array)
  patterns = Awesome::Definitions::Stopwords
  kept = array.reject { |tok| tok.match(patterns::EXCLUSION_OPERATORS) }
  kept.map do |tok|
    if tok.match(patterns::RM_QUOTED_REGEX)
      tok
    else
      tok.gsub(patterns::INCLUSION_OPERATORS, '')
    end
  end
end

#search_token_array(txt) ⇒ Object



49
50
51
# File 'lib/awesome/definitions/stopwords.rb', line 49

# Memoized (in search_tokens) tokens for the search itself: the union of the
# quoted exact phrases and the go words, ordered longest first.
def search_token_array(txt)
  self.search_tokens ||= begin
    tokens = quoted_exact_phrases_array(txt) | gowords_array(txt)
    tokens.sort { |first, second| second.length <=> first.length }
  end
end

#set_clean_search_query ⇒ Object



60
61
62
# File 'lib/awesome/definitions/stopwords.rb', line 60

# Stores the highlight tokens, joined with single spaces, in
# clean_search_query and returns that string.
#
# When highlight_tokens has not been populated yet, the tokens are computed
# with highlight_token_array(search_text) instead of calling join on nil,
# which used to raise NoMethodError. When highlight_tokens is already set,
# search_text is not consulted and the result is the same as before.
def set_clean_search_query
  tokens = highlight_tokens || highlight_token_array(search_text)
  self.clean_search_query = tokens.join(' ')
end

#tokenize_quot(txt) ⇒ Object

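Splits the text on quoted phrases; the result contains each quoted phrase (quotes kept) together with the text between the phrases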



75
76
77
# File 'lib/awesome/definitions/stopwords.rb', line 75

# Memoized (in tokenize_quoted) split of +txt+ on double-quoted phrases.
# The pattern captures each phrase, so the result holds the quoted phrases
# (quotes kept) interleaved with the text found between them.
def tokenize_quot(txt)
  self.tokenize_quoted ||= begin
    splitter = Awesome::Definitions::Stopwords::QUOTED_REGEX
    txt.split(splitter)
  end
end

#tokenize_unquot(txt) ⇒ Object

Splits the text on quoted phrases; the result contains each quoted phrase in its unquoted form together with the text between the phrases



80
81
82
# File 'lib/awesome/definitions/stopwords.rb', line 80

# Memoized (in tokenize_unquoted) split of +txt+ on double-quoted phrases.
# The pattern captures only the text inside the quotes, so the result holds
# the phrases without their quotes interleaved with the text between them.
def tokenize_unquot(txt)
  cached = tokenize_unquoted
  return cached if cached

  splitter = Awesome::Definitions::Stopwords::UNQUOTED_REGEX
  self.tokenize_unquoted = txt.split(splitter)
end

#tokenize_without_quot(txt) ⇒ Object

Splits the text on quoted phrases and drops them; the result contains only the text between the phrases



85
86
87
# File 'lib/awesome/definitions/stopwords.rb', line 85

# Memoized (in tokenize_without_quoted) split of +txt+ on double-quoted
# phrases. Nothing is captured, so only the text between the phrases is
# returned.
def tokenize_without_quot(txt)
  self.tokenize_without_quoted || begin
    splitter = Awesome::Definitions::Stopwords::RM_QUOTED_REGEX
    self.tokenize_without_quoted = txt.split(splitter)
  end
end

#unquoted_exact_phrases_array(txt) ⇒ Object

“in the”, “middle fork”, “age of man”


95
96
97
# File 'lib/awesome/definitions/stopwords.rb', line 95

# Memoized (in unquoted_exact_phrases) list of the exact phrases in +txt+
# without their quotes: the segments from tokenize_unquot minus the segments
# from tokenize_without_quot, with empty strings removed.
# Example result: ["in the", "middle fork", "age of man"]
# NOTE(review): because this is an array difference, a phrase whose text is
# identical to one of the between-phrase segments is removed as well.
def unquoted_exact_phrases_array(txt)
  self.unquoted_exact_phrases ||= begin
    all_segments = tokenize_unquot(txt)
    between_phrases = tokenize_without_quot(txt)
    all_segments - between_phrases - ['']
  end
end