Module: Mongoid::FullTextSearch::ClassMethods
- Defined in:
- lib/mongoid_fulltext.rb
Instance Method Summary collapse
- #all_ngrams(str, config, bound_number_returned = true) ⇒ Object
- #create_fulltext_indexes ⇒ Object
- #fulltext_search(query_string, options = {}) ⇒ Object
- #fulltext_search_ensure_indexes(index_name, config) ⇒ Object
- #fulltext_search_in(*args) ⇒ Object
- #instantiate_mapreduce_result(result) ⇒ Object
- #instantiate_mapreduce_results(results, options) ⇒ Object
- #remove_from_ngram_index ⇒ Object
- #update_ngram_index ⇒ Object
Instance Method Details
#all_ngrams(str, config, bound_number_returned = true) ⇒ Object
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 |
# File 'lib/mongoid_fulltext.rb', line 188

# Break +str+ into scored n-grams (plus optional short prefixes and full
# words) according to the full-text index configuration +config+.
#
# Returns a Hash mapping each n-gram string to a Float relevance score.
# Scores are chosen so that multiplying the scores of matching n-grams
# ranks an exact match above any string that merely contains (or is
# contained in) the query. When +bound_number_returned+ is true, at most
# config[:max_ngrams_to_search] evenly spaced n-grams are extracted.
def all_ngrams(str, config, bound_number_returned = true)
  return {} if str.nil?

  # Optionally strip accents; which helper runs depends on what's loaded.
  if config[:remove_accents]
    if defined?(UnicodeUtils)
      str = UnicodeUtils.nfkd(str)
    elsif defined?(DiacriticsFu)
      str = DiacriticsFu::escape(str)
    end
  end

  # Drop every character that is neither in the alphabet nor a word separator.
  filtered_str = str.mb_chars.downcase.to_s.chars.select do |ch|
    config[:alphabet][ch] || config[:word_separators][ch]
  end.join

  width = config[:ngram_width]

  # If we can't afford to extract every n-gram, stride over the string in
  # evenly spaced steps. For example, to extract 3 3-letter n-grams from
  # 'abcdefghijk', we'd want 'abc', 'efg', and 'ijk'.
  step_size =
    if bound_number_returned
      [((filtered_str.length - width).to_f / config[:max_ngrams_to_search]).ceil, 1].max
    else
      1
    end

  # Build records of the form {:ngram => x, :score => y} for every n-gram
  # at the chosen stride. Word-initial n-grams (and the string-initial one)
  # get a slightly higher square-root-based score, which gives the overall
  # score function the property that score(x,y) > score(x,z) for any string
  # z containing y and for any string z contained in y.
  ngram_array = (0..filtered_str.length - width).step(step_size).map do |i|
    at_word_start = i == 0 ||
      (config[:apply_prefix_scoring_to_all_words] &&
       config[:word_separators].has_key?(filtered_str[i - 1].chr))
    score = at_word_start ? Math.sqrt(1 + 1.0 / filtered_str.length) : Math.sqrt(2.0 / filtered_str.length)
    { :ngram => filtered_str[i, width], :score => score }
  end

  # An n-gram occurring several times in the query keeps only its max score.
  ngram_array = ngram_array.group_by { |rec| rec[:ngram] }.map do |ngram, recs|
    { :ngram => ngram, :score => recs.map { |rec| rec[:score] }.max }
  end

  if config[:index_short_prefixes] || config[:index_full_words]
    separator_chars = config[:word_separators].keys.map { |k| Regexp.escape(k) }.join
    all_words = filtered_str.split(Regexp.compile("[#{separator_chars}]"))
  end

  # 'Short prefix' records: the first (ngram_width - 1) characters of each word.
  if config[:index_short_prefixes]
    prefixes_seen = {}
    all_words.each do |word|
      next if word.length < width - 1
      prefix = word[0...width - 1]
      next unless prefixes_seen[prefix].nil? &&
                  (config[:stop_words][word].nil? || word == filtered_str)
      ngram_array << { :ngram => prefix, :score => 1 + 1.0 / filtered_str.length }
      prefixes_seen[prefix] = true
    end
  end

  # One record per full word in the string that isn't a stop word.
  if config[:index_full_words]
    full_words_seen = {}
    all_words.each do |word|
      next unless word.length > 1 && full_words_seen[word].nil? &&
                  (config[:stop_words][word].nil? || word == filtered_str)
      ngram_array << { :ngram => word, :score => 1 + 1.0 / filtered_str.length }
      full_words_seen[word] = true
    end
  end

  # A token appearing as any combination of n-gram, short prefix, and full
  # word keeps the sum of its scores.
  Hash[ngram_array.group_by { |rec| rec[:ngram] }.map do |ngram, recs|
    [ngram, recs.map { |rec| rec[:score] }.sum]
  end]
end
#create_fulltext_indexes ⇒ Object
63 64 65 66 67 68 |
# File 'lib/mongoid_fulltext.rb', line 63 def create_fulltext_indexes return unless self.mongoid_fulltext_config self.mongoid_fulltext_config.each_pair do |index_name, fulltext_config| fulltext_search_ensure_indexes(index_name, fulltext_config) end end |
#fulltext_search(query_string, options = {}) ⇒ Object
107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
# File 'lib/mongoid_fulltext.rb', line 107

# Run a full-text query against one of this model's full-text indexes.
#
# query_string - the text to search for.
# options      - :max_results (default 10), :return_scores (default false),
#                :index (which index to search; required when the model has
#                more than one). Any remaining entries are treated as
#                filter criteria on the index documents.
#
# Returns an Array of matching documents, best matches first (or
# [document, score] pairs when :return_scores is truthy).
# Raises UnspecifiedIndexError when several indexes exist and none is named.
#
# NOTE(review): the scraped source had the local `options`/`error_message`
# identifiers stripped; they are restored here to match the method's
# documented signature.
def fulltext_search(query_string, options = {})
  max_results   = options.has_key?(:max_results) ? options.delete(:max_results) : 10
  return_scores = options.has_key?(:return_scores) ? options.delete(:return_scores) : false
  if self.mongoid_fulltext_config.count > 1 and !options.has_key?(:index)
    error_message = '%s is indexed by multiple full-text indexes. You must specify one by passing an :index_name parameter'
    raise UnspecifiedIndexError, error_message % self.name, caller
  end
  index_name = options.has_key?(:index) ? options.delete(:index) : self.mongoid_fulltext_config.keys.first

  # Options hash should only contain filters after this point

  ngrams = all_ngrams(query_string, self.mongoid_fulltext_config[index_name])
  return [] if ngrams.empty?

  # For each ngram, construct the query we'll use to pull index documents and
  # get a count of the number of index documents containing that n-gram
  ordering = { 'score' => -1 }
  limit = self.mongoid_fulltext_config[index_name][:max_candidate_set_size]
  coll = collection.database[index_name]
  cursors = ngrams.map do |ngram|
    query = { 'ngram' => ngram[0] }
    query.update(map_query_filters(options))
    count = coll.find(query).count
    { :ngram => ngram, :count => count, :query => query }
  end.sort! { |record1, record2| record1[:count] <=> record2[:count] }

  # Using the queries we just constructed and the n-gram frequency counts we
  # just computed, pull in about *:max_candidate_set_size* candidates by
  # considering the n-grams in order of increasing frequency. When we've
  # spent all *:max_candidate_set_size* candidates, pull the top-scoring
  # *max_results* candidates for each remaining n-gram.
  results_so_far = 0
  candidates_list = cursors.map do |doc|
    next if doc[:count] == 0
    query_result = coll.find(doc[:query])
    if results_so_far >= limit
      query_result = query_result.sort(ordering).limit(max_results)
    elsif doc[:count] > limit - results_so_far
      query_result = query_result.sort(ordering).limit(limit - results_so_far)
    end
    results_so_far += doc[:count]
    ngram_score = ngrams[doc[:ngram][0]]
    Hash[query_result.map do |candidate|
      [candidate['document_id'],
       { :clazz => candidate['class'], :score => candidate['score'] * ngram_score }]
    end]
  end.compact

  # Finally, score all candidates by matching them up with other candidates that are
  # associated with the same document. This is similar to how you might process a
  # boolean AND query, except that with an AND query, you'd stop after considering
  # the first candidate list and matching its candidates up with candidates from other
  # lists, whereas here we want the search to be a little fuzzier so we'll run through
  # all candidate lists, removing candidates as we match them up.
  all_scores = []
  while !candidates_list.empty?
    candidates = candidates_list.pop
    scores = candidates.map do |candidate_id, data|
      { :id => candidate_id,
        :clazz => data[:clazz],
        :score => data[:score] + candidates_list.map { |others| (others.delete(candidate_id) || { :score => 0 })[:score] }.sum }
    end
    all_scores.concat(scores)
  end
  all_scores.sort! { |document1, document2| -document1[:score] <=> -document2[:score] }

  instantiate_mapreduce_results(all_scores[0..max_results - 1], { :return_scores => return_scores })
end
#fulltext_search_ensure_indexes(index_name, config) ⇒ Object
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
# File 'lib/mongoid_fulltext.rb', line 70 def fulltext_search_ensure_indexes(index_name, config) db = collection.database coll = db[index_name] # The order of filters matters when the same index is used from two or more collections. filter_indexes = (config[:filters] || []).map do |key,value| ["filter_values.#{key}", 1] end.sort_by { |filter_index| filter_index[0] } index_definition = [['ngram', 1], ['score', -1]].concat(filter_indexes) # Since the definition of the index could have changed, we'll clean up by # removing any indexes that aren't on the exact. correct_keys = index_definition.map{ |field_def| field_def[0] } all_filter_keys = filter_indexes.map{ |field_def| field_def[0] } coll.indexes.each do |idef| keys = idef['key'].keys next if !keys.member?('ngram') all_filter_keys |= keys.find_all{ |key| key.starts_with?('filter_values.') } if keys & correct_keys != correct_keys Mongoid.logger.info "Dropping #{idef['name']} [#{keys & correct_keys} <=> #{correct_keys}]" if Mongoid.logger coll.indexes.drop(idef['key']) end end if all_filter_keys.length > filter_indexes.length filter_indexes = all_filter_keys.map {|key| [key, 1] }.sort_by { |filter_index| filter_index[0] } index_definition = [['ngram', 1], ['score', -1]].concat(filter_indexes) end Mongoid.logger.info "Ensuring fts_index on #{coll.name}: #{index_definition}" if Mongoid.logger coll.indexes.create(Hash[index_definition], { :name => 'fts_index' }) Mongoid.logger.info "Ensuring document_id index on #{coll.name}" if Mongoid.logger coll.indexes.create('document_id' => 1) # to make removes fast end |
#fulltext_search_in(*args) ⇒ Object
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/mongoid_fulltext.rb', line 17

# Macro: declare a full-text index over the given fields of this model.
#
# args - the fields to index (defaults to [:to_s] when none are given),
#        optionally followed by an options Hash that may name the index
#        (:index_name) and/or override any configuration default below
#        (:ngram_width, :stop_words, :filters, :reindex_immediately, ...).
#
# Registers the configuration under self.mongoid_fulltext_config and wires
# up the before_save / before_destroy callbacks that keep the index fresh.
#
# NOTE(review): the scraped source had the local `options` identifier
# stripped; it is restored here.
def fulltext_search_in(*args)
  self.mongoid_fulltext_config = {} if self.mongoid_fulltext_config.nil?
  options = args.last.is_a?(Hash) ? args.pop : {}
  if options.has_key?(:index_name)
    index_name = options[:index_name]
  else
    index_name = 'mongoid_fulltext.index_%s_%s' % [self.name.downcase, self.mongoid_fulltext_config.count]
  end

  # Defaults; anything supplied in the options hash overrides these.
  config = {
    :alphabet => 'abcdefghijklmnopqrstuvwxyz0123456789 ',
    :word_separators => "-_ \n\t",
    :ngram_width => 3,
    :max_ngrams_to_search => 6,
    :apply_prefix_scoring_to_all_words => true,
    :index_full_words => true,
    :index_short_prefixes => false,
    :max_candidate_set_size => 1000,
    :remove_accents => true,
    :reindex_immediately => true,
    :stop_words => Hash[%w[
      i a s t me my we he it am is be do an if or as of at by to up in on
      no so our you him his she her its who are was has had did the and
      but for out off why how all any few nor not own too can don now
      ours your hers they them what whom this that were been have does
      with into from down over then once here when both each more most
      some such only same than very will just yours their which these
      those being doing until while about after above below under again
      there where other myself itself theirs having during before should
      himself herself because against between through further yourself
      ourselves yourselves themselves
    ].map { |x| [x, true] }]
  }
  config.update(options)

  args = [:to_s] if args.empty?
  config[:ngram_fields] = args
  # Turn the alphabet / separator strings into hashes for O(1) membership tests.
  config[:alphabet] = Hash[config[:alphabet].split('').map { |ch| [ch, ch] }]
  config[:word_separators] = Hash[config[:word_separators].split('').map { |ch| [ch, ch] }]
  self.mongoid_fulltext_config[index_name] = config

  before_save(:update_ngram_index) if config[:reindex_immediately]
  before_destroy :remove_from_ngram_index
end
#instantiate_mapreduce_result(result) ⇒ Object
176 177 178 |
# File 'lib/mongoid_fulltext.rb', line 176 def instantiate_mapreduce_result(result) result[:clazz].constantize.find(result[:id]) end |
#instantiate_mapreduce_results(results, options) ⇒ Object
180 181 182 183 184 185 186 |
# File 'lib/mongoid_fulltext.rb', line 180 def instantiate_mapreduce_results(results, ) if ([:return_scores]) results.map { |result| [ instantiate_mapreduce_result(result), result[:score] ] }.find_all { |result| ! result[0].nil? } else results.map { |result| instantiate_mapreduce_result(result) }.compact end end |
#remove_from_ngram_index ⇒ Object
264 265 266 267 268 269 |
# File 'lib/mongoid_fulltext.rb', line 264 def remove_from_ngram_index self.mongoid_fulltext_config.each_pair do |index_name, fulltext_config| coll = collection.database[index_name] coll.find({'class' => self.name}).remove_all end end |
#update_ngram_index ⇒ Object
271 272 273 274 275 |
# File 'lib/mongoid_fulltext.rb', line 271 def update_ngram_index self.all.each do |model| model.update_ngram_index end end |