Class: Hobix::Search::Simple::Searcher
- Inherits:
-
Object
- Object
- Hobix::Search::Simple::Searcher
- Defined in:
- lib/hobix/search/simple.rb
Class Method Summary collapse
-
.load(cache_file, wash = false) ⇒ Object
Serialization support.
Instance Method Summary collapse
-
#catalog(entry) ⇒ Object
Create a new dictionary and document vectors from a blog archive.
- #classifications(text) ⇒ Object
- #classify(text) ⇒ Object
- #dump ⇒ Object
- #extract_words_for_searcher(text) ⇒ Object
-
#find_words(words) ⇒ Object
Return SearchResults based on trying to find the array of words in our document vectors.
- #has_entry?(id, mtime) ⇒ Boolean
-
#initialize(dict, document_vectors, cache_file) ⇒ Searcher
constructor
A new instance of Searcher.
Constructor Details
#initialize(dict, document_vectors, cache_file) ⇒ Searcher
Returns a new instance of Searcher.
63 64 65 66 67 |
# File 'lib/hobix/search/simple.rb', line 63
#
# Remember the three collaborators a Searcher works with.
#
# dict::             stored in @dict
# document_vectors:: stored in @document_vectors
# cache_file::       stored in @cache_file
def initialize(dict, document_vectors, cache_file)
  @dict, @document_vectors = dict, document_vectors
  @cache_file = cache_file
end
Class Method Details
.load(cache_file, wash = false) ⇒ Object
Serialization support. At some point we’ll need to do incremental indexing. For now, however, the following seems to work fairly effectively on 1000 entry blogs, so I’ll defer the change until later.
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# File 'lib/hobix/search/simple.rb', line 128
#
# Serialization support. Build a Searcher from the index marshalled in
# +cache_file+.
#
# cache_file:: path of the marshalled index
# wash::       when true the cache is not read at all and an empty index
#              is created instead
#
# If the cache was not loaded (wash requested, or the file is missing,
# unreadable or corrupt) a fresh Dictionary and an empty vector table are
# used and immediately written back with #dump.
#
# Returns the new Searcher.
#
# NOTE: Marshal.load can instantiate arbitrary objects; +cache_file+ must
# only ever point at a file this application wrote itself.
def Searcher.load(cache_file, wash=false)
  dict = document_vectors = nil
  loaded = false

  unless wash
    begin
      # Marshal data is binary, so read it in binary mode to avoid any
      # newline or encoding translation.
      File.open(cache_file, "rb") do |f|
        dict = Marshal.load(f)
        document_vectors = Marshal.load(f)
        loaded = true
      end
    rescue StandardError
      # Deliberate best effort: any failure to read the cache simply
      # means the index is rebuilt from scratch below.
      loaded = false
    end
  end

  unless loaded
    dict = Dictionary.new
    document_vectors = {}
  end

  s = Searcher.new(dict, document_vectors, cache_file)
  s.dump unless loaded
  s
end
Instance Method Details
#catalog(entry) ⇒ Object
Create a new dictionary and document vectors from a blog archive
176 177 178 179 180 181 182 183 184 185 186 187 188 |
# File 'lib/hobix/search/simple.rb', line 176
#
# Add one entry to the index unless #has_entry? reports that it is
# already covered for the entry's identifier and mtime.
#
# A new Vector is stamped with the entry's mtime, every word extracted
# from the downcased content is registered in @dict together with the
# entry's classifications, and each word index the dictionary hands back
# is recorded in the vector. The vector is then stored in
# @document_vectors under the entry's identifier.
def catalog(entry)
  return if has_entry?(entry.identifier, entry.mtime)

  doc_vector = Vector.new
  doc_vector.at = entry.mtime

  extract_words_for_searcher(entry.content.downcase) do |token|
    position = @dict.add_word(token, entry.classifications)
    doc_vector.add_word_index(position) if position
  end

  @document_vectors[entry.identifier] = doc_vector
end
#classifications(text) ⇒ Object
190 191 192 193 194 195 196 197 198 199 200 201 |
# File 'lib/hobix/search/simple.rb', line 190
#
# Score +text+ against every category in @dict.clsf.
#
# For each category the score is the sum, over every word extracted from
# +text+, of log(count / total), where count is the category's value for
# that word (0.1 when the category does not have the word) and total is
# the sum of all the category's values.
#
# Returns a Hash mapping each category to its score. A category keeps the
# Integer 0 when no words are extracted from +text+.
def classifications(text)
  score = {}
  words = nil

  @dict.clsf.each do |category, category_words|
    # Extract the words only once (they do not depend on the category),
    # and only when there is at least one category to score.
    words ||= begin
      found = []
      extract_words_for_searcher(text) { |word| found << word }
      found
    end

    total = category_words.values.inject(0) { |sum, element| sum + element }.to_f

    score[category] = words.inject(0) do |acc, word|
      s = category_words.has_key?(word) ? category_words[word] : 0.1
      acc + Math.log(s / total)
    end
  end

  score
end
#classify(text) ⇒ Object
203 204 205 |
# File 'lib/hobix/search/simple.rb', line 203
#
# Return the category with the highest score from #classifications for
# +text+, or nil when there are no categories to choose from.
def classify(text)
  ranked = classifications(text).sort_by { |_category, score| -score }
  best = ranked.first
  # An empty ranking means nothing was scored; report that as nil rather
  # than failing on nil[0].
  best && best.first
end
#dump ⇒ Object
155 156 157 158 159 160 |
# File 'lib/hobix/search/simple.rb', line 155
#
# Write the index to @cache_file: @dict first, then @document_vectors,
# each as its own Marshal record.
def dump
  # Marshal output is binary, so write in binary mode to avoid any
  # newline or encoding translation.
  File.open(@cache_file, "wb") do |io|
    Marshal.dump(@dict, io)
    Marshal.dump(@document_vectors, io)
  end
end
#extract_words_for_searcher(text) ⇒ Object
162 163 164 165 166 |
# File 'lib/hobix/search/simple.rb', line 162
#
# Yield every searchable word found in +text+.
#
# A word is an optional leading '+' or '-', one word character, and then
# at least two more characters drawn from word characters, '-', ':' and
# '\', so the shortest word (sign aside) is three characters long.
#
# With a block, behaves as String#scan with a block does. Without a block
# an Enumerator over the same words is returned.
def extract_words_for_searcher(text)
  return enum_for(:extract_words_for_searcher, text) unless block_given?

  text.scan(/[-+]?\w[\-\w:\\]{2,}/) { |word| yield word }
end
#find_words(words) ⇒ Object
Return SearchResults based on trying to find the array of words in our document vectors.
A word beginning with ‘+’ must appear in the target documents. A word beginning with ‘-’ must not appear. Other words are scored; the documents with the highest scores are returned first.
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/hobix/search/simple.rb', line 77
#
# Return SearchResults for the array of +words+.
#
# The words are joined with spaces and re-split by
# #extract_words_for_searcher. A word beginning with '+' goes into the
# must-match vector, a word beginning with '-' into the must-not-match
# vector, and any other word into the general vector. Each word is looked
# up downcased in @dict; a word the dictionary does not know adds a
# warning and suppresses the search altogether.
#
# When there is at least one general or must-match term and every word
# was found, each document vector is scored with
# score_against(must_match, must_not_match, general); documents with a
# positive score are added to the results, highest score first.
def find_words(words)
  search_results = SearchResults.new

  general = Vector.new
  must_match = Vector.new
  must_not_match = Vector.new
  not_found = false

  extract_words_for_searcher(words.join(' ')) do |word|
    case word[0]
    when ?+
      # Strip only the sign; keep the whole word however long it is.
      word = word[1..-1]
      vector = must_match
    when ?-
      word = word[1..-1]
      vector = must_not_match
    else
      vector = general
    end

    index = @dict.find(word.downcase)
    if index
      vector.add_word_index(index)
    else
      not_found = true
      search_results.add_warning "'#{word}' does not occur in the documents"
    end
  end

  if (general.num_bits + must_match.num_bits).zero?
    search_results.add_warning "No valid search terms given"
  elsif not not_found
    res = []
    # The value is destructured so that only its first element is scored
    # if it happens to be array-like; otherwise the value itself is used.
    @document_vectors.each do |entry, (dvec, _mtime)|
      score = dvec.score_against(must_match, must_not_match, general)
      res << [ entry, score ] if score > 0
    end

    res.sort { |a, b| b[1] <=> a[1] }.each do |name, score|
      search_results.add_result(name, score)
    end

    search_results.add_warning "No matches" unless search_results.contains_matches
  end
  search_results
end
#has_entry?(id, mtime) ⇒ Boolean
168 169 170 171 |
# File 'lib/hobix/search/simple.rb', line 168
#
# Tell whether the index already holds a vector for +id+ that is at least
# as recent as +mtime+ (both timestamps are compared via to_i).
#
# Returns true or false.
def has_entry?(id, mtime)
  dvec = @document_vectors[id]
  return false unless dvec

  dvec.at.to_i >= mtime.to_i
end