Class: MiniSearch::InvertedIndex
- Inherits:
-
Object
- Object
- MiniSearch::InvertedIndex
- Defined in:
- lib/mini_search/inverted_index.rb
Overview
Very simple and naive in-memory search engine implements an inverted index
Instance Attribute Summary collapse
-
#document_length_average ⇒ Object
Returns the value of attribute document_length_average.
-
#documents ⇒ Object
Returns the value of attribute documents.
-
#indexing_pipeline ⇒ Object
Returns the value of attribute indexing_pipeline.
-
#inverted_index ⇒ Object
Returns the value of attribute inverted_index.
-
#querying_pipeline ⇒ Object
Returns the value of attribute querying_pipeline.
Instance Method Summary collapse
-
#index(document) ⇒ Object
Index a document, documents are simply Hashs with at least.
-
#initialize(indexing_pipeline, querying_pipeline) ⇒ InvertedIndex
constructor
A new instance of InvertedIndex.
-
#remove(id) ⇒ Object
Removes a document by id from index and documents list.
- #search(raw_terms, operator: 'or') ⇒ Object
- #size ⇒ Object
- #stats ⇒ Object
Constructor Details
#initialize(indexing_pipeline, querying_pipeline) ⇒ InvertedIndex
Returns a new instance of InvertedIndex.
9 10 11 12 13 14 15 |
# File 'lib/mini_search/inverted_index.rb', line 9 def initialize(indexing_pipeline, ) @indexing_pipeline = indexing_pipeline @querying_pipeline = @documents = {} @inverted_index = {} @document_length_average = 0.0 end |
Instance Attribute Details
#document_length_average ⇒ Object
Returns the value of attribute document_length_average.
7 8 9 |
# File 'lib/mini_search/inverted_index.rb', line 7 def document_length_average @document_length_average end |
#documents ⇒ Object
Returns the value of attribute documents.
7 8 9 |
# File 'lib/mini_search/inverted_index.rb', line 7 def documents @documents end |
#indexing_pipeline ⇒ Object
Returns the value of attribute indexing_pipeline.
7 8 9 |
# File 'lib/mini_search/inverted_index.rb', line 7 def indexing_pipeline @indexing_pipeline end |
#inverted_index ⇒ Object
Returns the value of attribute inverted_index.
7 8 9 |
# File 'lib/mini_search/inverted_index.rb', line 7 def inverted_index @inverted_index end |
#querying_pipeline ⇒ Object
Returns the value of attribute querying_pipeline.
7 8 9 |
# File 'lib/mini_search/inverted_index.rb', line 7 def @querying_pipeline end |
Instance Method Details
#index(document) ⇒ Object
Index a document, documents are simply Hashs with at least
id: 'unique_id',
indexed_field: 'text field',
...
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
# File 'lib/mini_search/inverted_index.rb', line 25 def index(document) remove(document.fetch(:id)) if @documents[document.fetch(:id)] terms = @indexing_pipeline.execute(document.fetch(:indexed_field)) @documents[document.fetch(:id)] = { document: document, number_of_terms: terms.size.to_f } @inverted_index = terms.uniq.each_with_object(@inverted_index) do |term, index| index[term] ||= [] index[term] << [ document, { term: term, value: Tf.calculate(term, terms) } ] end calculate_document_length_average document end |
#remove(id) ⇒ Object
Removes a document by id from index and documents list
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# File 'lib/mini_search/inverted_index.rb', line 49 def remove(id) document = @documents.dig(id, :document) terms = @indexing_pipeline.execute(document.fetch(:indexed_field)) terms.uniq.each do |term| @inverted_index[term] = @inverted_index[term].reject do |document, _tf| document.fetch(:id) == id end @inverted_index.delete(term) if @inverted_index[term].size == 0 end removed_document = @documents.delete(id) calculate_document_length_average removed_document end |
#search(raw_terms, operator: 'or') ⇒ Object
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
# File 'lib/mini_search/inverted_index.rb', line 69 def search(raw_terms, operator: 'or') processed_terms = @querying_pipeline.execute(raw_terms) # gets the documents that matches each term results_by_terms = processed_terms.map do |term| @inverted_index[term] || [] end return [] unless results_by_terms.any? idfs = generate_idfs(processed_terms) # We flat and group by document id any_term_matched_documents = results_by_terms.flatten(1).group_by do |document, _tf| document.fetch(:id) end # We select documents based on operator # if operator AND # we select only documents that matched all terms # else # we select everthing operator_specific_matched_documents = any_term_matched_documents.select do |_document_id, document_and_tfs| match_terms_according_operator?(document_and_tfs, processed_terms, operator) end # map to a { document:, score: } structure. document_and_scores = operator_specific_matched_documents.map do |document_id, document_and_tfs| { document: @documents.dig(document_id, :document), score: calculate_score(@documents.fetch(document_id), document_and_tfs, idfs) } end # sort by scores and wraps in a more convenient structure. documents = document_and_scores .sort_by { |item| -item[:score] } { documents: documents, idfs: idfs, processed_terms: processed_terms } end |
#size ⇒ Object
112 113 114 |
# File 'lib/mini_search/inverted_index.rb', line 112 def size @documents.size end |
#stats ⇒ Object
116 117 118 119 120 121 122 123 124 |
# File 'lib/mini_search/inverted_index.rb', line 116 def stats { documents: @documents.size, inverted_index: { size: @inverted_index.size, terms: @inverted_index.keys } } end |