Class: MiniSearch::InvertedIndex

Inherits:
Object
  • Object
show all
Defined in:
lib/mini_search/inverted_index.rb

Overview

Very simple and naive in-memory search engine implements an inverted index

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(indexing_pipeline, querying_pipeline) ⇒ InvertedIndex

Returns a new instance of InvertedIndex.



9
10
11
12
13
14
15
# File 'lib/mini_search/inverted_index.rb', line 9

def initialize(indexing_pipeline, querying_pipeline)
  @indexing_pipeline = indexing_pipeline
  @querying_pipeline = querying_pipeline
  @documents = {}
  @inverted_index = {}
  @document_length_average = 0.0
end

Instance Attribute Details

#document_length_averageObject

Returns the value of attribute document_length_average.



7
8
9
# File 'lib/mini_search/inverted_index.rb', line 7

def document_length_average
  @document_length_average
end

#documentsObject

Returns the value of attribute documents.



7
8
9
# File 'lib/mini_search/inverted_index.rb', line 7

def documents
  @documents
end

#indexing_pipelineObject

Returns the value of attribute indexing_pipeline.



7
8
9
# File 'lib/mini_search/inverted_index.rb', line 7

def indexing_pipeline
  @indexing_pipeline
end

#inverted_indexObject

Returns the value of attribute inverted_index.



7
8
9
# File 'lib/mini_search/inverted_index.rb', line 7

def inverted_index
  @inverted_index
end

#querying_pipelineObject

Returns the value of attribute querying_pipeline.



7
8
9
# File 'lib/mini_search/inverted_index.rb', line 7

def querying_pipeline
  @querying_pipeline
end

Instance Method Details

#index(document) ⇒ Object

Index a document, documents are simply Hashs with at least

id: 'unique_id',
indexed_field: 'text field',
...



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/mini_search/inverted_index.rb', line 25

def index(document)
  remove(document.fetch(:id)) if @documents[document.fetch(:id)]

  terms = @indexing_pipeline.execute(document.fetch(:indexed_field))

  @documents[document.fetch(:id)] = {
    document: document,
    number_of_terms: terms.size.to_f
  }

  @inverted_index = terms.uniq.each_with_object(@inverted_index) do |term, index|
    index[term] ||= []
    index[term] << [
      document,
      { term: term, value: Tf.calculate(term, terms) }
    ]
  end

  calculate_document_length_average

  document
end

#remove(id) ⇒ Object

Removes a document by id from index and documents list



49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# File 'lib/mini_search/inverted_index.rb', line 49

def remove(id)
  document = @documents.dig(id, :document)

  terms = @indexing_pipeline.execute(document.fetch(:indexed_field))

  terms.uniq.each do |term|
    @inverted_index[term] = @inverted_index[term].reject do |document, _tf|
      document.fetch(:id) == id
    end

    @inverted_index.delete(term) if @inverted_index[term].size == 0
  end

  removed_document = @documents.delete(id)

  calculate_document_length_average

  removed_document
end

#search(raw_terms, operator: 'or') ⇒ Object



69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/mini_search/inverted_index.rb', line 69

def search(raw_terms, operator: 'or')
  processed_terms = @querying_pipeline.execute(raw_terms)

  # gets the documents that matches each term
  results_by_terms = processed_terms.map do |term|
    @inverted_index[term] || []
  end

  return [] unless results_by_terms.any?

  idfs = generate_idfs(processed_terms)

  # We flat and group by document id
  any_term_matched_documents = results_by_terms.flatten(1).group_by do |document, _tf|
    document.fetch(:id)
  end

  # We select documents based on operator
  # if operator AND
  #   we select only documents that matched all terms
  # else
  #   we select everthing
  operator_specific_matched_documents = any_term_matched_documents.select do |_document_id, document_and_tfs|
    match_terms_according_operator?(document_and_tfs,
                                    processed_terms,
                                    operator)
  end

  # map to a { document:, score: } structure.
  document_and_scores = operator_specific_matched_documents.map do |document_id, document_and_tfs|
    {
      document: @documents.dig(document_id, :document),
      score: calculate_score(@documents.fetch(document_id), document_and_tfs, idfs)
    }
  end

  # sort by scores and wraps in a more convenient structure.
  documents = document_and_scores
    .sort_by { |item| -item[:score] }

  { documents: documents, idfs: idfs, processed_terms: processed_terms }
end

#sizeObject



112
113
114
# File 'lib/mini_search/inverted_index.rb', line 112

def size
  @documents.size
end

#statsObject



116
117
118
119
120
121
122
123
124
# File 'lib/mini_search/inverted_index.rb', line 116

def stats
  {
    documents: @documents.size,
    inverted_index: {
      size: @inverted_index.size,
      terms: @inverted_index.keys
    }
  }
end