Class: BxBuilderChain::Vectorsearch::Pgvector

Inherits:
Base
  • Object
show all
Defined in:
lib/bx_builder_chain/vectorsearch/pgvector.rb

Constant Summary collapse

OPERATORS =

The operators supported by the PostgreSQL vector search adapter

{
  "cosine_distance" => "cosine",
  "euclidean_distance" => "euclidean"
}
DEFAULT_OPERATOR =
"cosine_distance"

Constants inherited from Base

Base::DEFAULT_METRIC

Instance Attribute Summary collapse

Attributes inherited from Base

#client, #llm

Instance Method Summary collapse

Methods inherited from Base

#generate_prompt, #get_default_schema, logger_options

Methods included from DependencyHelper

#depends_on

Constructor Details

#initialize(llm:, namespaces: [BxBuilderChain.configuration.public_namespace] || ['public']) ⇒ Pgvector

Returns a new instance of Pgvector.

Parameters:

  • llm (Object)

    The LLM client to use

  • namespaces (Array<String>) (defaults to: [BxBuilderChain.configuration.public_namespace] || ['public'])

    The namespaces to use for the index when inserting/querying



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 25

# Sets up the pgvector-backed vector store: connects via Sequel, fixes the
# embeddings table/namespace-column names, validates the configured distance
# threshold, and selects the default distance operator.
#
# @param llm [Object] the LLM client used to generate embeddings
# @param namespaces [Array<String>] namespaces used when inserting/querying;
#   defaults to the configured public namespace, falling back to 'public'
def initialize(llm:, namespaces: [BxBuilderChain.configuration.public_namespace || 'public'])
  depends_on "sequel"
  require "sequel"

  @db = create_sequel_connection
  @table_name = "bx_builder_chain_embeddings"
  @namespace_column = "namespace"
  set_namespaces(namespaces)
  @threshold = BxBuilderChain.configuration.threshold

  validate_threshold(@threshold)

  # Map the logical metric name to the pgvector plugin's distance keyword.
  @operator = OPERATORS[DEFAULT_OPERATOR]

  super(llm: llm)
end

Instance Attribute Details

#dbObject (readonly)

Returns the value of attribute db.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def db
  # Read-only accessor for the Sequel database connection handle.
  @db
end

#documents_tableObject (readonly)

Returns the value of attribute documents_table.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def documents_table
  # NOTE(review): @documents_table is never assigned anywhere in the visible
  # code, so this accessor may always return nil — verify against the full file.
  @documents_table
end

#namespace_columnObject (readonly)

Returns the value of attribute namespace_column.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def namespace_column
  # Read-only accessor for the column name used to scope rows by namespace
  # (set to "namespace" in the constructor).
  @namespace_column
end

#namespacesObject (readonly)

Returns the value of attribute namespaces.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def namespaces
  # Read-only accessor for the namespaces queried/written by this instance;
  # inserts use namespaces[0], queries match any of them.
  @namespaces
end

#operatorObject (readonly)

Returns the value of attribute operator.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def operator
  # Read-only accessor for the distance operator passed to nearest_neighbors
  # (e.g. "cosine", per the OPERATORS constant).
  @operator
end

#table_nameObject (readonly)

Returns the value of attribute table_name.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def table_name
  # Read-only accessor for the embeddings table name
  # ("bx_builder_chain_embeddings").
  @table_name
end

Instance Method Details

#add_data(paths:) ⇒ Object

Raises:

  • (ArgumentError)


193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 193

# Load one or more files, chunk and embed their contents, and record the
# resulting document and document-chunk rows inside a single transaction.
#
# @param paths [String, Array<String>] path(s) of the files to ingest
# @return [Array<Integer>] ids of all embedding rows that were added
# @raise [ArgumentError] when no paths are given
def add_data(paths:)
  # Normalize once so a single path works too (the original guard already
  # accepted scalars via Array(paths) but then iterated the raw argument).
  paths = Array(paths)
  raise ArgumentError, "Paths must be provided" if paths.empty?

  all_added_chunk_ids = []

  # All inserts for every path succeed or roll back together.
  @db.transaction do
    paths.each do |file_n_path|
      path, file = extract_path_and_file(file_n_path)

      # Guard the whole chain: the original `&.load&.chunks.map` raised
      # NoMethodError on nil when the loader produced no chunks.
      chunks = BxBuilderChain::Loader.new(path)&.load&.chunks || []
      texts = chunks.map { |chunk| chunk[:text] }.flatten

      added_chunk_ids_for_current_path = add_texts(texts: texts)

      all_added_chunk_ids.concat(added_chunk_ids_for_current_path)

      document_record_id = @db[:bx_builder_chain_documents].insert(
                                                              name: file, 
                                                              namespace: namespaces[0],
                                                              created_at: Time.now.utc,
                                                              updated_at: Time.now.utc
                                                            )

      # Join rows linking the document record to each embedding chunk.
      document_chunks_data = added_chunk_ids_for_current_path.map do |chunk_id|
        {document_id: document_record_id, embedding_id: chunk_id}
      end
      @db[:bx_builder_chain_document_chunks].multi_insert(document_chunks_data)
    end
  end

  all_added_chunk_ids
end

#add_texts(texts:, ids: nil) ⇒ Array<Integer>

Add a list of texts to the index

Parameters:

  • texts (Array<String>)

    The texts to add to the index

  • ids (Array<String>) (defaults to: nil)

    The ids to add to the index, in the same order as the texts

Returns:

  • (Array<Integer>)

    The ids of the added texts.



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 70

# Embed and insert a list of texts. With no ids, texts are embedded
# concurrently (Async tasks, 10 per batch) and bulk-inserted; with ids,
# delegates to upsert_texts.
#
# @param texts [Array<String>] the texts to add to the index
# @param ids [Array<String>, nil] optional ids matching texts by position
# @return [Array<Integer>] primary keys of the inserted rows
def add_texts(texts:, ids: nil)
  if ids.nil? || ids.empty?
    # Mutex guards `data`, which is appended to from concurrent async tasks.
    mutex = Mutex.new
    texts.each_slice(10).flat_map do |text_batch|  # Process in batches of 10
      data = []
      Async do |parent|
        text_batch.map do |text|
          parent.async do |task|
            begin
              # namespaces[0] scopes every inserted row to the primary namespace.
              vectorised_text = {content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
              mutex.synchronize do
                data << vectorised_text
              end
            rescue => e
              # NOTE(review): embedding failures are swallowed here — the failed
              # text is silently dropped from the batch and only logged to stdout.
              puts "Error processing text: #{e.message}"
              nil  # or some error indication
            end
          end
        end # Ensure all tasks in the batch are completed
      end.wait
      # Row order within a batch depends on task completion order, not input order.
      @db[@table_name.to_sym].multi_insert(data, return: :primary_key)
    end        
  else
    upsert_texts(texts: texts, ids: ids)
  end
end

#ask(question:, context_results: 4, prompt_template: nil) ⇒ String

Ask a question and return the answer

Parameters:

  • question (String)

    The question to ask

Returns:

  • (String)

    The answer to the question



180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 180

# Ask a question and return the answer, using the k most similar stored
# chunks (joined with "---" separators) as context for the LLM.
#
# @param question [String] the question to ask
# @param context_results [Integer] number of similar chunks to use as context
# @param prompt_template [String, nil] optional custom prompt template
# @return [String] the answer to the question
def ask(question:, context_results: 4, prompt_template: nil)
  search_results = similarity_search(query: question, k: context_results)

  context = search_results.map do |result|
    result.content.to_s
  end
  context = context.join("\n---\n")

  # Bug fix: previously passed `prompt_template: nil`, silently discarding
  # the caller-supplied template.
  prompt = generate_prompt(question: question, context: context, prompt_template: prompt_template)

  llm.chat(prompt: prompt)
end

#create_default_schemaObject



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 105

# Ensures the pgvector extension exists and creates the three tables this
# adapter relies on (embeddings, documents, and their join table) when absent.
def create_default_schema
  db.run "CREATE EXTENSION IF NOT EXISTS vector"

  # Capture into locals so the create_table blocks can close over them.
  ns_col = @namespace_column.to_sym
  dimension = llm.default_dimension || 1000

  # Embedding rows: chunk text plus its vector, scoped by namespace.
  db.create_table? :bx_builder_chain_embeddings do
    primary_key :id
    text :content
    column :vectors, "vector(#{dimension})"
    text ns_col, default: 'public'

    index ns_col
  end

  # Document rows: one per ingested file, unique per (name, namespace).
  db.create_table? :bx_builder_chain_documents do
    primary_key :id
    text :name
    text ns_col, default: 'public'
    timestamp :created_at
    timestamp :updated_at

    index [:name, ns_col], unique: true
  end

  # Join table tying each document to its embedding chunks.
  db.create_table? :bx_builder_chain_document_chunks do
    primary_key :id
    foreign_key :document_id, :bx_builder_chain_documents, null: false, on_delete: :cascade
    foreign_key :embedding_id, :bx_builder_chain_embeddings, null: false, on_delete: :cascade

    unique [:document_id, :embedding_id]
  end
end

#destroy_default_schemaObject

Destroy default schema



144
145
146
147
148
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 144

# Drops the adapter's tables if they exist. Order matters: the join table
# goes first, then documents, then embeddings (foreign-key dependencies).
def destroy_default_schema
  %i[
    bx_builder_chain_document_chunks
    bx_builder_chain_documents
    bx_builder_chain_embeddings
  ].each { |table| db.drop_table?(table) }
end

#documents_modelObject



42
43
44
45
46
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 42

# Builds an anonymous Sequel::Model bound to the embeddings table, with the
# pgvector plugin enabled on the :vectors column for nearest-neighbor queries.
def documents_model
  base_model = Sequel::Model(@table_name.to_sym)
  Class.new(base_model) do
    plugin :pgvector, :vectors
  end
end

#similarity_search(query:, k: 4) ⇒ Array<Hash>

Search for similar texts in the index

Parameters:

  • query (String)

    The text to search for

  • k (Integer) (defaults to: 4)

    The number of top results to return

Returns:

  • (Array<Hash>)

    The results of the search



154
155
156
157
158
159
160
161
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 154

# Search the index for texts similar to the query.
#
# @param query [String] the text to search for
# @param k [Integer] the number of top results to return
# @return [Array<Hash>] the results of the search
def similarity_search(query:, k: 4)
  query_vector = llm.embed(text: query)
  similarity_search_by_vector(embedding: query_vector, k: k)
end

#similarity_search_by_vector(embedding:, k: 4) ⇒ Array<Hash>

Search for similar texts in the index by the passed in vector. You must generate your own vector using the same LLM that generated the embeddings stored in the Vectorsearch DB.

Parameters:

  • embedding (Array<Float>)

    The vector to search for

  • k (Integer) (defaults to: 4)

    The number of top results to return

Returns:

  • (Array<Hash>)

    The results of the search



168
169
170
171
172
173
174
175
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 168

# Nearest-neighbor search over the embeddings table by a precomputed vector,
# filtered to this instance's namespaces and capped at k rows.
#
# @param embedding [Array<Float>] the vector to search for (must come from
#   the same LLM that produced the stored embeddings)
# @param k [Integer] the number of top results to return
# @return [Array<Hash>] the results of the search
def similarity_search_by_vector(embedding:, k: 4)
  db.transaction do # BEGIN
    # NOTE(review): Sequel datasets are lazy — this builds a query that the
    # caller enumerates, likely after the transaction block returns; confirm
    # the transaction wrapper is intentional.
    documents_model
      .nearest_neighbors(:vectors, embedding, distance: operator, threshold: @threshold)
      .where(@namespace_column.to_sym => namespaces)
      .limit(k)
  end
end

#update_texts(texts:, ids:) ⇒ Array<Integer>

Update a list of ids and corresponding texts to the index

Parameters:

  • texts (Array<String>)

    The texts to add to the index

  • ids (Array<String>)

    The ids to add to the index, in the same order as the texts

Returns:

  • (Array<Integer>)

    The ids of the updated texts.



101
102
103
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 101

# Update a list of texts already in the index, matching each id to its text.
# Thin wrapper that delegates straight to upsert_texts.
#
# @param texts [Array<String>] the texts to write
# @param ids [Array<String>] ids paired with texts, in the same order
# @return [Array<Integer>] the ids of the updated texts
def update_texts(texts:, ids:)
  upsert_texts(ids: ids, texts: texts)
end

#upsert_texts(texts:, ids:) ⇒ PG::Result

Upsert a list of texts to the index; existing rows are updated and new rows are added.

Parameters:

  • texts (Array<String>)

    The texts to add to the index

  • ids (Array<Integer>)

    The ids of the objects to add to the index, in the same order as the texts

Returns:

  • (PG::Result)

    The response from the database, including the ids of the added or updated texts.



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 53

# Upsert a list of texts into the index: inserts new rows, and on id
# conflict updates the existing row's content and vectors in place.
#
# @param texts [Array<String>] the texts to write to the index
# @param ids [Array<Integer>] ids for the rows, in the same order as texts
# @return [PG::Result] database response including the affected row ids
def upsert_texts(texts:, ids:)
  # One row per (text, id) pair; `map` (not `flat_map`) — the block yields a
  # single Hash per pair, so there is nothing to flatten.
  data = texts.zip(ids).map do |text, id|
    {id: id, content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
  end
  @db[@table_name.to_sym]
    .insert_conflict(
      target: :id,
      update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
    )
    .multi_insert(data, return: :primary_key)
end