Class: BxBuilderChain::Vectorsearch::Pgvector

Inherits:
Base
  • Object
show all
Defined in:
lib/bx_builder_chain/vectorsearch/pgvector.rb

Constant Summary collapse

OPERATORS =

The operators supported by the PostgreSQL vector search adapter

{
  "cosine_distance" => "cosine",
  "euclidean_distance" => "euclidean"
}
DEFAULT_OPERATOR =
"cosine_distance"

Constants inherited from Base

Base::DEFAULT_METRIC

Instance Attribute Summary collapse

Attributes inherited from Base

#client, #llm

Instance Method Summary collapse

Methods inherited from Base

#generate_prompt, #get_default_schema, logger_options

Methods included from DependencyHelper

#depends_on

Constructor Details

#initialize(llm:, namespaces: [BxBuilderChain.configuration.public_namespace] || ['public']) ⇒ Pgvector

Returns a new instance of Pgvector.

Parameters:

  • llm (Object)

    The LLM client to use

  • namespaces (Array<String>) (defaults to: [BxBuilderChain.configuration.public_namespace] || ['public'])

    The namespaces to use for the index when inserting/querying



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 25

# Sets up the pgvector-backed vector store: connects via Sequel, fixes the
# embeddings table/namespace-column names, validates the configured distance
# threshold, and selects the default distance operator.
#
# @param llm [Object] the LLM client used to generate embeddings
# @param namespaces [Array<String>] namespaces used when inserting/querying;
#   defaults to the configured public namespace, falling back to 'public'
def initialize(llm:, namespaces: [BxBuilderChain.configuration.public_namespace || 'public'])
  depends_on "sequel"
  require "sequel"

  @db = create_sequel_connection
  @table_name = "bx_builder_chain_embeddings"
  @namespace_column = "namespace"
  set_namespaces(namespaces)
  @threshold = BxBuilderChain.configuration.threshold

  validate_threshold(@threshold)

  # Map the logical metric name to the pgvector plugin's distance keyword.
  @operator = OPERATORS[DEFAULT_OPERATOR]

  super(llm: llm)
end

Instance Attribute Details

#dbObject (readonly)

Returns the value of attribute db.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def db
  # Read-only accessor for the Sequel database connection handle.
  @db
end

#documents_tableObject (readonly)

Returns the value of attribute documents_table.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def documents_table
  # NOTE(review): @documents_table is never assigned anywhere in the visible
  # code, so this accessor may always return nil — verify against the full file.
  @documents_table
end

#namespace_columnObject (readonly)

Returns the value of attribute namespace_column.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def namespace_column
  # Read-only accessor for the column name used to scope rows by namespace
  # (set to "namespace" in the constructor).
  @namespace_column
end

#namespacesObject (readonly)

Returns the value of attribute namespaces.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def namespaces
  # Read-only accessor for the namespaces queried/written by this instance;
  # inserts use namespaces[0], queries match any of them.
  @namespaces
end

#operatorObject (readonly)

Returns the value of attribute operator.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def operator
  # Read-only accessor for the distance operator passed to nearest_neighbors
  # (e.g. "cosine", per the OPERATORS constant).
  @operator
end

#table_nameObject (readonly)

Returns the value of attribute table_name.



19
20
21
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 19

def table_name
  # Read-only accessor for the embeddings table name
  # ("bx_builder_chain_embeddings").
  @table_name
end

Instance Method Details

#add_data(paths:) ⇒ Object

Raises:

  • (ArgumentError)


193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 193

# Load one or more files, chunk and embed their contents, and record the
# resulting document and document-chunk rows inside a single transaction.
#
# @param paths [String, Array<String>] path(s) of the files to ingest
# @return [Array<Integer>] ids of all embedding rows that were added
# @raise [ArgumentError] when no paths are given
def add_data(paths:)
  # Normalize once so a single path works too (the original guard already
  # accepted scalars via Array(paths) but then iterated the raw argument).
  paths = Array(paths)
  raise ArgumentError, "Paths must be provided" if paths.empty?

  all_added_chunk_ids = []

  # All inserts for every path succeed or roll back together.
  @db.transaction do
    paths.each do |file_n_path|
      path, file = extract_path_and_file(file_n_path)

      # Guard the whole chain: the original `&.load&.chunks.map` raised
      # NoMethodError on nil when the loader produced no chunks.
      chunks = BxBuilderChain::Loader.new(path)&.load&.chunks || []
      texts = chunks.map { |chunk| chunk[:text] }.flatten

      added_chunk_ids_for_current_path = add_texts(texts: texts)

      all_added_chunk_ids.concat(added_chunk_ids_for_current_path)

      document_record_id = @db[:bx_builder_chain_documents].insert(
                                                              name: file, 
                                                              namespace: namespaces[0],
                                                              created_at: Time.now.utc,
                                                              updated_at: Time.now.utc
                                                            )

      # Join rows linking the document record to each embedding chunk.
      document_chunks_data = added_chunk_ids_for_current_path.map do |chunk_id|
        {document_id: document_record_id, embedding_id: chunk_id}
      end
      @db[:bx_builder_chain_document_chunks].multi_insert(document_chunks_data)
    end
  end

  all_added_chunk_ids
end

#add_texts(texts:, ids: nil) ⇒ Array<Integer>

Add a list of texts to the index

Parameters:

  • texts (Array<String>)

    The texts to add to the index

  • ids (Array<String>) (defaults to: nil)

    The ids to add to the index, in the same order as the texts

Returns:

  • (Array<Integer>)

    The ids of the added texts.



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 70

# Embed and insert a list of texts. With no ids, texts are embedded
# concurrently (Async tasks, 10 per batch) and bulk-inserted; with ids,
# delegates to upsert_texts.
#
# @param texts [Array<String>] the texts to add to the index
# @param ids [Array<String>, nil] optional ids matching texts by position
# @return [Array<Integer>] primary keys of the inserted rows
def add_texts(texts:, ids: nil)
  if ids.nil? || ids.empty?
    # Mutex guards `data`, which is appended to from concurrent async tasks.
    mutex = Mutex.new
    texts.each_slice(10).flat_map do |text_batch|  # Process in batches of 10
      data = []
      Async do |parent|
        text_batch.map do |text|
          parent.async do |task|
            begin
              # namespaces[0] scopes every inserted row to the primary namespace.
              vectorised_text = {content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
              mutex.synchronize do
                data << vectorised_text
              end
            rescue => e
              # NOTE(review): embedding failures are swallowed here — the failed
              # text is silently dropped from the batch and only logged to stdout.
              puts "Error processing text: #{e.message}"
              nil  # or some error indication
            end
          end
        end # Ensure all tasks in the batch are completed
      end.wait
      # Row order within a batch depends on task completion order, not input order.
      @db[@table_name.to_sym].multi_insert(data, return: :primary_key)
    end        
  else
    upsert_texts(texts: texts, ids: ids)
  end
end

#ask(question:, context_results: 4, prompt_template: nil) ⇒ String

Ask a question and return the answer

Parameters:

  • question (String)

    The question to ask

Returns:

  • (String)

    The answer to the question



180
181
182
183
184
185
186
187
188
189
190
191
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 180

# Ask a question and return the answer, using the k most similar stored
# chunks (joined with "---" separators) as context for the LLM.
#
# @param question [String] the question to ask
# @param context_results [Integer] number of similar chunks to use as context
# @param prompt_template [String, nil] optional custom prompt template
# @return [String] the answer to the question
def ask(question:, context_results: 4, prompt_template: nil)
  search_results = similarity_search(query: question, k: context_results)

  context = search_results.map do |result|
    result.content.to_s
  end
  context = context.join("\n---\n")

  # Bug fix: previously passed `prompt_template: nil`, silently discarding
  # the caller-supplied template.
  prompt = generate_prompt(question: question, context: context, prompt_template: prompt_template)

  llm.chat(prompt: prompt)
end

#create_default_schemaObject



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 105

# Ensures the pgvector extension exists and creates the three tables this
# adapter relies on (embeddings, documents, and their join table) when absent.
def create_default_schema
  db.run "CREATE EXTENSION IF NOT EXISTS vector"

  # Capture into locals so the create_table blocks can close over them.
  ns_col = @namespace_column.to_sym
  dimension = llm.default_dimension || 1000

  # Embedding rows: chunk text plus its vector, scoped by namespace.
  db.create_table? :bx_builder_chain_embeddings do
    primary_key :id
    text :content
    column :vectors, "vector(#{dimension})"
    text ns_col, default: 'public'

    index ns_col
  end

  # Document rows: one per ingested file, unique per (name, namespace).
  db.create_table? :bx_builder_chain_documents do
    primary_key :id
    text :name
    text ns_col, default: 'public'
    timestamp :created_at
    timestamp :updated_at

    index [:name, ns_col], unique: true
  end

  # Join table tying each document to its embedding chunks.
  db.create_table? :bx_builder_chain_document_chunks do
    primary_key :id
    foreign_key :document_id, :bx_builder_chain_documents, null: false, on_delete: :cascade
    foreign_key :embedding_id, :bx_builder_chain_embeddings, null: false, on_delete: :cascade

    unique [:document_id, :embedding_id]
  end
end

#destroy_default_schemaObject

Destroy default schema



144
145
146
147
148
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 144

# Drops the adapter's tables if they exist. Order matters: the join table
# goes first, then documents, then embeddings (foreign-key dependencies).
def destroy_default_schema
  %i[
    bx_builder_chain_document_chunks
    bx_builder_chain_documents
    bx_builder_chain_embeddings
  ].each { |table| db.drop_table?(table) }
end

#documents_modelObject



42
43
44
45
46
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 42

# Builds an anonymous Sequel::Model bound to the embeddings table, with the
# pgvector plugin enabled on the :vectors column for nearest-neighbor queries.
def documents_model
  base_model = Sequel::Model(@table_name.to_sym)
  Class.new(base_model) do
    plugin :pgvector, :vectors
  end
end

#similarity_search(query:, k: 4) ⇒ Array<Hash>

Search for similar texts in the index

Parameters:

  • query (String)

    The text to search for

  • k (Integer) (defaults to: 4)

    The number of top results to return

Returns:

  • (Array<Hash>)

    The results of the search



154
155
156
157
158
159
160
161
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 154

# Search the index for texts similar to the query.
#
# @param query [String] the text to search for
# @param k [Integer] the number of top results to return
# @return [Array<Hash>] the results of the search
def similarity_search(query:, k: 4)
  query_vector = llm.embed(text: query)
  similarity_search_by_vector(embedding: query_vector, k: k)
end

#similarity_search_by_vector(embedding:, k: 4) ⇒ Array<Hash>

Search for similar texts in the index by the passed in vector. You must generate your own vector using the same LLM that generated the embeddings stored in the Vectorsearch DB.

Parameters:

  • embedding (Array<Float>)

    The vector to search for

  • k (Integer) (defaults to: 4)

    The number of top results to return

Returns:

  • (Array<Hash>)

    The results of the search



168
169
170
171
172
173
174
175
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 168

# Nearest-neighbor search over the embeddings table by a precomputed vector,
# filtered to this instance's namespaces and capped at k rows.
#
# @param embedding [Array<Float>] the vector to search for (must come from
#   the same LLM that produced the stored embeddings)
# @param k [Integer] the number of top results to return
# @return [Array<Hash>] the results of the search
def similarity_search_by_vector(embedding:, k: 4)
  db.transaction do # BEGIN
    # NOTE(review): Sequel datasets are lazy — this builds a query that the
    # caller enumerates, likely after the transaction block returns; confirm
    # the transaction wrapper is intentional.
    documents_model
      .nearest_neighbors(:vectors, embedding, distance: operator, threshold: @threshold)
      .where(@namespace_column.to_sym => namespaces)
      .limit(k)
  end
end

#update_texts(texts:, ids:) ⇒ Array<Integer>

Update a list of ids and corresponding texts to the index

Parameters:

  • texts (Array<String>)

    The texts to add to the index

  • ids (Array<String>)

    The ids to add to the index, in the same order as the texts

Returns:

  • (Array<Integer>)

    The ids of the updated texts.



101
102
103
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 101

# Update a list of texts already in the index, matching each id to its text.
# Thin wrapper that delegates straight to upsert_texts.
#
# @param texts [Array<String>] the texts to write
# @param ids [Array<String>] ids paired with texts, in the same order
# @return [Array<Integer>] the ids of the updated texts
def update_texts(texts:, ids:)
  upsert_texts(ids: ids, texts: texts)
end

#upsert_texts(texts:, ids:) ⇒ PG::Result

Upsert a list of texts to the index; existing rows are updated and new rows are added.

Parameters:

  • texts (Array<String>)

    The texts to add to the index

  • ids (Array<Integer>)

    The ids of the objects to add to the index, in the same order as the texts

Returns:

  • (PG::Result)

    The response from the database, including the ids of the added or updated texts.



53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/bx_builder_chain/vectorsearch/pgvector.rb', line 53

# Upsert a list of texts into the index: inserts new rows, and on id
# conflict updates the existing row's content and vectors in place.
#
# @param texts [Array<String>] the texts to write to the index
# @param ids [Array<Integer>] ids for the rows, in the same order as texts
# @return [PG::Result] database response including the affected row ids
def upsert_texts(texts:, ids:)
  # One row per (text, id) pair; `map` (not `flat_map`) — the block yields a
  # single Hash per pair, so there is nothing to flatten.
  data = texts.zip(ids).map do |text, id|
    {id: id, content: text, vectors: llm.embed(text: text).to_s, namespace: namespaces[0]}
  end
  @db[@table_name.to_sym]
    .insert_conflict(
      target: :id,
      update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
    )
    .multi_insert(data, return: :primary_key)
end