Class: BM25F

Inherits:
Object
  • Object
show all
Defined in:
lib/bm25f.rb

Instance Method Summary collapse

Constructor Details

#initialize(term_freq_weight: 1.33, doc_length_weight: 0.8) ⇒ BM25F

Initializes a BM25F model.

Parameters:

  • term_freq_weight (Float) (defaults to: 1.33)

    Weight for term frequency.

  • doc_length_weight (Float) (defaults to: 0.8)

    Weight for document length.



9
10
11
12
13
14
15
# File 'lib/bm25f.rb', line 9

# Builds a BM25F model: creates the text-processing collaborators and records
# the two tuning weights for later use.
#
# @param term_freq_weight [Float] weight for term frequency
# @param doc_length_weight [Float] weight for document length
def initialize(term_freq_weight: 1.33, doc_length_weight: 0.8)
  # Text-processing collaborators, both constructed with their defaults.
  @tokenizer = PragmaticTokenizer::Tokenizer.new
  @stemmer = UEAStemmer.new

  # Tuning weights, stored exactly as given.
  @doc_length_weight = doc_length_weight
  @term_freq_weight = term_freq_weight
end

Instance Method Details

#fit(documents, field_weights = {}) ⇒ Object

Fits the model to a set of documents.

Parameters:

  • documents (Hash)

    The documents to fit the model to.

  • field_weights (Hash) (defaults to: {})

    A weight for each key in the documents. Keys without an entry default to a weight of 1.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# File 'lib/bm25f.rb', line 21

# Fits the model to a set of documents.
#
# The documents are first passed through preprocess_documents; the result is
# expected to be a collection whose elements respond to +keys+. Every key
# found there that has no entry in +field_weights+ is given a weight of 1.
# The caller's +field_weights+ hash is never modified: defaults are filled in
# on a copy, so a frozen hash, or one reused for another model, is safe.
#
# @param documents [Hash] the documents to fit the model to
# @param field_weights [Hash] optional weight per document key
# @return [Object] the value returned by calculate_idf (also kept in @idf)
def fit(documents, field_weights = {})
  documents = preprocess_documents(documents)

  # Copy first so the defaults below do not leak into the caller's hash.
  weights = field_weights.dup

  # Set missing field weights to 1.
  documents.flat_map(&:keys).uniq.each do |key|
    weights[key] = 1 unless weights.key?(key)
  end

  @field_weights = weights
  @documents = documents
  @avg_doc_length = calculate_average_document_length(documents)
  @doc_lengths = calculate_document_lengths(documents)
  @total_docs = documents.length
  @idf = calculate_idf
end

#score(query) ⇒ Hash

Calculates the score of each document using the query.

Parameters:

  • query (String)

    The query to score with.

Returns:

  • (Hash)

    A hash containing document IDs and their scores.



43
44
45
46
47
48
49
50
# File 'lib/bm25f.rb', line 43

# Calculates the score of each document using the query.
#
# The query is run through preprocess_query once; every document index in
# 0...@total_docs is then scored with calculate_document_score.
#
# @param query [String] the query to score with
# @return [Hash] document IDs (indices) mapped to their scores
def score(query)
  terms = preprocess_query(query)

  (0...@total_docs).each_with_object({}) do |doc_id, results|
    results[doc_id] = calculate_document_score(doc_id, terms)
  end
end