Class: SimilarityService

Inherits:
Object
  • Object
show all
Defined in:
lib/utils/similarity_service.rb

Constant Summary collapse

# French pronouns, articles and conjunctions treated as stop words.
FRENCH_STOP_WORDS = %w[
  je tu il nous vous ils elle me te se le la les
  et ou mais que quand donc or ni car
].freeze

# English pronouns and conjunctions treated as stop words.
ENGLISH_STOP_WORDS = %w[
  i you he we they she me him us them
  and or but that when so nor for
].freeze

# French list followed by the English list. Words present in both
# ("me", "or") appear twice; the array is not de-duplicated.
STOP_WORDS = [*FRENCH_STOP_WORDS, *ENGLISH_STOP_WORDS].freeze

Instance Method Summary collapse

Constructor Details

#initialize(input_question, document_chunks) ⇒ SimilarityService

Returns a new instance of SimilarityService.



14
15
16
17
# File 'lib/utils/similarity_service.rb', line 14

# Stores the question and the document chunks it will be compared against.
#
# @param input_question the question text; later passed to
#   jaccard_similarity as the first string
# @param document_chunks a collection of chunk strings; each chunk is later
#   split into sentences
def initialize(input_question, document_chunks)
  @input_question, @document_chunks = input_question, document_chunks
end

Instance Method Details

#jaccard_similarity(str1, str2) ⇒ Object



19
20
21
22
23
24
25
# File 'lib/utils/similarity_service.rb', line 19

# Computes the Jaccard similarity (|A ∩ B| / |A ∪ B|) between the word sets
# of two strings.
#
# Each string is downcased, split on whitespace, and stripped of any word
# listed in STOP_WORDS before the sets are built. Punctuation is not removed,
# so "apple" and "apple," count as different words.
#
# @param str1 [String] first text
# @param str2 [String] second text
# @return [Float] a value between 0.0 and 1.0; 0.0 when neither text
#   contains a significant word
def jaccard_similarity(str1, str2)
  set1, set2 = [str1, str2].map do |text|
    text.downcase.split(' ').reject { |word| STOP_WORDS.include?(word) }.to_set
  end

  union = set1 | set2
  # An empty union would make the division 0.0 / 0 => NaN, which cannot be
  # compared or sorted. Treat "nothing to compare" as no similarity.
  return 0.0 if union.empty?

  (set1 & set2).size.to_f / union.size
end

#most_similar_sentences(top_k) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/utils/similarity_service.rb', line 27

# Returns the sentences from the document chunks that are most similar to the
# input question, joined into a single space-separated string.
#
# Chunks are split into sentences on ".", "?", "!" and ":". Each sentence is
# scored against @input_question with jaccard_similarity, and the top_k
# highest-scoring sentences are kept. Sentences with equal scores keep their
# original document order.
#
# @param top_k [Integer] maximum number of sentences to return
# @return [String] the selected sentences joined by a single space; an empty
#   string when there are no sentences
def most_similar_sentences(top_k)
  sentence_delimiters = /[.?!:]/
  all_sentences = @document_chunks.flat_map do |chunk|
    # Consecutive or leading delimiters ("...", "?!") produce blank fragments;
    # drop them so they are neither ranked nor joined into the result.
    chunk.split(sentence_delimiters).map(&:strip).reject(&:empty?)
  end

  scored = all_sentences.each_with_index.map do |sentence, position|
    score = jaccard_similarity(@input_question, sentence)
    # Defensive: a NaN score is not comparable and would make sort_by raise.
    score = 0.0 if score.nan?
    [sentence, score, position]
  end

  # sort_by is not stable, so the position is used as an explicit tie-breaker
  # to make the ranking deterministic.
  ranked = scored.sort_by { |_, score, position| [-score, position] }

  ranked.take(top_k).map(&:first).join(' ')
end