Class: SimilarityService
- Inherits:
- Object
- Inheritance hierarchy:
- Object
- SimilarityService
- Defined in:
- lib/utils/similarity_service.rb
Constant Summary
- FRENCH_STOP_WORDS =
%w( je tu il nous vous ils elle me te se le la les et ou mais que quand donc or ni car ).freeze
- ENGLISH_STOP_WORDS =
%w( i you he we they she me him us them and or but that when so nor for ).freeze
- STOP_WORDS =
(FRENCH_STOP_WORDS + ENGLISH_STOP_WORDS).freeze
Instance Method Summary
-
#initialize(input_question, document_chunks) ⇒ SimilarityService
constructor
A new instance of SimilarityService.
- #jaccard_similarity(str1, str2) ⇒ Object
- #most_similar_sentences(top_k) ⇒ Object
Constructor Details
#initialize(input_question, document_chunks) ⇒ SimilarityService
Returns a new instance of SimilarityService.
14 15 16 17 |
# File 'lib/utils/similarity_service.rb', line 14
# Stores the question and the document text that the similarity methods
# operate on. Both arguments are kept by reference; nothing is copied or
# validated here.
#
# @param input_question [String] the question that sentences are compared to
# @param document_chunks [Array<String>] pieces of document text; each chunk
#   is later split into sentences by #most_similar_sentences
def initialize(input_question, document_chunks)
  @input_question = input_question
  @document_chunks = document_chunks
end
Instance Method Details
#jaccard_similarity(str1, str2) ⇒ Object
19 20 21 22 23 24 25 |
# File 'lib/utils/similarity_service.rb', line 19
# Computes the Jaccard similarity (|A ∩ B| / |A ∪ B|) between the word sets
# of two strings. Words are lower-cased, split on whitespace, and any word
# listed in STOP_WORDS is discarded before comparison. Punctuation is not
# stripped, so "ruby?" and "ruby" count as different words.
#
# @param str1 [String] first text
# @param str2 [String] second text
# @return [Float] a value between 0.0 and 1.0; 0.0 when neither string
#   contains any non-stop word
def jaccard_similarity(str1, str2)
  set1, set2 = [str1, str2].map do |text|
    text.downcase.split(' ').reject { |word| STOP_WORDS.include?(word) }.to_set
  end

  union = set1 | set2
  # With nothing left to compare, 0.0 / 0 would be NaN, which cannot be
  # ordered against other scores. Treat "no words at all" as no similarity.
  return 0.0 if union.empty?

  (set1 & set2).size.to_f / union.size
end
#most_similar_sentences(top_k) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/utils/similarity_service.rb', line 27
# Splits every document chunk into sentences on '.', '?', '!' and ':',
# scores each sentence against the input question with #jaccard_similarity,
# and returns the best-scoring ones.
#
# @param top_k [Integer] maximum number of sentences to return
# @return [String] the selected sentences, highest score first, joined by a
#   single space (empty string when there are no sentences)
def most_similar_sentences(top_k)
  sentence_delimiters = /[.?!:]/

  all_sentences = @document_chunks.flat_map do |chunk|
    # Consecutive delimiters ("..", "?!") produce blank fragments; they carry
    # no content and would only add stray spaces to the joined result.
    chunk.split(sentence_delimiters).map(&:strip).reject(&:empty?)
  end

  similarities = all_sentences.map do |sentence|
    score = jaccard_similarity(@input_question, sentence)
    # A NaN score (0/0 when neither text has a comparable word) cannot be
    # ordered and would make sort_by raise; rank it as "no similarity".
    [sentence, score.nan? ? 0.0 : score]
  end

  # Sort by descending similarity and keep the top_k sentences.
  similarities.sort_by { |_, similarity| -similarity }
              .take(top_k)
              .map(&:first)
              .join(' ')
end