Class: Bm25::Parser
- Inherits:
-
Object
- Object
- Bm25::Parser
- Defined in:
- lib/bm25/parser.rb
Instance Method Summary
- #create_data ⇒ Object
- #create_docs ⇒ Object
- #create_idf_map ⇒ Object
- #execute(document) ⇒ Object
- #get_dataset ⇒ Object
- #get_important_keyword(dataset) ⇒ Object
-
#initialize ⇒ Parser
constructor
A new instance of Parser.
Constructor Details
#initialize ⇒ Parser
Returns a new instance of Parser.
7 8 9 10 11 12 |
# File 'lib/bm25/parser.rb', line 7
# Builds a parser with empty state: no per-document records, an empty
# IDF table, an empty source text and a total word count of zero.
def initialize
  @docs, @idf_map = [], {}
  @base_document = ''
  @all_word_length = 0
end
Instance Method Details
#create_data ⇒ Object
14 15 16 17 18 19 |
# File 'lib/bm25/parser.rb', line 14
# Runs the three build steps in order -- create_docs, create_idf_map,
# get_dataset -- and hands back the dataset.
#
# @return [Object] whatever get_dataset returns
def create_data
  create_docs
  create_idf_map
  get_dataset
end
#create_docs ⇒ Object
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
# File 'lib/bm25/parser.rb', line 37
# Splits @base_document into documents and appends one record per
# document to @docs.
#
# Each record holds the document text (:document), a map from word to
# { count:, tf: } (:words), the number of words (:words_length) and
# :dl, the document's word count divided by the average word count per
# document.
def create_docs
  doc_list = Bm25::Utils.separate_document(@base_document)

  # Float division on purpose: integer division truncated the average
  # (7 words over 2 documents became 3) and could reach 0, which turned
  # every :dl into Infinity. The value does not depend on the document,
  # so it is computed once.
  average_word_length = @all_word_length.to_f / doc_list.length

  doc_list.each do |d|
    total_words = Bm25::Utils.separate_words(d)
    word_map = {}

    total_words.each do |w|
      # Only the first occurrence of a word creates its entry.
      next if word_map.key?(w)

      # Number of occurrences of the word.
      # NOTE(review): this counts substring matches in the raw text, not
      # matching tokens -- confirm that is intended.
      count = d.scan(/#{Regexp.escape(w)}/).length
      word_map[w] = {
        count: count,
        tf: count.to_f / total_words.length
      }
    end

    @docs << {
      document: d,
      words: word_map,
      words_length: total_words.length,
      dl: total_words.length / average_word_length
    }
  end
end
#create_idf_map ⇒ Object
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# File 'lib/bm25/parser.rb', line 65
# Fills @idf_map with one entry per distinct word found in @docs:
# { df: <number of documents containing the word>,
#   idf: log(<number of documents> / df) + 1 }.
#
# @return [Array] the distinct words, in order of first appearance
def create_idf_map
  # One pass over the documents; each document's :words hash lists a
  # word at most once, so this counts documents, not occurrences.
  doc_frequency = Hash.new(0)
  @docs.each do |doc|
    doc[:words].each_key { |word| doc_frequency[word] += 1 }
  end

  # Float on purpose: integer division made a word present in 2 of 3
  # documents score the same as a word present in all of them.
  total_docs = @docs.length.to_f

  doc_frequency.each do |word, df|
    # df is always >= 1 here because every word was collected from @docs.
    @idf_map[word] = {
      df: df,
      idf: Math.log(total_docs / df) + 1
    }
  end

  doc_frequency.keys
end
#execute(document) ⇒ Object
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# File 'lib/bm25/parser.rb', line 21
# Resets the parser state, stores +document+ as the text to analyse,
# builds the dataset with create_data and returns the result of
# get_important_keyword for it.
#
# @param document [String] text to analyse
# @return [Object] whatever get_important_keyword returns
# @raise [RuntimeError] when +document+ is nil or has length zero
def execute(document)
  # The message is Japanese for "please pass some text"; a nil argument
  # gets the same error instead of a NoMethodError on nil.
  raise '文字を渡してください' if document.nil? || document.length < 1

  # Clear whatever a previous run left behind.
  @idf_map = {}
  @docs = []
  @base_document = document
  @all_word_length = Bm25::Utils.separate_words(document).length

  get_important_keyword(create_data)
end
#get_dataset ⇒ Object
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
# File 'lib/bm25/parser.rb', line 83
# Scores every word of every record in @docs and returns one entry per
# document: { document:, words: } where :words is a list of
# { word:, tf:, idf:, tfidf:, bm25: } sorted by descending :bm25.
#
# BM25 is computed as
#   [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + (b * NDL(j))) + TF(i,j) ]
# with K1 = 1.2, b = 0.75 and NDL(j) taken from the record's :dl.
#
# @return [Array<Hash>] one scored entry per record in @docs
def get_dataset
  k1 = 1.2
  b = 0.75

  @docs.map do |d|
    # The length-normalisation term is the same for every word of a document.
    length_norm = k1 * (1 - b + (b * d[:dl]))

    scored = d[:words].map do |word, stats|
      idf = @idf_map[word][:idf]
      tfidf = idf * stats[:tf]
      {
        word: word,
        tf: stats[:tf],
        idf: idf,
        tfidf: tfidf,
        # The numerator is a product; it was previously written as
        # tfidf + (k1 + 1), which contradicts the formula above.
        bm25: (tfidf * (k1 + 1)) / (length_norm + stats[:tf])
      }
    end

    {
      document: d[:document],
      words: scored.sort_by { |w| -w[:bm25] }
    }
  end
end
#get_important_keyword(dataset) ⇒ Object
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
# File 'lib/bm25/parser.rb', line 108
# Adds up the :bm25 score of each word across all entries of +dataset+
# and returns the totals ordered from highest to lowest.
#
# @param dataset [Array<Hash>] entries whose :words is a list of hashes
#   carrying :word and :bm25
# @return [Array<Array>] [word, summed bm25] pairs, highest score first
def get_important_keyword(dataset)
  totals = Hash.new(0)

  dataset.each do |data|
    data[:words].each { |val| totals[val[:word]] += val[:bm25] }
  end

  totals.sort { |(_, left), (_, right)| right <=> left }
end