Class: Bm25::Parser

Inherits:
Object
  • Object
show all
Defined in:
lib/bm25/parser.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Parser

Returns a new instance of Parser.



7
8
9
10
11
12
# File 'lib/bm25/parser.rb', line 7

# Sets up an empty parser: no source text, no per-document records,
# no IDF entries, and a total word count of zero.
def initialize
  @base_document, @docs = '', []
  @idf_map = {}
  @all_word_length = 0
end

Instance Method Details

#create_data ⇒ Object



14
15
16
17
18
19
# File 'lib/bm25/parser.rb', line 14

# Runs the pipeline over the parser's current state: builds the
# per-document records, then the IDF table, then assembles the dataset.
#
# @return [Object] whatever get_dataset returns
def create_data
  create_docs
  create_idf_map
  get_dataset
end

#create_docs ⇒ Object



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# File 'lib/bm25/parser.rb', line 37

# Splits @base_document into sub-documents (via
# Bm25::Utils.separate_document) and appends one record per sub-document
# to @docs.
#
# Each appended record has the shape:
#   {
#     document:     the sub-document as returned by separate_document,
#     words:        { word => { count: Integer, tf: Float } },
#     words_length: number of tokens returned by separate_words,
#     dl:           words_length divided by the average words per sub-document
#   }
#
# Reads @base_document and @all_word_length; mutates @docs.
#
# @return [Object] the collection returned by separate_document
def create_docs
  doc_list = Bm25::Utils.separate_document(@base_document)

  # The average is identical for every sub-document, so compute it once.
  # It is a Float on purpose: integer division would truncate it (and can
  # yield 0, which turns every :dl into Infinity).
  average_word_length =
    doc_list.empty? ? 0.0 : @all_word_length.to_f / doc_list.length

  doc_list.each do |d|
    total_words = Bm25::Utils.separate_words(d)
    word_map = {}
    total_words.each do |w|
      # A repeated token was already fully counted on its first occurrence.
      next if word_map.key?(w)

      # Word count. This counts literal matches of the token in the
      # sub-document text, so occurrences inside longer tokens are included.
      # NOTE(review): confirm substring counting is the intended behaviour.
      count = d.scan(/#{Regexp.escape(w)}/).length
      word_map[w] = {
        count: count,
        tf: count.to_f / total_words.length
      }
    end

    @docs << {
      document: d,
      words: word_map,
      words_length: total_words.length,
      # Guard the degenerate "no words at all" case instead of emitting NaN.
      dl: average_word_length.zero? ? 0.0 : total_words.length / average_word_length
    }
  end
end

#create_idf_map ⇒ Object



65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# File 'lib/bm25/parser.rb', line 65

# Fills @idf_map with one entry per distinct word found in @docs:
#
#   @idf_map[word] = { df: <number of documents containing the word>,
#                      idf: log(N / df) + 1 }
#
# where N is @docs.length. Reads @docs; mutates @idf_map.
#
# @return [Array] the distinct words that were processed
def create_idf_map
  vocabulary = @docs.flat_map { |d| d[:words].keys }.uniq

  vocabulary.each do |word|
    # Every word comes from at least one document, so df is never 0.
    df = @docs.count { |d| d[:words].key?(word) }
    @idf_map[word] = {
      df: df,
      # Float division: an integer quotient would be truncated before the
      # logarithm is taken (e.g. 3 / 2 => 1 => log(1) == 0).
      idf: Math.log(@docs.length.to_f / df) + 1
    }
  end
end

#execute(document) ⇒ Object



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File 'lib/bm25/parser.rb', line 21

# Entry point: resets the parser state, analyses +document+ and returns
# the result of get_important_keyword applied to the generated dataset.
#
# @param document [#length] the text to analyse; must be non-empty
# @return [Object] whatever get_important_keyword returns
# @raise [RuntimeError] when +document+ is nil or has length 0
def execute(document)
  # nil is rejected with the same error as an empty document rather than
  # failing with NoMethodError on #length.
  raise '文字を渡してください' if document.nil? || document.length < 1

  # Drop the results of any previous run.
  @idf_map = {}
  @docs = []

  @base_document = document
  @all_word_length = Bm25::Utils.separate_words(document).length

  get_important_keyword(create_data)
end

#get_dataset ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/bm25/parser.rb', line 83

# Scores every word of every record in @docs and returns the scored
# dataset. Reads @docs and @idf_map; does not modify either.
#
# For each word i of document j:
#
#   tfidf = TF(i,j) * IDF(i)
#   bm25  = [ TF(i,j) * IDF(i) * (K1 + 1) ] /
#           [ K1 * (1 - b + (b * NDL(j))) + TF(i,j) ]
#
# where NDL(j) is the record's :dl value.
#
# @param k1 [Numeric] BM25 K1 constant used in the formula above
# @param b [Numeric] BM25 b constant weighting NDL(j) in the formula above
# @return [Array<Hash>] one hash per document:
#   { document: ..., words: [{ word:, tf:, idf:, tfidf:, bm25: }, ...] }
#   with :words sorted by :bm25 in descending order
def get_dataset(k1: 1.2, b: 0.75)
  @docs.map do |d|
    # The length-normalisation term depends only on the document.
    length_norm = k1 * (1 - b + (b * d[:dl]))

    scored_words = d[:words].map do |word, stats|
      tf = stats[:tf]
      idf = @idf_map[word][:idf]
      tfidf = idf * tf
      {
        word: word,
        tf: tf,
        idf: idf,
        tfidf: tfidf,
        # The numerator is a product, as in the formula above.
        bm25: (tfidf * (k1 + 1)) / (length_norm + tf)
      }
    end

    {
      document: d[:document],
      words: scored_words.sort_by { |w| -w[:bm25] }
    }
  end
end

#get_important_keyword(dataset) ⇒ Object



108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# File 'lib/bm25/parser.rb', line 108

# Sums the :bm25 score of each word across every document in +dataset+
# and ranks the words by that total.
#
# @param dataset [Array<Hash>] documents, each with a :words array whose
#   items carry :word and :bm25
# @return [Array<Array>] [word, total] pairs, highest total first
def get_important_keyword(dataset)
  totals = {}
  dataset.each do |entry|
    entry[:words].each do |scored|
      term = scored[:word]
      score = scored[:bm25]
      running = totals[term]
      totals[term] = running ? running + score : score
    end
  end
  totals.sort { |(_, left), (_, right)| right <=> left }
end