Class: Bm25::Parser

Inherits:

Object

Object
Bm25::Parser

show all

Defined in: lib/bm25/parser.rb

Instance Method Summary collapse

#create_data ⇒ Object
#create_docs ⇒ Object
#create_idf_map ⇒ Object
#execute(document) ⇒ Object
#get_dataset ⇒ Object
#get_important_keyword(dataset) ⇒ Object
#initialize ⇒ Parser constructor

A new instance of Parser.

Constructor Details

#initialize ⇒ `Parser`

Returns a new instance of Parser.

# File 'lib/bm25/parser.rb', line 7

# Sets up an empty parser: no source text, no per-document statistics,
# no IDF table and a zero word count.
def initialize
  # Source text and the per-document records derived from it.
  @base_document, @docs = '', []
  # Corpus-wide statistics.
  @idf_map, @all_word_length = {}, 0
end

Instance Method Details

#create_data ⇒ `Object`

# File 'lib/bm25/parser.rb', line 14

# Runs the three build steps in order: fills the per-document records,
# then the IDF table, then assembles the scored dataset.
#
# @return [Object] the value returned by #get_dataset
def create_data
  create_docs
  create_idf_map
  get_dataset
end

#create_docs ⇒ `Object`

# File 'lib/bm25/parser.rb', line 37

# Splits @base_document into sub-documents and appends one record per
# sub-document to @docs.
#
# Each record is a Hash with:
#   :document     - the sub-document text
#   :words        - word => { count:, tf: }, where tf is count divided by the
#                   number of words in the sub-document
#   :words_length - number of words Bm25::Utils.separate_words returned
#   :dl           - :words_length divided by the average number of words per
#                   sub-document (@all_word_length / number of sub-documents)
#
# Reads @base_document and @all_word_length, which #execute sets before
# calling into this method.
#
# @return [Object] the list returned by Bm25::Utils.separate_document
def create_docs
  doc_list = Bm25::Utils.separate_document(@base_document)

  # Computed once (it does not depend on the sub-document) and as a Float:
  # integer division would truncate the average, down to 0 when there are
  # fewer words than sub-documents, which makes :dl infinite.
  average_word_length = @all_word_length / doc_list.length.to_f

  doc_list.each do |d|
    total_words = Bm25::Utils.separate_words(d)
    word_map = {}
    total_words.each do |w|
      # Only the first occurrence of a word defines its entry.
      next if word_map.key?(w)

      # Word count: occurrences of the word's text in the sub-document.
      # NOTE(review): this is a substring match, so a short word is also
      # counted when it appears inside a longer one - confirm this is intended.
      count = d.scan(/#{Regexp.escape(w)}/).length
      word_map[w] = {
        count: count,
        tf: count.to_f / total_words.length
      }
    end
    @docs << {
      document: d,
      words: word_map,
      words_length: total_words.length,
      # With no words at all the average is 0; report 0.0 instead of NaN.
      dl: average_word_length.zero? ? 0.0 : total_words.length / average_word_length
    }
  end
end

#create_idf_map ⇒ `Object`

# File 'lib/bm25/parser.rb', line 65

# Fills @idf_map with one entry per distinct word found in @docs:
#   word => { df: <number of documents containing the word>,
#             idf: log(<number of documents> / df) + 1 }
#
# Reads the :words hash of every record in @docs, so #create_docs must have
# run first.
#
# @return [Array] the distinct words, in first-seen order
def create_idf_map
  doc_count = @docs.length

  # Each record's :words hash has unique keys, so counting keys across
  # records gives the document frequency in a single pass.
  document_frequency = Hash.new(0)
  @docs.each do |d|
    d[:words].each_key { |word| document_frequency[word] += 1 }
  end

  words = document_frequency.keys
  words.each do |word|
    df = document_frequency[word]
    @idf_map[word] = {
      df: df,
      # Float division: an integer quotient would be truncated before the
      # logarithm is taken (3 documents / df 2 would become 1, not 1.5).
      idf: Math.log(doc_count.to_f / df) + 1
    }
  end
end

#execute(document) ⇒ `Object`

# File 'lib/bm25/parser.rb', line 21

# Scores the words of +document+ and returns them ranked.
#
# Resets the parser state, stores the document and its total word count,
# then builds the dataset (#create_data) and ranks it
# (#get_important_keyword).
#
# @param document [String] text to analyse
# @return [Object] the value returned by #get_important_keyword
# @raise [RuntimeError] when +document+ is nil or empty; the message asks the
#   caller to pass some text
def execute(document)
  raise '文字を渡してください' if document.nil? || document.length < 1

  # Clear everything left over from a previous run.
  @all_word_length = 0
  @idf_map = {}
  @docs = []

  @base_document = document
  @all_word_length = Bm25::Utils.separate_words(document).length

  get_important_keyword(create_data)
end

#get_dataset ⇒ `Object`

# File 'lib/bm25/parser.rb', line 83

# Builds the scored dataset from @docs and @idf_map.
#
# For every word of every record in @docs the score is
#
#   [ TF(i,j) * IDF(i) * (K1 + 1) ] / [ K1 * (1 - b + b * NDL(j)) + TF(i,j) ]
#
# where TF is the word's :tf, IDF comes from @idf_map and NDL is the
# record's :dl.
#
# @param k1 [Numeric] K1 in the formula above
# @param b [Numeric] b in the formula above (weight of NDL)
# @return [Array<Hash>] one { document:, words: } hash per record; :words is a
#   list of { word:, tf:, idf:, tfidf:, bm25: } hashes sorted by :bm25, highest
#   first
def get_dataset(k1: 1.2, b: 0.75)
  @docs.map do |d|
    # Same for every word of this record, so compute it once.
    length_norm = k1 * (1 - b + (b * d[:dl]))

    scored_words = d[:words].map do |word, stats|
      tf = stats[:tf]
      idf = @idf_map[word][:idf]
      tfidf = idf * tf
      {
        word: word,
        tf: tf,
        idf: idf,
        tfidf: tfidf,
        # The numerator is a product, as in the formula above.
        bm25: (tfidf * (k1 + 1)) / (length_norm + tf)
      }
    end

    {
      document: d[:document],
      words: scored_words.sort_by { |w| -w[:bm25] }
    }
  end
end

#get_important_keyword(dataset) ⇒ `Object`

# File 'lib/bm25/parser.rb', line 108

# Adds up every word's :bm25 score across the whole dataset and ranks the
# words by that total.
#
# @param dataset [Array<Hash>] entries whose :words is a list of hashes
#   carrying :word and :bm25
# @return [Array<Array>] [word, total score] pairs, highest total first
def get_important_keyword(dataset)
  totals = {}
  dataset.flat_map { |entry| entry[:words] }.each do |scored|
    word = scored[:word]
    running = totals[word]
    totals[word] = running ? running + scored[:bm25] : scored[:bm25]
  end
  totals.sort { |left, right| right.last <=> left.last }
end

Class: Bm25::Parser

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Parser

Instance Method Details

#create_data ⇒ Object

#create_docs ⇒ Object

#create_idf_map ⇒ Object

#execute(document) ⇒ Object

#get_dataset ⇒ Object

#get_important_keyword(dataset) ⇒ Object

#initialize ⇒ `Parser`

#create_data ⇒ `Object`

#create_docs ⇒ `Object`

#create_idf_map ⇒ `Object`

#execute(document) ⇒ `Object`

#get_dataset ⇒ `Object`

#get_important_keyword(dataset) ⇒ `Object`