Class: Cluster::Grammy

Inherits:
Object
  • Object
show all
Defined in:
lib/lite/ngrams.rb

Instance Method Summary collapse

Constructor Details

#initializeGrammy

Returns a new instance of Grammy.



4
5
6
7
8
9
# File 'lib/lite/ngrams.rb', line 4

def initialize
  @word        = Hash.new
  @word_next   = Hash.new
  @word_bigram = Hash.new
  @perms = Hash.new
end

Instance Method Details

#calculate_ngrams(depth = 5, cutoffs = [2,2,1,1,1]) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# File 'lib/lite/ngrams.rb', line 30

def calculate_ngrams( depth=5, cutoffs=[2,2,1,1,1] )    
  a = { :w => @word.delete_if{|key, value| value <= cutoffs.first } , :wb => @word_bigram } #{  :w=>{}, :wb=>{} }
  depth.times do |i|
    cutoff = cutoffs[ i ]
    @word = a[:w]
    @word_bigram = a[:wb]      
    a = a[:w].keys.inject( a ) do |a, uni|
      cs = sig_bigrams(uni, cutoff)
      cs.keys.each do |x| 
        new_uni = "#{uni} #{x}"
        a[:w][new_uni] = a[:wb][uni][x] rescue 0; 
        a[:wb][x].keys.each{|z| a[:wb][new_uni] ||= {}; a[:wb][new_uni][z] ||= {}; a[:wb][new_uni][z] = ( (a[:wb][uni][x]/@word_next[x].to_f)* (a[:wb][x][z]||0) ).to_i  } rescue ""
      end
      a[:w].delete(uni) if cs.size > 0 or a[:w][uni] < cutoff
      a
    end
  end
  a
end

#digest!(word_seq_array) ⇒ Object



11
12
13
14
15
16
17
18
19
20
21
22
23
24
# File 'lib/lite/ngrams.rb', line 11

def digest!( word_seq_array )
  (0..word_seq_array.size-1).each do |i|
    w = word_seq_array[i]
    @word[ w ] ||= 0
    @word[ w ] += 1
    next if i == word_seq_array.size-1
    next_w = word_seq_array[i+1]
    @word_bigram[ w ] ||= {}
    @word_bigram[ w ][next_w] ||= 0
    @word_bigram[ w ][next_w] += 1          
    @word_next[ next_w] ||= 0
    @word_next[ next_w ] += 1
  end
end

#extractObject



26
27
28
# File 'lib/lite/ngrams.rb', line 26

def extract
  calculate_ngrams()[ :w ].sort{|x1,x2| x2.last <=> x1.last}
end

#new_sample_no_replace(total, table, nitems) ⇒ Object



112
113
114
115
116
# File 'lib/lite/ngrams.rb', line 112

def new_sample_no_replace(total, table, nitems)
  cdf = CDFast.new table

  cdf.sample( nitems ).inject( {} ){|h,x| h[ x ] ||= 0; h[x] +=1; h}
end

#nth_item_from_table(table, n) ⇒ Object



129
130
131
132
133
134
135
136
# File 'lib/lite/ngrams.rb', line 129

def nth_item_from_table(table, n)
  sum = 0
  table.each do |wc|
    sum = sum + wc[1]
    return wc[0] if (n < sum) #table is sorted 
  end    
  table.last.first
end

#null_score(count, bigram, total, pvalue, perm_hash) ⇒ Object



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# File 'lib/lite/ngrams.rb', line 87

def null_score( count, bigram, total, pvalue, perm_hash )

  perm_key =  count/perm_hash # int div ..

  return @perms[perm_key] if @perms.has_key?  perm_key 

  max_score = 0
  nperm = (1.0 / pvalue).to_i
  table = bigram.to_a.sort{|a,b| b[1]<=>a[1]}
  (0..nperm).each do |perm|
    #perm_bigram = sample_no_replace(total, table, count)
    perm_bigram = new_sample_no_replace(total, bigram, count)
    obs_score = word_scores(count, bigram, perm_bigram, total, 1)
    obs_score = obs_score.values.max
    max_score = obs_score if (obs_score > max_score or perm == 0)
  end             
  @perms[perm_key] = max_score

  max_score
end

#safelog(x) ⇒ Object



108
109
110
# File 'lib/lite/ngrams.rb', line 108

def safelog x
  x< 0 ? x : x==0? -1000000 : Math.log( x ) 
end

#sample_no_replace(total, table, nitems) ⇒ Object



118
119
120
121
122
123
124
125
126
127
# File 'lib/lite/ngrams.rb', line 118

def sample_no_replace(total, table, nitems)    
  sample = (0..total).to_a.sample( nitems )
  count = {}
  sample.each do |n|
    w = nth_item_from_table(table, n)
    count[w] ||= 0
    count[w] += 1
  end
  count
end

#sig_bigrams(word, min) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
62
63
64
# File 'lib/lite/ngrams.rb', line 51

def sig_bigrams(word, min)
  return { } if @word_bigram[ word ].nil?||@word_bigram[ word ].empty?
   
  total = @word.values.inject(:+)  
  count = @word_bigram[word].values.inject(:+)
  sig_big = { }    
  scores = word_scores( count, @word, @word_bigram[word], total, min )
  scores.to_a.sort{|wc,zc| zc[1] <=> wc[1] }.each do |w,c|
    next if @word_bigram[word][w] < min 
    null_score = null_score( count, @word, total, 0.1, 10 )
    sig_big[w] = c if c > null_score 
  end
  sig_big
end

#word_scores(count, unigram, bigram, total, min_count) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# File 'lib/lite/ngrams.rb', line 66

def word_scores( count, unigram, bigram, total, min_count )
  val = Hash.new
  bigram.keys.each do |v|
    uni = unigram[v]||0
    big = bigram[v]||0
    next if big < min_count

    log_pi_vu = safelog(big) - safelog(count)
    log_pi_vnu = safelog(uni - big) - safelog(total - big)
    log_pi_v_old = safelog(uni) - safelog(total)
    log_1mp_v = safelog(1 - Math.exp(log_pi_vnu))
    log_1mp_vu = safelog(1 - Math.exp(log_pi_vu))
        
    val[v] = 2 * (big * log_pi_vu + \
                 (uni - big) * log_pi_vnu - \
                 uni * log_pi_v_old + \
                 (count - big) * (log_1mp_vu - log_1mp_v))
  end
  val
end