Class: Clusterer::Lsi

Inherits:
Object
  • Object
show all
Includes:
Linalg
Defined in:
lib/clusterer/lsi/lsi.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(docs) ⇒ Lsi

Returns a new instance of Lsi.



39
40
41
# File 'lib/clusterer/lsi/lsi.rb', line 39

# Creates an Lsi instance over the given document collection.
#
# The collection is stored as-is (no copy is made), so the object passed in
# is the same one that #<< later appends to.
#
# @param docs the document collection; presumably an array of numeric
#   vectors, since perform_svd splats it into DMatrix[] -- TODO confirm
def initialize(docs)
  @documents = docs
end

Instance Attribute Details

#documents ⇒ Object (readonly)

Returns the value of attribute documents.



38
39
40
# File 'lib/clusterer/lsi/lsi.rb', line 38

# Reader for the document collection that was handed to the constructor
# and that #<< appends to.
#
# @return [Object] the current value of @documents
def documents
  @documents
end

Instance Method Details

#<<(doc) ⇒ Object



82
83
84
# File 'lib/clusterer/lsi/lsi.rb', line 82

# Appends a document to the collection and invalidates the cached SVD results.
#
# rebuild_if_needed only recomputes when @t, @s or @d is unset, so without
# the invalidation a previously computed decomposition (which does not
# include the new document) would keep being reused by cluster_documents
# and search.
#
# @param doc the document to add (same representation as the entries of the
#   collection given to the constructor)
# @return [Object] whatever the underlying collection's << returns (for an
#   Array, the collection itself), exactly as before
def <<(doc)
  result = @documents << doc
  clear_cached_results
  result
end

#clear_cached_results ⇒ Object



47
48
49
# File 'lib/clusterer/lsi/lsi.rb', line 47

# Resets every cached decomposition variable (@t, @s, @d, @s_inv, @sd) to nil.
#
# Once @t, @s and @d are nil, rebuild_if_needed will run perform_svd again.
#
# @return [nil]
def clear_cached_results
  @sd = nil
  @s_inv = nil
  @d = nil
  @s = nil
  @t = nil
end

#cluster_documents(k, options = { }) ⇒ Object



63
64
65
66
67
68
69
# File 'lib/clusterer/lsi/lsi.rb', line 63

# Clusters the stored documents into k groups.
#
# Each column of `sd` is tagged with its index (via `position=`) before being
# handed to the clustering routine, so the resulting clusters can be mapped
# back to the entries of @documents.
#
# @param k passed straight through to the clustering algorithm
# @param options [Hash] forwarded to the algorithm; :algorithm names the
#   method to call on Algorithms (defaults to :kmeans)
# @return [Array<Array>] one array of documents per cluster
def cluster_documents(k, options = {})
  rebuild_if_needed
  algorithm = options[:algorithm] || :kmeans
  vectors = sd.columns.each_with_index.map do |column, index|
    column.position = index
    column
  end
  found = Algorithms.send(algorithm, vectors, k, options)
  found.map do |cluster|
    cluster.documents.map { |vector| @documents[vector.position] }
  end
end

#perform_svd(cutoff = 0.80) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
# File 'lib/clusterer/lsi/lsi.rb', line 51

# Runs a singular value decomposition of the document matrix and stores the
# truncated factors in @t, @s and @d.
#
# NOTE(review): @s_inv and @sd (the other variables reset by
# clear_cached_results) are not touched here; if they cache values derived
# from these factors they could go stale when this is called directly --
# confirm.
#
# @param cutoff [Float] fraction of the trace of @s used as the threshold for
#   the running sum of diagonal entries (default 0.80)
def perform_svd (cutoff = 0.80)
  # The entries of @documents are splatted into DMatrix[] and the result is
  # transposed; presumably each document ends up as one column -- TODO confirm
  # against the Linalg API.
  matrix = DMatrix[*@documents].transpose
  @t, @s, @d =  matrix.svd
  val = @s.trace * cutoff
  # cnt ends up as the index of the last diagonal entry visited below.
  cnt = -1
  # Walk the diagonal of @s, accumulating its entries in n. The comparison
  # uses the sum *before* the current entry is added, so the walk stops one
  # entry after the running sum first exceeds val (or at the last entry).
  (0..([@s.nrow, @s.ncol].min - 1)).inject(0) {|n,v| cnt += 1; (n > val) ? break : n + @s[v,v] }
  # Keep only indices 0..cnt of each factor.
  @t = DMatrix.join_columns((0..cnt).collect {|i|@t.column(i) })
  @d = DMatrix.join_rows((0..cnt).collect {|i| @d.row(i) })
  @s = DMatrix.join_columns((0..cnt).collect {|i|@s.column(i) })
  # NOTE(review): @s was just rebuilt from cnt + 1 columns, so `ncol == cnt`
  # looks like it can never hold, meaning the rows are always trimmed -- confirm.
  @s = DMatrix.join_rows((0..cnt).collect {|i|@s.row(i) }) unless @s.ncol == cnt
end

#rebuild_if_needed ⇒ Object



43
44
45
# File 'lib/clusterer/lsi/lsi.rb', line 43

# Recomputes the decomposition via perform_svd when any of the cached
# factors (@t, @d, @s) is missing; otherwise does nothing.
#
# @return [Object, nil] perform_svd's result when it ran, nil when the cached
#   factors were all present
def rebuild_if_needed
  perform_svd unless [@t, @d, @s].all?
end

#search(document, threshold = 0.5) ⇒ Object



71
72
73
74
75
76
77
78
79
80
# File 'lib/clusterer/lsi/lsi.rb', line 71

# Finds the stored documents that are similar to the given document.
#
# The query is wrapped in a DMatrix, multiplied by @t, s_inv and @s,
# transposed, and then compared against every column of `sd` by cosine
# similarity.
#
# @param document the query document, in the form DMatrix[] accepts
#   (presumably the same vector representation as the stored documents --
#   TODO confirm)
# @param threshold [Numeric] minimum cosine similarity for a match
#   (default 0.5)
# @return [Array] the entries of `documents` whose column in `sd` scored at
#   or above the threshold, in column order
def search(document, threshold = 0.5)
  rebuild_if_needed
  # Both branches of the former `$LINALG ? ... : ...` built the same matrix,
  # so the conditional was dropped.
  query = DMatrix[document]
  query = (query * @t) * s_inv
  query = (query * @s).transpose
  # When $LINALG is unset, the comparison is made against the first column.
  query = query.column(0) unless $LINALG
  matches = sd.columns.each_with_index.select do |column, _index|
    column.cosine_similarity(query) >= threshold
  end
  matches.map { |_column, index| documents[index] }
end