Class: Clusterer::Lsi
- Inherits:
-
Object
- Object
- Clusterer::Lsi
- Includes:
- Linalg
- Defined in:
- lib/clusterer/lsi/lsi.rb
Instance Attribute Summary collapse
-
#documents ⇒ Object
readonly
Returns the value of attribute documents.
Instance Method Summary collapse
- #<<(doc) ⇒ Object
- #clear_cached_results ⇒ Object
- #cluster_documents(k, options = { }) ⇒ Object
-
#initialize(docs) ⇒ Lsi
constructor
A new instance of Lsi.
- #perform_svd(cutoff = 0.80) ⇒ Object
- #rebuild_if_needed ⇒ Object
- #search(document, threshold = 0.5) ⇒ Object
Constructor Details
#initialize(docs) ⇒ Lsi
Returns a new instance of Lsi.
39 40 41 |
# File 'lib/clusterer/lsi/lsi.rb', line 39
#
# Creates an Lsi over the given document collection.
#
# The collection is stored as-is in @documents; no copy is made, so the
# caller's object and this instance share the same collection.
#
# @param docs [Object] the document collection. Other methods of this class
#   call #<< and #[] on it and splat it into DMatrix[], so it is presumably
#   an Array of numeric term vectors -- TODO confirm against callers.
def initialize(docs)
  @documents = docs
end
Instance Attribute Details
#documents ⇒ Object (readonly)
Returns the value of attribute documents.
38 39 40 |
# File 'lib/clusterer/lsi/lsi.rb', line 38
#
# Read-only accessor for the document collection held in @documents.
#
# @return [Object] the very object stored in @documents (not a copy)
def documents
  @documents
end
Instance Method Details
#<<(doc) ⇒ Object
82 83 84 |
# File 'lib/clusterer/lsi/lsi.rb', line 82
#
# Appends +doc+ to the document collection by delegating to @documents.<<.
#
# NOTE(review): this does not call clear_cached_results, and
# rebuild_if_needed only recomputes when @t, @d or @s is unset, so a
# document added after the first rebuild appears to be ignored until the
# caches are cleared -- confirm that this is intended.
#
# @param doc [Object] the document to add
# @return [Object] whatever @documents.<< returns (the collection itself
#   when @documents is an Array)
def <<(doc)
  @documents << doc
end
#clear_cached_results ⇒ Object
47 48 49 |
# File 'lib/clusterer/lsi/lsi.rb', line 47
#
# Drops every cached matrix (@t, @s, @d, @s_inv, @sd) by setting it to nil.
# Because rebuild_if_needed recomputes whenever @t, @d or @s is unset, the
# next call to it will run perform_svd again.
#
# @documents is left untouched.
#
# @return [nil]
def clear_cached_results
  @t = @s = @d = @s_inv = @sd = nil
end
#cluster_documents(k, options = { }) ⇒ Object
63 64 65 66 67 68 69 |
# File 'lib/clusterer/lsi/lsi.rb', line 63
#
# Groups the stored documents into clusters.
#
# Each column of +sd+ is tagged with its column index through +position=+,
# the tagged columns are handed to a method of +Algorithms+, and every
# returned cluster is mapped back to the entries of @documents found at the
# positions of its members.
#
# NOTE(review): options[:algorithm] is dispatched with +send+, so any method
# of Algorithms (private ones included) can be invoked by the caller --
# do not pass untrusted values.
#
# @param k [Object] forwarded unchanged to the algorithm; presumably the
#   requested number of clusters -- TODO confirm
# @param options [Hash] forwarded unchanged to the algorithm;
#   +:algorithm+ selects the Algorithms method to call (default +:kmeans+)
# @return [Array<Array>] one array of documents per cluster
def cluster_documents(k, options = {})
  rebuild_if_needed

  # Remember each column's index so cluster members can be mapped back to
  # the document stored at the same index.
  position = -1
  vectors = sd.columns.collect do |column|
    column.position = (position += 1)
    column
  end

  clusters = Algorithms.send(options[:algorithm] || :kmeans, vectors, k, options)
  clusters.collect do |cluster|
    cluster.documents.collect { |member| @documents[member.position] }
  end
end
#perform_svd(cutoff = 0.80) ⇒ Object
51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/clusterer/lsi/lsi.rb', line 51
#
# Decomposes the document matrix and truncates the factors.
#
# Steps, as performed below:
# 1. Build DMatrix[*@documents], transpose it, and call +svd+ on it; the
#    three results are stored in @t, @s and @d.
# 2. Walk the diagonal of @s, summing entries until the running sum exceeds
#    +cutoff+ times the trace of @s.
# 3. Keep only the leading columns of @t, rows of @d, and rows/columns of @s
#    up to the index reached in step 2.
#
# @s_inv and @sd (the ivars reset by clear_cached_results) are cleared first
# so that values derived from a previous decomposition are not reused after
# the factors change.
#
# NOTE(review): the loop stops on the entry at which the running sum is
# found to have already exceeded the target and still keeps that entry, so
# one more component than strictly required may be retained -- confirm
# whether that is intended.
#
# @param cutoff [Numeric] fraction of the trace of @s used as the target sum
# @return [Object, nil] the result of the final assignment (the trimmed @s),
#   or nil when the trailing guard skips it
def perform_svd(cutoff = 0.80)
  # Derived caches depend on @s/@d and must not outlive them.
  @s_inv = @sd = nil

  matrix = DMatrix[*@documents].transpose
  @t, @s, @d = matrix.svd

  target = @s.trace * cutoff
  last = -1
  accumulated = 0
  (0...[@s.nrow, @s.ncol].min).each do |index|
    last += 1
    break if accumulated > target

    accumulated += @s[index, index]
  end

  kept = (0..last)
  @t = DMatrix.join_columns(kept.collect { |i| @t.column(i) })
  @d = DMatrix.join_rows(kept.collect { |i| @d.row(i) })
  @s = DMatrix.join_columns(kept.collect { |i| @s.column(i) })
  # After the column join @s presumably has last + 1 columns, so this guard
  # looks like it can never skip the row join -- kept as in the original.
  @s = DMatrix.join_rows(kept.collect { |i| @s.row(i) }) unless @s.ncol == last
end
#rebuild_if_needed ⇒ Object
43 44 45 |
# File 'lib/clusterer/lsi/lsi.rb', line 43
#
# Runs perform_svd (with its default cutoff) unless @t, @d and @s are all
# already set.
#
# @return [Object, nil] nil when the cached matrices are reused, otherwise
#   whatever perform_svd returns
def rebuild_if_needed
  return if @t && @d && @s

  perform_svd
end
#search(document, threshold = 0.5) ⇒ Object
71 72 73 74 75 76 77 78 79 80 |
# File 'lib/clusterer/lsi/lsi.rb', line 71
#
# Returns the stored documents whose column in +sd+ is similar enough to
# the query.
#
# The query is wrapped in a DMatrix, multiplied by @t, then by +s_inv+, then
# by @s, and transposed. When the global $LINALG is falsy, column 0 of the
# result is taken as the query vector. Every column of +sd+ is then compared
# with that vector via +cosine_similarity+.
#
# @param document [Object] the query; passed straight to DMatrix[], so it
#   presumably has the same layout as one stored document -- TODO confirm
# @param threshold [Numeric] minimum cosine similarity (inclusive) for a
#   document to be returned
# @return [Array] matching entries of +documents+, in column order
def search(document, threshold = 0.5)
  rebuild_if_needed

  query = DMatrix[document]
  query = (((query * @t) * s_inv) * @s).transpose
  query = query.column(0) unless $LINALG

  results = []
  sd.columns.each_with_index do |column, index|
    results << documents[index] if column.cosine_similarity(query) >= threshold
  end
  results
end