Module: Topical::Metrics

Extended by:
Metrics
Included in:
Metrics
Defined in:
lib/topical/metrics.rb

Instance Method Summary

#compute_coherence(terms, documents, top_n: 10) ⇒ Object
Compute UMass coherence for topic quality.

#compute_coverage(topics, total_documents) ⇒ Object
Compute coverage: the fraction of documents assigned to topics rather than left as outliers.

#compute_distinctiveness(topic, other_topics) ⇒ Object
Compute how distinct a topic is from others.

#compute_diversity(topics) ⇒ Object
Compute diversity across all topics.

#compute_silhouette_score(topic, all_topics, embeddings) ⇒ Object
Silhouette score for cluster quality.

Instance Method Details

#compute_coherence(terms, documents, top_n: 10) ⇒ Object

Compute UMass coherence for topic quality. Higher coherence indicates a more interpretable topic.

# File 'lib/topical/metrics.rb', line 11

def compute_coherence(terms, documents, top_n: 10)
  return 0.0 if terms.empty? || documents.empty?
  
  # Use top N terms
  eval_terms = terms.first(top_n)
  return 0.0 if eval_terms.length < 2
  
  # Create document term matrix for co-occurrence
  doc_term_counts = count_cooccurrences(eval_terms, documents)
  
  # Compute UMass coherence
  coherence_sum = 0.0
  pairs_count = 0
  
  eval_terms.each_with_index do |term_i, i|
    eval_terms.each_with_index do |term_j, j|
      next unless j < i  # visit each unordered pair once
      
      # P(term_i, term_j) = co-occurrence count
      cooccur = doc_term_counts["#{term_i},#{term_j}"] || 0
      # P(term_j) = document frequency
      doc_freq_j = doc_term_counts[term_j] || 0
      
      if cooccur > 0 && doc_freq_j > 0
        # UMass: log((cooccur + 1) / doc_freq_j)
        coherence_sum += Math.log((cooccur + 1.0) / doc_freq_j)
        pairs_count += 1
      end
    end
  end
  
  return 0.0 if pairs_count == 0
  
  # Normalize by number of pairs
  coherence = coherence_sum / pairs_count
  
  # Transform to 0-1 range (coherence is typically negative)
  # More negative = less coherent, so we reverse and bound
  normalized = 1.0 / (1.0 + Math.exp(-coherence))
  normalized
end
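
A minimal usage sketch, assuming the gem is installed and that documents are plain strings. The corpus and terms below are hypothetical, and tokenization is handled by the private count_cooccurrences helper, whose behavior is not shown on this page:

require 'topical'

# Hypothetical corpus and topic terms, ordered most probable first.
documents = [
  "the cat sat on the mat",
  "the cat chased the mouse",
  "dogs and cats make good pets"
]
terms = %w[cat mat mouse]

score = Topical::Metrics.compute_coherence(terms, documents, top_n: 10)
puts score  # typically in (0, 1); higher suggests a more interpretable topic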

#compute_coverage(topics, total_documents) ⇒ Object

Compute coverage: the fraction of documents assigned to topics rather than left as outliers.

# File 'lib/topical/metrics.rb', line 104

def compute_coverage(topics, total_documents)
  return 0.0 if total_documents == 0
  
  docs_in_topics = topics.sum(&:size)
  docs_in_topics.to_f / total_documents
end
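
A minimal sketch of both call styles implied by the Extended by / Included in lines at the top of this page. compute_coverage only needs topics that respond to #size, so plain arrays of document IDs serve as hypothetical stand-ins:

require 'topical'

topics = [%w[d1 d2 d3], %w[d4 d5]]

# Module-level call (Extended by: Metrics):
puts Topical::Metrics.compute_coverage(topics, 10)  # => 0.5

# Mixed-in call (Included in: Metrics):
class Report
  include Topical::Metrics
end
puts Report.new.compute_coverage(topics, 10)        # => 0.5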

#compute_distinctiveness(topic, other_topics) ⇒ Object

Compute how distinct a topic is from others

# File 'lib/topical/metrics.rb', line 54

def compute_distinctiveness(topic, other_topics)
  return 1.0 if other_topics.empty?
  
  topic_terms = Set.new(topic.terms.first(20))
  
  # Compare with other topics
  overlaps = other_topics.map do |other|
    next if other.id == topic.id
    
    other_terms = Set.new(other.terms.first(20))
    overlap = (topic_terms & other_terms).size.to_f
    
    # Jaccard similarity
    union_size = (topic_terms | other_terms).size
    union_size > 0 ? overlap / union_size : 0.0
  end.compact
  
  return 1.0 if overlaps.empty?
  
  # Distinctiveness = 1 - average overlap
  1.0 - (overlaps.sum / overlaps.length)
end
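
A minimal sketch with a hypothetical Topic struct; the library's real topic class is not shown on this page, but any object responding to #id and #terms fits the code above:

require 'topical'

Topic = Struct.new(:id, :terms)

a = Topic.new(1, %w[ruby rails gem bundler])
b = Topic.new(2, %w[python django pip])
c = Topic.new(3, %w[ruby sinatra gem rack])

# a shares {ruby, gem} with c but nothing with b:
# average Jaccard overlap = (0.0 + 2/6) / 2, so distinctiveness is about 0.83.
puts Topical::Metrics.compute_distinctiveness(a, [b, c])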

#compute_diversity(topics) ⇒ Object

Compute diversity across all topics

# File 'lib/topical/metrics.rb', line 78

def compute_diversity(topics)
  return 0.0 if topics.length < 2
  
  # Collect all term sets
  term_sets = topics.map { |t| Set.new(t.terms.first(20)) }
  
  # Compute pairwise Jaccard distances
  distances = []
  term_sets.each_with_index do |set_i, i|
    term_sets.each_with_index do |set_j, j|
      next unless j > i  # Only upper triangle
      
      intersection = (set_i & set_j).size.to_f
      union = (set_i | set_j).size.to_f
      
      # Jaccard distance = 1 - Jaccard similarity
      distance = union > 0 ? 1.0 - (intersection / union) : 1.0
      distances << distance
    end
  end
  
  # Average distance = diversity
  distances.sum / distances.length
end
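
A minimal sketch with hypothetical topics; the first two share terms, so diversity falls below the 1.0 that fully disjoint term lists would yield:

require 'topical'

Topic = Struct.new(:terms)

topics = [
  Topic.new(%w[ruby gem bundler]),
  Topic.new(%w[ruby rails gem]),
  Topic.new(%w[python pip venv])
]

# Pairwise Jaccard distances are 0.5, 1.0, and 1.0; the average is about 0.83.
puts Topical::Metrics.compute_diversity(topics)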

#compute_silhouette_score(topic, all_topics, embeddings) ⇒ Object

Silhouette score for cluster quality

# File 'lib/topical/metrics.rb', line 112

def compute_silhouette_score(topic, all_topics, embeddings)
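  # NOTE: the embeddings parameter is not referenced in this listing;
  # distances are computed from each topic's own #embeddings.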
  return 0.0 if topic.embeddings.empty?
  
  silhouettes = []
  
  topic.embeddings.each_with_index do |embedding, idx|
    # a(i) = average distance to other points in same cluster
    if topic.embeddings.length > 1
      a_i = topic.embeddings.each_with_index
                .reject { |_, j| j == idx }
                .map { |other, _| euclidean_distance(embedding, other) }
                .sum.to_f / (topic.embeddings.length - 1)
    else
      a_i = 0.0
    end
    
    # b(i) = minimum average distance to points in other clusters
    b_values = all_topics.reject { |t| t.id == topic.id }.map do |other_topic|
      next if other_topic.embeddings.empty?
      
      avg_dist = other_topic.embeddings
                           .map { |other| euclidean_distance(embedding, other) }
                           .sum.to_f / other_topic.embeddings.length
      avg_dist
    end.compact
    
    b_i = b_values.min || a_i
    
    # Silhouette coefficient
    if a_i == 0 && b_i == 0
      s_i = 0.0
    else
      s_i = (b_i - a_i) / [a_i, b_i].max
    end
    
    silhouettes << s_i
  end
  
  # Average silhouette score for topic
  silhouettes.sum / silhouettes.length
end
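
A minimal sketch with 2-D stand-in embeddings and a hypothetical Topic struct. Since the embeddings parameter is unread in the listing above, nil is passed for it, and the private euclidean_distance helper (not shown on this page) is assumed to accept arrays of floats:

require 'topical'

Topic = Struct.new(:id, :embeddings)

# Two tight, well-separated 2-D clusters.
a = Topic.new(1, [[0.0, 0.0], [0.1, 0.0], [0.0, 0.1]])
b = Topic.new(2, [[5.0, 5.0], [5.1, 5.0]])

score = Topical::Metrics.compute_silhouette_score(a, [a, b], nil)
puts score  # close to 1.0 for compact, well-separated clusters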