Class: Topical::DimensionalityReducer

Inherits:
Object
  • Object
show all
Defined in:
lib/topical/dimensionality_reducer.rb

Overview

Handles dimensionality reduction for embeddings using UMAP (provided by the ClusterKit gem).

Instance Method Summary collapse

Constructor Details

#initialize(n_components: 50, logger: nil) ⇒ DimensionalityReducer



8
9
10
11
# File 'lib/topical/dimensionality_reducer.rb', line 8

# Build a reducer.
#
# @param n_components [Integer] number of dimensions requested from the reduction
# @param logger [Logger, nil] receives warnings and info messages; when
#   omitted (or falsy) a logger writing to the null device is used instead
def initialize(n_components: 50, logger: nil)
  @n_components = n_components
  @logger =
    if logger
      logger
    else
      # Silent stand-in so the rest of the class can log unconditionally.
      Logger.new(IO::NULL, level: Logger::FATAL)
    end
end

Instance Method Details

#reduce(embeddings) ⇒ Array<Array<Float>>

Reduces the dimensionality of the embeddings when they have more than n_components dimensions; otherwise returns them unchanged.



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# File 'lib/topical/dimensionality_reducer.rb', line 16

# Reduce the dimensionality of embeddings with UMAP when they are wider than
# the configured number of components.
#
# Embeddings reported as invalid by validate_embeddings_for_umap are left out
# of the UMAP fit and come back as all-zero rows, so the result keeps one row
# per input embedding.
#
# The input is returned unchanged when it is empty, when the first embedding
# already has at most @n_components dimensions, when ClusterKit cannot be
# loaded, or when the reduction raises a StandardError (the last two cases
# are logged as warnings).
#
# @param embeddings [Array<Array<Float>>] one vector per item
# @return [Array<Array<Float>>] the reduced vectors, or the original
#   embeddings when no reduction was performed
def reduce(embeddings)
  return embeddings if embeddings.empty?
  return embeddings if embeddings.first.length <= @n_components

  begin
    # Loaded lazily so that a missing ClusterKit degrades to a logged no-op.
    require 'clusterkit'

    # Validate embeddings before UMAP
    valid_embeddings, invalid_indices = validate_embeddings_for_umap(embeddings)

    if valid_embeddings.empty?
      # Handled by the generic rescue below: logged, and the input is returned.
      raise "No valid embeddings for dimensionality reduction. " \
            "All embeddings contain invalid values (NaN, Infinity, or non-numeric)."
    end

    has_invalid = invalid_indices.any?
    if has_invalid
      @logger.warn "  Warning: #{invalid_indices.size} embeddings with invalid values removed"
    end

    # Adjust parameters based on data size: the component count is capped at
    # 50 and both values are kept below the number of valid samples.
    # NOTE(review): with a single valid sample both values become 0 — confirm
    # how ClusterKit handles that; any error it raises is rescued below.
    n_samples = valid_embeddings.size
    n_components = [@n_components, n_samples - 1, 50].min
    n_neighbors = [15, n_samples - 1].min

    if n_components != @n_components
      @logger.info "  Adjusted n_components to #{n_components} (was #{@n_components}) for #{n_samples} samples"
    end

    umap = ClusterKit::Dimensionality::UMAP.new(
      n_components: n_components,
      n_neighbors: n_neighbors,
      random_seed: 42 # fixed seed
    )

    reduced = umap.fit_transform(valid_embeddings)

    if has_invalid
      # Re-expand to one row per original embedding. A hash lookup built once
      # keeps this linear instead of scanning invalid_indices for every row.
      invalid_lookup = invalid_indices.each_with_object({}) { |idx, seen| seen[idx] = true }
      next_valid = 0

      Array.new(embeddings.size) do |i|
        if invalid_lookup.key?(i)
          # Use zeros for invalid embeddings (they'll be outliers anyway)
          Array.new(n_components, 0.0)
        else
          row = reduced[next_valid]
          next_valid += 1
          row
        end
      end
    else
      reduced
    end
  rescue LoadError
    @logger.warn "Warning: Dimensionality reduction requires ClusterKit. Using original embeddings."
    embeddings
  rescue => e
    @logger.warn "Warning: Dimensionality reduction failed: #{e.message}"
    embeddings
  end
end