Class: Topical::Engine

Inherits:
Object
  • Object
show all
Defined in:
lib/topical/engine.rb

Overview

Main engine for topic modeling

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(clustering_method: :hdbscan, min_cluster_size: 5, min_samples: 3, reduce_dimensions: true, n_components: 50, labeling_method: :term_based, verbose: false, logger: nil, k: nil, **options) ⇒ Engine

Returns a new instance of Engine.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/topical/engine.rb', line 10

# Build an engine and the collaborators it needs for topic modeling.
#
# Every keyword is stored on the instance; how each value is consumed is
# decided by helpers that are not part of this listing (build_clustering_adapter,
# build_labeler, setup_logger), so the notes below only describe what is stored.
#
# @param clustering_method [Symbol] kept as @clustering_method
# @param min_cluster_size [Integer] kept as @min_cluster_size
# @param min_samples [Integer] kept as @min_samples
# @param reduce_dimensions [Boolean] kept as @reduce_dimensions
# @param n_components [Integer] kept as @n_components and handed to the
#   DimensionalityReducer
# @param labeling_method [Symbol] kept as @labeling_method
# @param verbose [Boolean] kept as @verbose and passed to setup_logger
# @param logger [Object, nil] passed to setup_logger; its result becomes @logger
# @param k [Object, nil] when truthy, stored under the :k key of @options
#   (presumably a cluster count for k-based methods -- confirm in the adapter)
# @param options [Hash] any remaining keywords, kept as @options
def initialize(
  clustering_method: :hdbscan,
  min_cluster_size: 5,
  min_samples: 3,
  reduce_dimensions: true,
  n_components: 50,
  labeling_method: :term_based,
  verbose: false,
  logger: nil,
  k: nil,
  **options
)
  # Plain configuration, stored before any helper runs.
  @clustering_method = clustering_method
  @labeling_method = labeling_method
  @min_cluster_size = min_cluster_size
  @min_samples = min_samples
  @reduce_dimensions = reduce_dimensions
  @n_components = n_components
  @verbose = verbose

  @logger = setup_logger(logger, verbose)
  # Only carry k along when the caller actually supplied one.
  @options = k ? options.merge(k: k) : options

  # Collaborators are built in this order on purpose: the helpers may read
  # the instance state assigned above.
  @clustering_adapter = build_clustering_adapter
  @term_extractor = Extractors::TermExtractor.new
  @labeler = build_labeler
  @dimensionality_reducer = DimensionalityReducer.new(n_components: n_components, logger: @logger)

  # No topics exist until #fit has been called.
  @topics = []
end

Instance Attribute Details

#clustering_adapter ⇒ Object (readonly)

Returns the value of attribute clustering_adapter.



8
9
10
# File 'lib/topical/engine.rb', line 8

# Read-only accessor for the clustering adapter.
#
# @return [Object] the value held in @clustering_adapter (assigned in
#   #initialize from build_clustering_adapter)
def clustering_adapter
  @clustering_adapter
end

#labeler ⇒ Object (readonly)

Returns the value of attribute labeler.



8
9
10
# File 'lib/topical/engine.rb', line 8

# Read-only accessor for the topic labeler.
#
# @return [Object] the value held in @labeler (assigned in #initialize
#   from build_labeler)
def labeler
  @labeler
end

#term_extractor ⇒ Object (readonly)

Returns the value of attribute term_extractor.



8
9
10
# File 'lib/topical/engine.rb', line 8

# Read-only accessor for the term extractor.
#
# @return [Object] the value held in @term_extractor (an
#   Extractors::TermExtractor created in #initialize)
def term_extractor
  @term_extractor
end

#topics ⇒ Object (readonly)

Returns the value of attribute topics.



8
9
10
# File 'lib/topical/engine.rb', line 8

# Read-only accessor for the current topics.
#
# @return [Array] the value held in @topics; an empty Array after
#   #initialize, replaced by the result of build_topics during #fit
def topics
  @topics
end

Class Method Details

.load(path) ⇒ Object

Load a model



118
119
120
# File 'lib/topical/engine.rb', line 118

# Load a model by delegating to ModelSerializer.load.
#
# @param path [Object] forwarded unchanged to ModelSerializer.load
#   (presumably a file path String -- confirm against ModelSerializer)
# @return [Object] whatever ModelSerializer.load returns
#   (presumably a restored Engine -- confirm against ModelSerializer)
def self.load(path)
  ModelSerializer.load(path)
end

Instance Method Details

#fit(embeddings:, documents:, metadata: nil) ⇒ Array<Topic>

Fit the model to embeddings and documents

Parameters:

  • embeddings (Array<Array<Float>>)

    Document embeddings

  • documents (Array<String>)

    Document texts

  • metadata (Array<Hash>) (defaults to: nil)

    Optional metadata for each document

Returns:

  • (Array<Topic>)

    Extracted topics

Raises:

  • (ArgumentError)

    If embeddings and documents have different lengths


48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/topical/engine.rb', line 48

# Fit the model to embeddings and documents.
#
# Runs the pipeline in a fixed order: optional dimensionality reduction,
# clustering, topic construction, term extraction, then label generation.
# The inputs, the cluster ids and the resulting topics are kept on the
# instance for later queries.
#
# @param embeddings [Array<Array<Float>>] Document embeddings
# @param documents [Array<String>] Document texts
# @param metadata [Array<Hash>, nil] Optional metadata for each document;
#   when omitted, one empty Hash per document is used
# @return [Array<Topic>] Extracted topics
# @raise [ArgumentError] if embeddings and documents differ in length
def fit(embeddings:, documents:, metadata: nil)
  unless embeddings.length == documents.length
    raise ArgumentError, "Embeddings and documents must have same length"
  end

  @embeddings = embeddings
  @documents = documents
  # Fall back to a distinct empty Hash per document (block form, so the
  # hashes are not shared between documents).
  @metadata = metadata || Array.new(documents.length) { {} }

  @logger.info "Starting topic extraction..."

  # Step 1: Optionally reduce dimensions. Skipped for empty input and when
  # the embeddings already have at most @n_components dimensions.
  working_embeddings = @embeddings
  if @reduce_dimensions && !@embeddings.empty? && @embeddings.first.length > @n_components
    @logger.info "  Reducing dimensions from #{@embeddings.first.length} to #{@n_components}..."
    working_embeddings = @dimensionality_reducer.reduce(@embeddings)
  end

  # Step 2: Cluster embeddings
  @logger.info "  Clustering #{working_embeddings.length} documents..."
  @cluster_ids = @clustering_adapter.fit_predict(working_embeddings)

  # Step 3: Build topics from clusters
  @logger.info "  Building topics from clusters..."
  @topics = build_topics(@cluster_ids)

  # Step 4: Extract terms for each topic
  @logger.info "  Extracting distinctive terms..."
  extract_topic_terms

  # Step 5: Generate labels
  @logger.info "  Generating topic labels..."
  generate_topic_labels

  if @verbose
    # Cluster id -1 is counted as an outlier in the summary line.
    n_noise = @cluster_ids.count(-1)
    @logger.info "Found #{@topics.length} topics (plus #{n_noise} outliers)"
  end

  @topics
end

#get_topic(topic_id) ⇒ Object



101
102
103
# File 'lib/topical/engine.rb', line 101

# Look up a topic by its id.
#
# @param topic_id [Object] id compared against each topic's #id with ==
# @return [Object, nil] the first topic in @topics whose id matches, or nil
#   when none does
def get_topic(topic_id)
  @topics.find do |topic|
    topic.id == topic_id
  end
end

#outliers ⇒ Object



105
106
107
108
109
110
# File 'lib/topical/engine.rb', line 105

# Documents whose cluster id is -1.
#
# @return [Array] the entries of @documents whose position in @cluster_ids
#   holds -1, in document order; an empty Array when @cluster_ids is not set
def outliers
  return [] unless @cluster_ids

  @documents.select.with_index { |_doc, position| @cluster_ids[position] == -1 }
end

#save(path) ⇒ Object

Save the model



113
114
115
# File 'lib/topical/engine.rb', line 113

# Save the model by delegating to ModelSerializer.save with this engine.
#
# @param path [Object] forwarded unchanged to ModelSerializer.save
#   (presumably a file path String -- confirm against ModelSerializer)
# @return [Object] whatever ModelSerializer.save returns
def save(path)
  ModelSerializer.save(self, path)
end

#transform(embeddings:, documents: nil) ⇒ Object

Transform new documents using fitted model



89
90
91
92
93
94
95
96
97
98
99
# File 'lib/topical/engine.rb', line 89

# Assign new embeddings to topics using the fitted model.
#
# Uses the clustering adapter's approximate_predict when it offers one;
# otherwise falls back to assign_to_nearest_topic.
#
# @param embeddings [Object] embeddings to assign; forwarded unchanged
# @param documents [Object, nil] accepted but not used by this method
# @return [Object] the result of approximate_predict or of
#   assign_to_nearest_topic, whichever ran
# @raise [RuntimeError] if @topics is empty (fit has not produced topics)
def transform(embeddings:, documents: nil)
  raise "Must call fit before transform" if @topics.empty?

  adapter = @clustering_adapter
  return adapter.approximate_predict(embeddings) if adapter.respond_to?(:approximate_predict)

  # No approximate prediction available: use the nearest-topic fallback.
  assign_to_nearest_topic(embeddings: embeddings)
end