Class: ActiveMatrix::Metrics

Inherits:
Object
  • Object
show all
Includes:
Singleton
Defined in:
lib/active_matrix/metrics.rb

Overview

Metrics collection for Matrix agent operations. Provides structured metrics that can be exported to monitoring systems.

Examples:

Getting agent metrics

metrics = ActiveMatrix::Metrics.instance.get_agent_metrics('agent_123')
puts metrics[:overall_success_rate]

Getting health summary

summary = ActiveMatrix::Metrics.instance.get_health_summary
puts "Healthy agents: #{summary[:healthy_agents]}"

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Metrics

Returns a new instance of Metrics.



21
22
23
24
25
# File 'lib/active_matrix/metrics.rb', line 21

# Creates the two empty metric stores (per-component and per-operation)
# and then registers the notification subscribers.
def initialize
  # Keyed by "<agent_id>:<component>".
  @component_metrics = Concurrent::Hash.new
  # Keyed by "<agent_id>:<component>:<operation>".
  @metrics = Concurrent::Hash.new
  setup_notification_subscribers
end

Instance Method Details

#get_agent_metrics(agent_id) ⇒ Hash

Get metrics for a specific agent instance

Parameters:

  • agent_id (String)

    Agent identifier

Returns:

  • (Hash)

    Agent metrics including components, success rates, and health status



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# File 'lib/active_matrix/metrics.rb', line 85

# Aggregates every recorded operation of one agent into a nested report.
#
# @param agent_id [String] agent identifier; operation keys are matched on
#   the "<agent_id>:" prefix
# @return [Hash] empty when nothing is recorded for the agent, otherwise the
#   agent totals, overall success rate (percent), a per-component breakdown
#   and the health status returned by +calculate_agent_health+
def get_agent_metrics(agent_id)
  prefix = "#{agent_id}:"
  scoped = @metrics.select { |key, _| key.start_with?(prefix) }
  return {} if scoped.empty?

  components = {}
  grand_total = 0
  grand_successes = 0
  grand_errors = 0

  scoped.each do |key, metric|
    # Keys have the shape "<agent_id>:<component>:<operation>".
    _agent, component, operation = key.split(':', 3)

    bucket = (components[component] ||= {
      operations: {},
      total_count: 0,
      success_count: 0,
      error_count: 0
    })

    counts = {
      total_count: metric[:total_count].value,
      success_count: metric[:success_count].value,
      error_count: metric[:error_count].value
    }

    # Roll the operation counters up into the component and agent totals.
    counts.each { |field, amount| bucket[field] += amount }
    grand_total += counts[:total_count]
    grand_successes += counts[:success_count]
    grand_errors += counts[:error_count]

    bucket[:operations][operation] = counts.merge(
      success_rate: calculate_success_rate(metric),
      avg_duration_ms: metric[:duration_stats][:avg].value,
      p95_duration_ms: metric[:duration_stats][:p95].value,
      last_operation_at: metric[:last_operation_at],
      last_error_at: metric[:last_error_at],
      error_breakdown: serialize_error_breakdown(metric[:error_breakdown])
    )
  end

  overall_rate =
    if grand_total.positive?
      (grand_successes.to_f / grand_total * 100).round(2)
    else
      0
    end

  {
    agent_id: agent_id,
    total_operations: grand_total,
    total_successes: grand_successes,
    total_errors: grand_errors,
    overall_success_rate: overall_rate,
    components: components,
    health_status: calculate_agent_health(grand_total, grand_successes)
  }
end

#get_component_metrics(agent_id, component) ⇒ Hash

Get metrics for a specific component

Parameters:

  • agent_id (String)

    Agent identifier

  • component (String)

    Component name

Returns:

  • (Hash)

    Component metrics



148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# File 'lib/active_matrix/metrics.rb', line 148

# Reports the aggregated counters of a single component plus a summary of
# each of its operations.
#
# @param agent_id [String] agent identifier
# @param component [String] component name
# @return [Hash] the result of +default_component_metrics+ when nothing has
#   been recorded for the component, otherwise its counters, success rate,
#   duration statistics and per-operation summaries keyed by operation name
def get_component_metrics(agent_id, component)
  component_key = "#{agent_id}:#{component}"
  stats = @component_metrics[component_key]
  return default_component_metrics if stats.nil?

  operation_prefix = "#{component_key}:"
  matching = @metrics.select { |key, _| key.start_with?(operation_prefix) }
  durations = stats[:duration_stats]

  {
    component: component,
    agent_id: agent_id,
    total_operations: stats[:total_count].value,
    success_count: stats[:success_count].value,
    error_count: stats[:error_count].value,
    success_rate: calculate_success_rate(stats),
    avg_duration_ms: durations[:avg].value,
    p95_duration_ms: durations[:p95].value,
    # Strip the "<agent_id>:<component>:" part so entries are keyed by operation.
    operations: matching.to_h { |key, metric| [key.split(':', 3).last, operation_summary(metric)] }
  }
end

#get_health_summary ⇒ Hash

Get health summary for all agents

Returns:

  • (Hash)

    Summary of agent health across the system



223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# File 'lib/active_matrix/metrics.rb', line 223

# Summarises the health of every agent that has recorded operations.
#
# @return [Hash] agent counts per health status, the total operation count,
#   the rate returned by +calculate_overall_success_rate+, and a condensed
#   entry for each agent
def get_health_summary
  # The agent id is everything before the first ':' of an operation key.
  agent_ids = @metrics.keys.map { |key| key.split(':', 2).first }.uniq
  reports = agent_ids.map { |id| get_agent_metrics(id) }

  # Tally the statuses in one pass; statuses never seen count as zero.
  status_counts = Hash.new(0)
  reports.each { |report| status_counts[report[:health_status]] += 1 }

  {
    total_agents: reports.length,
    healthy_agents: status_counts[:healthy],
    degraded_agents: status_counts[:degraded],
    unhealthy_agents: status_counts[:unhealthy],
    total_operations: reports.sum { |report| report[:total_operations] },
    overall_success_rate: calculate_overall_success_rate(reports),
    agents: reports.map do |report|
      {
        agent_id: report[:agent_id],
        health_status: report[:health_status],
        success_rate: report[:overall_success_rate],
        total_operations: report[:total_operations]
      }
    end
  }
end

#recent_errors(agent_id, limit: 20) ⇒ Array<Hash>

Get recent errors

Parameters:

  • agent_id (String)

    Agent identifier

  • limit (Integer) (defaults to: 20)

    Maximum number of errors to return

Returns:

  • (Array<Hash>)

    Recent errors sorted by timestamp (newest first)



197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/active_matrix/metrics.rb', line 197

# Collects the failed entries from the recent-operation history of every
# operation belonging to an agent.
#
# @param agent_id [String] agent identifier
# @param limit [Integer] maximum number of errors to return
# @return [Array<Hash>] error entries (timestamp, component, operation,
#   duration_ms, metadata), newest first
def recent_errors(agent_id, limit: 20)
  prefix = "#{agent_id}:"

  failures = @metrics.flat_map do |key, metric|
    next [] unless key.start_with?(prefix)

    # Keys have the shape "<agent_id>:<component>:<operation>".
    _agent, component, operation = key.split(':', 3)

    failed = metric[:recent_operations].to_a.select { |entry| entry[:status] == 'error' }
    failed.map do |entry|
      {
        timestamp: entry[:timestamp],
        component: component,
        operation: operation,
        duration_ms: entry[:duration_ms],
        metadata: entry[:metadata]
      }
    end
  end

  # Negated numeric timestamp gives a descending (newest-first) order.
  failures.sort_by { |failure| -failure[:timestamp].to_f }.first(limit)
end

#record_operation(operation, component:, agent_id:, status:, duration_ms:, error_class: nil, **metadata) ⇒ Object

Record operation metrics

rubocop:disable Metrics/ParameterLists

Parameters:

  • operation (Symbol, String)

    Operation name

  • component (String)

    Component name (e.g., ‘MessageDispatcher’)

  • agent_id (String)

    Agent identifier

  • status (String)

    ‘success’ or ‘error’

  • duration_ms (Float)

    Operation duration in milliseconds

  • error_class (String, nil) (defaults to: nil)

    Error class name if status is ‘error’

  • metadata (Hash)

    Additional metadata (user_id, room_id, etc.)



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/active_matrix/metrics.rb', line 37

# Records one operation outcome at both the component and the operation level.
#
# @param operation [Symbol, String] operation name
# @param component [String] component name
# @param agent_id [String] agent identifier
# @param status [String] 'success' or 'error'; any other value only updates
#   the total count, duration stats and recent-operation history
# @param duration_ms [Float] operation duration in milliseconds
# @param error_class [String, nil] error class name when status is 'error'
# @param metadata [Hash] additional metadata; only :error_type, :user_id and
#   :room_id (plus the given error_class) are kept in the recent history
# @return [Hash, nil] the history entry evicted to keep the window at 100
#   entries, or nil when nothing was evicted
def record_operation(operation, component:, agent_id:, status:, duration_ms:, error_class: nil, **metadata)
  component_key = "#{agent_id}:#{component}"
  operation_key = "#{component_key}:#{operation}"

  # Initialize metrics if needed
  @component_metrics[component_key] ||= initialize_component_metrics(component, agent_id)
  @metrics[operation_key] ||= initialize_operation_metrics(operation, component, agent_id)

  # Update component-level metrics
  update_component_metrics(@component_metrics[component_key], status, duration_ms)

  # Update operation-level metrics
  metric = @metrics[operation_key]
  metric[:total_count].increment
  metric[:last_operation_at] = Time.current

  # Always update duration stats regardless of status
  update_duration_stats(metric[:duration_stats], duration_ms)

  case status
  when 'success'
    metric[:success_count].increment
  when 'error'
    metric[:error_count].increment
    metric[:last_error_at] = Time.current

    # Prefer the explicit error class, then a caller-supplied :error_type,
    # and only then fall back to the generic bucket.
    error_type = error_class || metadata[:error_type] || 'unknown'
    metric[:error_breakdown][error_type] ||= Concurrent::AtomicFixnum.new(0)
    metric[:error_breakdown][error_type].increment
  end

  # Track recent operations (sliding window); keep only whitelisted metadata.
  metric[:recent_operations] << {
    timestamp: Time.current,
    status: status,
    duration_ms: duration_ms,
    metadata: metadata.merge(error_class: error_class).slice(:error_type, :error_class, :user_id, :room_id)
  }

  # Keep only last 100 operations
  metric[:recent_operations].shift if metric[:recent_operations].size > 100
end

#reset! ⇒ Object

Reset all metrics (useful for testing)



247
248
249
250
# File 'lib/active_matrix/metrics.rb', line 247

# Empties both the operation-level and the component-level stores.
#
# @return [Object] the (now empty) component-level store
def reset!
  # `.last` keeps the return value the same as clearing the component store last.
  [@metrics, @component_metrics].each(&:clear).last
end

#reset_agent!(agent_id) ⇒ Object

Reset metrics for specific agent

Parameters:

  • agent_id (String)

    Agent identifier



255
256
257
258
259
# File 'lib/active_matrix/metrics.rb', line 255

# Drops every operation-level and component-level entry whose key starts
# with "<agent_id>:" and logs the reset at info level.
#
# @param agent_id [String] agent identifier
# @return [Object] whatever the logger's +info+ call returns
def reset_agent!(agent_id)
  prefix = "#{agent_id}:"

  [@metrics, @component_metrics].each do |store|
    store.delete_if { |key, _| key.start_with?(prefix) }
  end

  ActiveMatrix.logger.info("Reset metrics for Matrix agent: #{agent_id}")
end

#top_operations_by_volume(agent_id, limit: 10) ⇒ Array<Hash>

Get top operations by volume

Parameters:

  • agent_id (String)

    Agent identifier

  • limit (Integer) (defaults to: 10)

    Maximum number of operations to return

Returns:

  • (Array<Hash>)

    Top operations sorted by count



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/active_matrix/metrics.rb', line 175

# Ranks an agent's operations by how many times each was recorded.
#
# @param agent_id [String] agent identifier
# @param limit [Integer] maximum number of operations to return
# @return [Array<Hash>] entries with component, operation, count,
#   success_rate and avg_duration_ms, highest count first
def top_operations_by_volume(agent_id, limit: 10)
  prefix = "#{agent_id}:"

  rows = @metrics.each_with_object([]) do |(key, metric), acc|
    next unless key.start_with?(prefix)

    # Keys have the shape "<agent_id>:<component>:<operation>".
    _agent, component, operation = key.split(':', 3)
    acc << {
      component: component,
      operation: operation,
      count: metric[:total_count].value,
      success_rate: calculate_success_rate(metric),
      avg_duration_ms: metric[:duration_stats][:avg].value
    }
  end

  # Negated count sorts in descending order.
  rows.sort_by { |row| -row[:count] }.first(limit)
end