Class: A2A::Monitoring::Alerting

Inherits:
  • Object
Defined in:
lib/a2a/monitoring/alerting.rb

Overview

Alerting system for A2A monitoring

Provides configurable alerting based on metrics thresholds, error rates, and system health indicators.

Constant Summary collapse

Alert severities

SEVERITY_INFO = :info
SEVERITY_WARNING = :warning
SEVERITY_ERROR = :error
SEVERITY_CRITICAL = :critical

Alert states

STATE_FIRING = :firing
STATE_RESOLVED = :resolved

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(config = {}) ⇒ Alerting

Initialize alerting system

Parameters:

  • config (defaults to: {})

    Alerting configuration; merged over the values returned by #default_config



31
32
33
34
35
36
37
# File 'lib/a2a/monitoring/alerting.rb', line 31

# Build an alerting instance that starts with no rules, no channels and no
# tracked alerts.
#
# @param config [Hash] overrides merged on top of the hash returned by
#   #default_config (keys supplied here win)
def initialize(config = {})
  @mutex = Mutex.new
  @config = default_config.merge(config)
  @active_alerts = {}
  @channels = []
  @rules = []
end

Instance Attribute Details

#active_alerts ⇒ Object (readonly)

Returns the value of attribute active_alerts.



25
26
27
# File 'lib/a2a/monitoring/alerting.rb', line 25

# Reader for the @active_alerts instance variable.
#
# #initialize sets it to an empty Hash and #fire_alert stores alert hashes in
# it keyed by alert key. The object is returned as-is: no copy is made and
# @mutex is not taken here.
def active_alerts
  @active_alerts
end

#channels ⇒ Object (readonly)

Returns the value of attribute channels.



25
26
27
# File 'lib/a2a/monitoring/alerting.rb', line 25

# Reader for the @channels instance variable.
#
# #initialize sets it to an empty Array and #add_channel appends to it. The
# object is returned as-is: no copy is made and @mutex is not taken here.
def channels
  @channels
end

#rules ⇒ Object (readonly)

Returns the value of attribute rules.



25
26
27
# File 'lib/a2a/monitoring/alerting.rb', line 25

# Reader for the @rules instance variable.
#
# #initialize sets it to an empty Array and #add_rule appends rule hashes to
# it. The object is returned as-is: no copy is made and @mutex is not taken
# here.
def rules
  @rules
end

Instance Method Details

#add_channel(channel) ⇒ Object

Add an alert channel

Parameters:

  • channel — Alert channel (webhook, email, etc.)



68
69
70
# File 'lib/a2a/monitoring/alerting.rb', line 68

# Register a notification channel.
#
# The append is performed while holding @mutex.
#
# @param channel [Object] channel to register; #send_alert_notification calls
#   +send_alert(alert)+ on every registered channel
# @return [Array] the channel list after the append
def add_channel(channel)
  @mutex.synchronize do
    @channels.push(channel)
  end
end

#add_rule(name:, metric:, condition:, severity: SEVERITY_WARNING, description: nil, **tags) ⇒ Object

Add an alert rule

Parameters:

  • name — Rule name

  • metric — Metric to monitor

  • condition — Alert condition

  • severity (defaults to: SEVERITY_WARNING)

    Alert severity

  • description (defaults to: nil)

    Alert description

  • tags — Additional tags



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/a2a/monitoring/alerting.rb', line 48

# Register an alert rule.
#
# The rule is stored as a Hash that also carries bookkeeping fields
# (+:created_at+, +:last_evaluated+, +:evaluation_count+). The append to the
# rule list happens while holding @mutex.
#
# @param name [Object] rule name
# @param metric [Object] metric name or pattern the rule watches
# @param condition [Hash] alert condition
# @param severity [Symbol] alert severity
# @param description [String, nil] alert description; when nil/false the text
#   "Alert for <metric>" is used
# @param tags [Hash] any additional keyword arguments, stored under +:tags+
# @return [Array] the rule list after the append
def add_rule(name:, metric:, condition:, severity: SEVERITY_WARNING, description: nil, **tags)
  summary = description || "Alert for #{metric}"

  entry = {
    name: name,
    metric: metric,
    condition: condition,
    severity: severity,
    description: summary,
    tags: tags,
    created_at: Time.now,
    last_evaluated: nil,
    evaluation_count: 0
  }

  @mutex.synchronize do
    @rules.push(entry)
  end
end

#cleanup_resolved_alerts(max_age: 3600) ⇒ Object

Remove resolved alerts that were resolved longer ago than the specified age

Parameters:

  • max_age (defaults to: 3600)

    Maximum age in seconds (default: 1 hour)



112
113
114
115
116
117
118
119
120
# File 'lib/a2a/monitoring/alerting.rb', line 112

# Drop resolved alerts whose +:resolved_at+ is older than +max_age+ seconds.
#
# Alerts in any other state are kept. The removal runs while holding @mutex.
#
# @param max_age [Numeric] maximum age in seconds (default: one hour)
# @return [Hash, nil] the alert hash when at least one entry was removed, nil
#   when nothing was removed (Hash#reject! semantics)
def cleanup_resolved_alerts(max_age: 3600)
  expiry = Time.now - max_age

  @mutex.synchronize do
    @active_alerts.reject! do |_key, entry|
      next false unless entry[:state] == STATE_RESOLVED

      entry[:resolved_at] < expiry
    end
  end
end

#count_alerts_by_severity ⇒ Hash (private)

Count alerts by severity

Returns:

  • Alert counts by severity



305
306
307
308
309
310
311
312
313
314
315
# File 'lib/a2a/monitoring/alerting.rb', line 305

# Tally the currently firing alerts per severity.
#
# Alerts in any state other than STATE_FIRING are ignored. Does not take
# @mutex itself.
#
# @return [Hash] severity => count; the hash has a default of 0, so looking up
#   a severity with no firing alerts yields 0
def count_alerts_by_severity
  firing = @active_alerts.each_value.select { |entry| entry[:state] == STATE_FIRING }

  firing.each_with_object(Hash.new(0)) do |entry, tally|
    tally[entry[:severity]] += 1
  end
end

#default_config ⇒ Hash (private)

Default configuration

Returns:

  • Default configuration



128
129
130
131
132
133
134
# File 'lib/a2a/monitoring/alerting.rb', line 128

# Baseline configuration; #initialize merges caller overrides on top of it.
#
# A fresh Hash is built on every call.
#
# @return [Hash] the default settings
def default_config
  {
    # NOTE(review): not read anywhere in the visible code; presumably seconds between evaluations — confirm.
    evaluation_interval: 60,
    # Seconds a firing alert is suppressed before #fire_alert fires it again.
    alert_timeout: 300,
    # NOTE(review): not read anywhere in the visible code; presumably a cap on tracked alerts — confirm.
    max_alerts: 1_000
  }
end

#evaluate_condition(condition, value) ⇒ Boolean (private)

Evaluate alert condition

Parameters:

  • condition — Alert condition

  • value — Current value

Returns:

  • Whether condition is met



226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/a2a/monitoring/alerting.rb', line 226

# Decide whether +value+ satisfies an alert condition.
#
# The operator is read from +condition[:operator]+ (falling back to
# +condition[:op]+) and may be a word or a symbol form, given as String or
# Symbol: gt / >, gte / >=, lt / <, lte / <=, eq / ==, ne / !=.
#
# @param condition [Hash] holds the operator and +:threshold+
# @param value [Object] current metric value, compared against the threshold
# @return [Boolean] comparison result; false for a missing or unknown operator
def evaluate_condition(condition, value)
  limit = condition[:threshold]

  comparator =
    case (condition[:operator] || condition[:op]).to_s
    when "gt", ">" then :>
    when "gte", ">=" then :>=
    when "lt", "<" then :<
    when "lte", "<=" then :<=
    when "eq", "==" then :==
    when "ne", "!=" then :!=
    end

  # Unknown or missing operators never fire.
  return false unless comparator

  value.public_send(comparator, limit)
end

#evaluate_metric_against_rule(rule, metric_key, metric_data) ⇒ Object (private)

Evaluate a specific metric against a rule

Parameters:

  • rule — Alert rule

  • metric_key — Metric key

  • metric_data — Metric data



177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
# File 'lib/a2a/monitoring/alerting.rb', line 177

# Check one metric against one rule and fire or resolve the matching alert.
#
# The alert is keyed by "<rule name>_<metric key>". Nothing happens when no
# value can be extracted from the metric data.
#
# @param rule [Hash] alert rule (reads +:name+ and +:condition+)
# @param metric_key [Object] key of the metric being checked
# @param metric_data [Hash] metric data handed to #extract_metric_value
# @return [Object, nil] result of #fire_alert or #resolve_alert; nil when no
#   value was extracted
def evaluate_metric_against_rule(rule, metric_key, metric_data)
  condition = rule[:condition]

  # The condition may name which field of the metric to read.
  value = extract_metric_value(metric_data, condition[:field])
  return unless value

  alert_key = "#{rule[:name]}_#{metric_key}"
  return resolve_alert(alert_key) unless evaluate_condition(condition, value)

  fire_alert(alert_key, rule, metric_key, value)
end

#evaluate_rule(rule, metrics) ⇒ Object (private)

Evaluate a single alert rule

Parameters:

  • rule — Alert rule

  • metrics — Current metrics



141
142
143
144
145
146
147
148
149
150
151
# File 'lib/a2a/monitoring/alerting.rb', line 141

# Evaluate one rule against a metrics snapshot.
#
# Stamps the rule with the evaluation time, bumps its evaluation counter, then
# checks every metric selected by #find_matching_metrics for the rule's
# +:metric+ value.
#
# @param rule [Hash] alert rule; its +:last_evaluated+ and +:evaluation_count+
#   entries are updated in place
# @param metrics [Hash] current metrics
# @return [Hash] the matching metrics that were checked
def evaluate_rule(rule, metrics)
  rule[:last_evaluated] = Time.now
  rule[:evaluation_count] += 1

  find_matching_metrics(rule[:metric], metrics).each_pair do |key, data|
    evaluate_metric_against_rule(rule, key, data)
  end
end

#evaluate_rules(metrics) ⇒ Object

Evaluate all alert rules against current metrics

Parameters:

  • metrics — Current metrics snapshot



76
77
78
79
80
81
82
# File 'lib/a2a/monitoring/alerting.rb', line 76

# Run every registered rule against a metrics snapshot.
#
# The whole pass executes while holding @mutex, which includes any channel
# notifications triggered through #evaluate_rule.
#
# @param metrics [Hash] current metrics snapshot
# @return [Array] the rule list
def evaluate_rules(metrics)
  @mutex.synchronize do
    @rules.each { |rule| evaluate_rule(rule, metrics) }
  end
end

#extract_metric_value(metric_data, field = nil) ⇒ Numeric? (private)

Extract value from metric data

Parameters:

  • metric_data — Metric data

  • field (defaults to: nil)

    Specific field to extract

Returns:

  • Extracted value



201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
# File 'lib/a2a/monitoring/alerting.rb', line 201

# Pull the value to compare out of a metric's data.
#
# For histograms +field+ selects which statistic to read: "avg"/"average",
# "p95", "p99", "max", "min" or "count". Any other field (including nil)
# falls back to the average. Every other metric type, including :counter and
# :gauge, reads +:value+ and ignores +field+.
#
# The field is normalised with +to_s+ so Symbols work as well as Strings,
# matching how #evaluate_condition treats the operator. Without this a
# Symbol field such as :p95 silently selected the average.
#
# @param metric_data [Hash] metric data with a +:type+ entry
# @param field [String, Symbol, nil] histogram statistic to extract
# @return [Object, nil] the extracted value, nil when the entry is absent
def extract_metric_value(metric_data, field = nil)
  return metric_data[:value] unless metric_data[:type] == :histogram

  case field.to_s
  when "p95" then metric_data[:p95]
  when "p99" then metric_data[:p99]
  when "max" then metric_data[:max]
  when "min" then metric_data[:min]
  when "count" then metric_data[:count]
  else metric_data[:avg] # "avg", "average", nil and unknown fields
  end
end

#find_matching_metrics(pattern, metrics) ⇒ Hash (private)

Find metrics matching the rule pattern

Parameters:

  • pattern — Metric pattern

  • metrics — All metrics

Returns:

  • Matching metrics



159
160
161
162
163
164
165
166
167
168
169
# File 'lib/a2a/monitoring/alerting.rb', line 159

# Select the metrics a rule's pattern refers to.
#
# A pattern without "*" is an exact key lookup. A pattern containing "*" is a
# glob in which each "*" matches any run of characters; it must match the
# whole key.
#
# Every other character is matched literally. The pattern is escaped before
# being turned into a regex, so "." only matches a dot and characters such as
# "[" or "(" cannot change the match or raise RegexpError. The regex is
# anchored so "errors*" does not match "http_errors_total".
#
# @param pattern [String] metric name or glob pattern
# @param metrics [Hash] all metrics, keyed by metric name
# @return [Hash] the matching subset of +metrics+ (empty when nothing matches)
def find_matching_metrics(pattern, metrics)
  unless pattern.include?("*")
    metric_data = metrics[pattern]
    return metric_data ? { pattern => metric_data } : {}
  end

  # Regexp.escape turns "*" into "\*"; swap that back for the wildcard.
  glob = Regexp.new("\\A#{Regexp.escape(pattern).gsub('\*', '.*')}\\z")
  metrics.select { |key, _| key.match?(glob) }
end

#fire_alert(alert_key, rule, metric_key, value) ⇒ Object (private)

Fire an alert

Parameters:

  • alert_key — Alert key

  • rule — Alert rule

  • metric_key — Metric key

  • value — Current value



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'lib/a2a/monitoring/alerting.rb', line 248

# Record an alert as firing and notify the channels.
#
# If an alert with the same key is already firing and less than
# +@config[:alert_timeout]+ seconds have passed since it fired, nothing is
# done. Otherwise a new alert record replaces whatever was stored under the
# key and a notification is sent.
#
# @param alert_key [String] key the alert is stored under
# @param rule [Hash] rule that triggered (name, condition, severity,
#   description and tags are copied into the alert)
# @param metric_key [Object] key of the metric that triggered
# @param value [Object] metric value at the time of firing
# @return [Object, nil] result of #send_alert_notification, or nil when the
#   alert was suppressed
def fire_alert(alert_key, rule, metric_key, value)
  previous = @active_alerts[alert_key]
  still_firing = previous && previous[:state] == STATE_FIRING

  # Suppress repeats of an alert that is already firing until the timeout has elapsed.
  return if still_firing && (Time.now - previous[:fired_at]) < @config[:alert_timeout]

  @active_alerts[alert_key] = {
    key: alert_key,
    rule_name: rule[:name],
    metric: metric_key,
    value: value,
    condition: rule[:condition],
    severity: rule[:severity],
    description: rule[:description],
    tags: rule[:tags],
    state: STATE_FIRING,
    fired_at: Time.now,
    resolved_at: nil
  }

  send_alert_notification(@active_alerts[alert_key])
end

#get_active_alerts ⇒ Array<Hash>

Get active alerts

Returns:

  • Active alerts



88
89
90
# File 'lib/a2a/monitoring/alerting.rb', line 88

# Snapshot of all tracked alerts.
#
# Taken while holding @mutex. The returned Array is a new object, but its
# elements are the stored alert hashes themselves (shallow copy). Alerts are
# returned regardless of their state.
#
# @return [Array<Hash>] tracked alerts in insertion order
def get_active_alerts
  @mutex.synchronize do
    @active_alerts.each_value.to_a
  end
end

#get_evaluation_stats ⇒ Hash (private)

Get evaluation statistics

Returns:

  • Evaluation statistics



321
322
323
324
325
326
327
328
329
330
331
332
# File 'lib/a2a/monitoring/alerting.rb', line 321

# Summarise how often the registered rules have been evaluated.
#
# Does not take @mutex itself.
#
# @return [Hash] empty when no rules are registered; otherwise
#   +:total_evaluations+ (sum of every rule's +:evaluation_count+),
#   +:average_evaluations_per_rule+ (Float) and +:last_evaluation+ (the most
#   recent +:last_evaluated+ value, or nil when no rule has been evaluated)
def get_evaluation_stats
  return {} if @rules.empty?

  total_evaluations = @rules.sum { |rule| rule[:evaluation_count] }

  # Core Ruby only: Array#pluck is an ActiveSupport extension and raises
  # NoMethodError on a plain Array when ActiveSupport is not loaded.
  last_evaluation = @rules.map { |rule| rule[:last_evaluated] }.compact.max

  {
    total_evaluations: total_evaluations,
    average_evaluations_per_rule: total_evaluations.to_f / @rules.size,
    last_evaluation: last_evaluation
  }
end

#resolve_alert(alert_key) ⇒ Object (private)

Resolve an alert

Parameters:

  • alert_key — Alert key



279
280
281
282
283
284
285
286
287
# File 'lib/a2a/monitoring/alerting.rb', line 279

# Mark a firing alert as resolved and notify the channels.
#
# Does nothing when no alert is stored under the key or the stored alert is
# not currently firing. The alert stays in @active_alerts with its state set
# to STATE_RESOLVED and +:resolved_at+ set to the current time.
#
# @param alert_key [String] key of the alert to resolve
# @return [Object, nil] result of #send_alert_notification, or nil when there
#   was nothing to resolve
def resolve_alert(alert_key)
  entry = @active_alerts[alert_key]
  return if !entry || entry[:state] != STATE_FIRING

  entry[:state] = STATE_RESOLVED
  entry[:resolved_at] = Time.now
  send_alert_notification(entry)
end

#send_alert_notification(alert) ⇒ Object (private)

Send alert notification to all channels

Parameters:

  • alert — Alert data



293
294
295
296
297
298
299
# File 'lib/a2a/monitoring/alerting.rb', line 293

# Deliver an alert to every registered channel.
#
# Delivery is best-effort: a StandardError raised by one channel is reported
# with +warn+ and the remaining channels are still notified.
#
# @param alert [Hash] alert data passed to each channel's +send_alert+
# @return [Array] the channel list
def send_alert_notification(alert)
  @channels.each do |sink|
    begin
      sink.send_alert(alert)
    rescue StandardError => error
      warn "Failed to send alert via #{sink.class}: #{error.message}"
    end
  end
end

#statistics ⇒ Hash

Get alert statistics

Returns:

  • Alert statistics



96
97
98
99
100
101
102
103
104
105
106
# File 'lib/a2a/monitoring/alerting.rb', line 96

# Collect summary figures about the alerting system.
#
# Built while holding @mutex. +:active_alerts+ is the number of entries in
# @active_alerts, whatever their state.
#
# @return [Hash] +:total_rules+, +:active_alerts+, +:total_channels+,
#   +:alerts_by_severity+ (from #count_alerts_by_severity) and
#   +:evaluation_stats+ (from #get_evaluation_stats)
def statistics
  @mutex.synchronize do
    summary = {}
    summary[:total_rules] = @rules.size
    summary[:active_alerts] = @active_alerts.size
    summary[:total_channels] = @channels.size
    summary[:alerts_by_severity] = count_alerts_by_severity
    summary[:evaluation_stats] = get_evaluation_stats
    summary
  end
end