Class: Riemann::Tools::Health

Inherits:
Object
  • Object
show all
Includes:
Riemann::Tools, Utils
Defined in:
lib/riemann/tools/health.rb

Constant Summary collapse

PROC_PID_INIT_INO =
0xEFFFFFFC
SI_UNITS =
'_kMGTPEZYRQ'

Constants included from Riemann::Tools

VERSION

Instance Attribute Summary

Attributes included from Riemann::Tools

#argv

Instance Method Summary collapse

Methods included from Utils

#distance_of_time_in_words_to_utcnow, #pluralize_string, #reverse_numeric_sort_with_header, #utcnow, #when_from_utcnow

Methods included from Riemann::Tools

#attributes, #endpoint_name, included, #options, #report, #riemann, #run

Constructor Details

#initialize ⇒ Health

Returns a new instance of Health.



37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# File 'lib/riemann/tools/health.rb', line 37

# Prepares the health checker:
#
# 1. builds @limits (critical/warning thresholds per check) from the options;
# 2. detects the OS with `uname -s` and binds the matching check
#    implementations to @cpu, @disk, @load, @memory, @uptime, @swap, @users;
# 3. flags every check named in opts[:checks] as enabled;
# 4. starts with an empty per-tick cache.
def initialize
  super

  limits = %i[cpu disk load memory uptime users swap].to_h do |check|
    [check, { critical: opts[:"#{check}_critical"], warning: opts[:"#{check}_warning"] }]
  end
  # Leniency options are parsed to a byte count and stored in kB.
  limits[:disk][:critical_leniency_kb] = human_size_to_number(opts[:disk_critical_leniency]) / 1024
  limits[:disk][:warning_leniency_kb] = human_size_to_number(opts[:disk_warning_leniency]) / 1024
  @limits = limits

  @ostype = `uname -s`.chomp.downcase
  implementations =
    case @ostype
    when 'darwin'
      @cores = `sysctl -n hw.ncpu`.to_i
      { cpu: :darwin_cpu, load: :darwin_load, memory: :darwin_memory, uptime: :bsd_uptime, swap: :bsd_swap }
    when 'freebsd'
      @cores = `sysctl -n hw.ncpu`.to_i
      { cpu: :freebsd_cpu, load: :bsd_load, memory: :freebsd_memory, uptime: :bsd_uptime, swap: :bsd_swap }
    when 'openbsd'
      @cores = `sysctl -n hw.ncpu`.to_i
      { cpu: :openbsd_cpu, load: :bsd_load, memory: :openbsd_memory, uptime: :bsd_uptime, swap: :bsd_swap }
    when 'sunos'
      @cores = `mpstat -a 2>/dev/null`.split[33].to_i
      { cpu: :sunos_cpu, load: :bsd_load, memory: :sunos_memory, uptime: :bsd_uptime, swap: :bsd_swap }
    else
      # Anything unrecognised is treated like Linux.
      @cores = `nproc`.to_i
      puts "WARNING: OS '#{@ostype}' not explicitly supported. Falling back to Linux" unless @ostype == 'linux'
      # Only set on this branch; #df consults it before passing --exclude-type.
      @supports_exclude_type = `df --help 2>&1 | grep -e "--exclude-type"` != ''
      { cpu: :linux_cpu, load: :linux_load, memory: :linux_memory, uptime: :linux_uptime, swap: :linux_swap }
    end
  # These two implementations are the same on every platform.
  implementations[:disk] = :disk
  implementations[:users] = :users
  implementations.each { |check, name| instance_variable_set(:"@#{check}", method(name)) }

  # Unknown check names are ignored.
  known_checks = %w[disk load cpu memory uptime users swap]
  opts[:checks].each do |check|
    instance_variable_set(:"@#{check}_enabled", true) if known_checks.include?(check)
  end

  invalidate_cache
end

Instance Method Details

#alert(service, state, metric, description) ⇒ Object



117
118
119
120
121
122
123
124
# File 'lib/riemann/tools/health.rb', line 117

# Hands one event to #report after normalising its fields: service and state
# are stringified and the metric is coerced with #to_f (so a nil metric
# becomes 0.0). The description is passed through untouched.
def alert(service, state, metric, description)
  event = {
    service: service.to_s,
    state: state.to_s,
    metric: metric.to_f,
    description: description,
  }
  report(**event)
end

#bsd_load ⇒ Object



321
322
323
324
325
326
327
328
329
330
# File 'lib/riemann/tools/health.rb', line 321

# Reports the load average per core, taken from the parsed `uptime` output
# (entry 1 of uptime[:load_averages] — presumably the 1-minute figure, as the
# description says; confirm against the parser).
#
# Returns nil without alerting when no load averages are available: #uptime
# yields an empty hash after a parse error, and that error has already been
# reported there. Previously this case raised NoMethodError.
def bsd_load
  load_averages = uptime[:load_averages]
  return unless load_averages

  load = load_averages[1] / @cores
  thresholds = @limits[:load]
  state =
    if load > thresholds[:critical]
      :critical
    elsif load > thresholds[:warning]
      :warning
    else
      :ok
    end

  alert 'load', state, load, "1-minute load average/core is #{load}"
end

#bsd_swap ⇒ Object



471
472
473
474
475
476
477
478
479
# File 'lib/riemann/tools/health.rb', line 471

# Reports swap usage as used/total blocks, read from the last line of
# `swapinfo` output (columns: device, blocks, used, avail, capacity).
#
# This check is best-effort and reports nothing when:
# * swapinfo prints nothing, or is not installed;
# * the last line is not numeric (e.g. only the header is printed);
# * the total is zero blocks (avoids a NaN metric).
def bsd_swap
  summary = `swapinfo`.lines.last
  return unless summary

  _device, blocks, used, _avail, _capacity = summary.split(/\s+/)

  total = Integer(blocks)
  return if total.zero?

  report_pct :swap, Float(used) / total, 'used'
rescue ArgumentError, TypeError, Errno::ENOENT
  # Ignore: unparsable output or no swapinfo command on this platform.
end

#bsd_uptime ⇒ Object



453
454
455
456
457
# File 'lib/riemann/tools/health.rb', line 453

# Uptime check for the BSD-style platforms: forwards the :uptime entry of the
# parsed `uptime` output to #report_uptime.
def bsd_uptime
  report_uptime(uptime[:uptime])
end

#darwin_cpu ⇒ Object



383
384
385
386
387
388
389
390
# File 'lib/riemann/tools/health.rb', line 383

# CPU check for macOS, based on the :cpu fraction parsed by #darwin_top.
# Emits an :unknown alert and returns false when top produced no CPU figure;
# otherwise reports the fraction together with a sorted `ps` process listing.
def darwin_cpu
  usage = darwin_top[:cpu]
  if usage
    processes = reverse_numeric_sort_with_header(`ps -eo pcpu,pid,comm`)
    report_pct :cpu, usage, "usage\n\n#{processes}"
  else
    alert 'cpu', :unknown, nil, 'unable to get CPU stats from top'
    false
  end
end

#darwin_load ⇒ Object



392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
# File 'lib/riemann/tools/health.rb', line 392

# Load check for macOS: divides the :load figure from #darwin_top by the core
# count and alerts with the state selected by the load thresholds.
# Emits an :unknown alert and returns false when top produced no load figure.
def darwin_load
  load_average = darwin_top[:load]
  unless load_average
    alert 'load', :unknown, nil, 'unable to get load ave from top'
    return false
  end

  per_core = load_average / @cores
  thresholds = @limits[:load]
  state =
    if per_core > thresholds[:critical]
      :critical
    elsif per_core > thresholds[:warning]
      :warning
    else
      :ok
    end

  alert 'load', state, per_core, "1-minute load average per core is #{per_core}"
end

#darwin_memory ⇒ Object



408
409
410
411
412
413
414
415
# File 'lib/riemann/tools/health.rb', line 408

# Memory check for macOS, based on the :memory fraction parsed by #darwin_top.
# Emits an :unknown alert and returns false when top produced no memory
# figure; otherwise reports the fraction with a sorted `ps` process listing.
def darwin_memory
  usage = darwin_top[:memory]
  if usage
    processes = reverse_numeric_sort_with_header(`ps -eo pmem,pid,comm`)
    report_pct :memory, usage, "usage\n\n#{processes}"
  else
    alert 'memory', :unknown, nil, 'unable to get memory data from top'
    false
  end
end

#darwin_top ⇒ Object



354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
# File 'lib/riemann/tools/health.rb', line 354

# Runs macOS `top` once and extracts the figures used by the darwin_* checks.
# The result is memoized in @cached_data[:darwin_top], so repeated calls
# reuse it until the cache is reset.
#
# Returns a Hash that may contain:
# * :load   - the first (1-minute) value of the "Load Avg:" line, as a Float;
# * :cpu    - the non-idle CPU fraction (1 - idle% / 100);
# * :memory - the used-memory fraction computed from the "PhysMem:" line.
# Keys whose line is missing or unrecognised are simply absent.
def darwin_top
  return @cached_data[:darwin_top] if @cached_data[:darwin_top]

  # Sizes are printed as "<number><unit>" with unit B, K, M, G or T. The
  # patterns below are case-insensitive, so normalise the unit before lookup.
  to_bytes = ->(amount, unit) { amount.to_i * (1024**'BKMGT'.index(unit.upcase)) }

  raw = `top -l 1 | grep -i "^\\(cpu\\|physmem\\|load\\)"`.chomp
  topdata = {}
  raw.each_line do |ln|
    # The "+" must sit inside the group: outside it, only the last character
    # of the number is captured.
    if (mdat = ln.match(/Load Avg: ([0-9.]+), [0-9.]+, [0-9.]+/i))
      topdata[:load] = mdat[1].to_f
    elsif (mdat = ln.match(/CPU usage: [0-9.]+% user, [0-9.]+% sys, ([0-9.]+)% idle/i))
      topdata[:cpu] = 1 - (mdat[1].to_f / 100)
    elsif (mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) wired, ([0-9]+)([BKMGT]) active, ([0-9]+)([BKMGT]) inactive, ([0-9]+)([BKMGT]) used, ([0-9]+)([BKMGT]) free/i))
      wired, active, inactive, used, free =
        mdat.captures.each_slice(2).map { |amount, unit| to_bytes.call(amount, unit) }
      topdata[:memory] = (wired + active + used).to_f / (wired + active + used + inactive + free)
    # This is for OSX Mavericks which
    # uses a different format for top
    # Example: PhysMem: 4662M used (1328M wired), 2782M unused.
    elsif (mdat = ln.match(/PhysMem: ([0-9]+)([BKMGT]) used \([0-9]+[BKMGT] wired\), ([0-9]+)([BKMGT]) unused/i))
      used = to_bytes.call(mdat[1], mdat[2])
      unused = to_bytes.call(mdat[3], mdat[4])
      topdata[:memory] = used.to_f / (used + unused)
    end
  end
  @cached_data[:darwin_top] = topdata
end

#df ⇒ Object



417
418
419
420
421
422
423
424
425
426
427
428
429
430
# File 'lib/riemann/tools/health.rb', line 417

# Runs `df -Pk` (POSIX format, 1024-byte blocks) and returns its raw output,
# skipping the filesystem types in opts[:disk_ignorefs] where the platform's
# df supports it:
# * darwin/freebsd/openbsd: a single `-t no<type>,<type>` argument;
# * other systems with --exclude-type support: one flag per type;
# * sunos, or no --exclude-type support: no filtering at all.
def df
  if %w[darwin freebsd openbsd].include?(@ostype)
    `df -Pk -t no#{opts[:disk_ignorefs].join(',')}`
  elsif @ostype != 'sunos' && @supports_exclude_type
    exclusions = opts[:disk_ignorefs].map { |fstype| "--exclude-type=#{fstype}" }
    `df -Pk #{exclusions.join(' ')}`
  else
    # Is there a good way to exclude iso9660 on SunOS?
    `df -Pk`
  end
end

#disk ⇒ Object



432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
# File 'lib/riemann/tools/health.rb', line 432

# Emits one "disk <mount point>" alert per filesystem listed by #df.
#
# Usage is used / (used + available), i.e. space reserved for root is left
# out of the total. A threshold only fires when usage exceeds it AND the
# available space (in kB) is below the matching leniency, so very large disks
# with plenty of absolute free space stay :ok.
#
# Edge cases:
# * empty df output produces no alerts;
# * filesystems reporting zero used and zero available blocks are skipped,
#   because their usage ratio would be NaN;
# * mount points containing spaces are kept whole.
def disk
  df.lines.drop(1).each do |row| # drop(1) skips the header line
    fields = row.split(/\s+/)

    # df -P columns: filesystem, blocks, used, available, capacity, mount point
    used = fields[2].to_i
    available = fields[3].to_i
    capacity = fields[4]
    mount_point = Array(fields[5..]).join(' ')

    total_without_reservation = used + available
    next if total_without_reservation.zero?

    usage = used.to_f / total_without_reservation
    limits = @limits[:disk]

    if usage > limits[:critical] && available < limits[:critical_leniency_kb]
      alert "disk #{mount_point}", :critical, usage, "#{capacity} used"
    elsif usage > limits[:warning] && available < limits[:warning_leniency_kb]
      alert "disk #{mount_point}", :warning, usage, "#{capacity} used"
    else
      free = number_to_human_size(available * 1024, :floor)
      alert "disk #{mount_point}", :ok, usage, "#{capacity} used, #{free} free"
    end
  end
end

#freebsd_cpu ⇒ Object



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# File 'lib/riemann/tools/health.rb', line 247

# CPU check for FreeBSD. Samples `sysctl kern.cp_time` (five counters: user,
# nice, system, interrupt, idle) and reports the busy share of the ticks that
# elapsed since the previous sample. The first call only records a baseline.
#
# When sysctl returns fewer than five values, an :unknown alert is emitted,
# the previous baseline is kept and false is returned (a failed sample used
# to be stored as nils and crash the following call). Two identical samples
# report 0.0 rather than NaN.
def freebsd_cpu
  sample = `sysctl -n kern.cp_time 2>/dev/null`.split.map(&:to_i) # FreeBSD has 5 cpu stats
  if sample.size < 5
    alert 'cpu', :unknown, nil, 'unable to get CPU stats from sysctl kern.cp_time'
    return false
  end
  u2, n2, s2, t2, i2 = sample

  if @old_cpu
    u1, n1, s1, t1, i1 = @old_cpu

    used = (u2 + n2 + s2 + t2) - (u1 + n1 + s1 + t1)
    total = used + i2 - i1
    fraction = total.zero? ? 0.0 : used.to_f / total

    report_pct :cpu, fraction,
               "user+nice+system+interrupt\n\n#{reverse_numeric_sort_with_header(`ps -axo pcpu,pid,comm`)}"
  end

  @old_cpu = [u2, n2, s2, t2, i2]
end

#freebsd_memory ⇒ Object



332
333
334
335
336
337
# File 'lib/riemann/tools/health.rb', line 332

# Memory check for FreeBSD: (wired + active pages) / total pages, read from
# the vm.stats.vm sysctls, reported with a sorted `ps` process listing.
# Missing values count as 0.0.
def freebsd_memory
  sysctl_output = `sysctl -n vm.stats.vm.v_page_count vm.stats.vm.v_wire_count vm.stats.vm.v_active_count 2>/dev/null`
  page_count, wired, active = sysctl_output.chomp.split.values_at(0, 1, 2).map(&:to_f)
  processes = reverse_numeric_sort_with_header(`ps -axo pmem,pid,comm`)

  report_pct :memory, (wired + active) / page_count, "used\n\n#{processes}"
end

#human_size_to_number(value) ⇒ Object



514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
# File 'lib/riemann/tools/health.rb', line 514

# Converts a human-readable size such as "512", "4k" or "2G" into a number of
# bytes. Units are powers of 1024 and case-insensitive; the accepted letters
# are those of SI_UNITS after its leading placeholder (k, M, G, T, P, E, Z,
# Y, R, Q).
#
# The whole string must match: multi-line or otherwise padded input is
# rejected rather than partially parsed.
#
# @param value [String] digits optionally followed by one unit letter
# @return [Integer] size in bytes
# @raise [RuntimeError] when value does not follow that syntax
def human_size_to_number(value)
  case value
  when /\A(\d+)([#{SI_UNITS[1..]}])?\z/i
    digits = Regexp.last_match(1)
    unit = Regexp.last_match(2)
    # The position of a unit in SI_UNITS is its power of 1024 (no unit => 0).
    exponent = unit ? SI_UNITS.downcase.index(unit.downcase) : 0
    digits.to_i * (1024**exponent)
  else
    raise %(Malformed size "#{value}", syntax is [0-9]+[#{SI_UNITS[1..]}]?)
  end
end

#invalidate_cache ⇒ Object



551
552
553
# File 'lib/riemann/tools/health.rb', line 551

# Discards everything memoized in @cached_data. #tick calls this before
# running the checks, so values cached by helpers such as #darwin_top and
# #uptime are recomputed on every tick.
def invalidate_cache
  @cached_data = {}
end

#linux_cpu ⇒ Object



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'lib/riemann/tools/health.rb', line 169

# CPU check for Linux. Reads the first four counters of the "cpu" line in
# /proc/stat (user, nice, system, idle) and reports the busy share of the
# ticks elapsed since the previous call. The first call only records a
# baseline. Emits an :unknown alert and returns false when /proc/stat has no
# such line.
def linux_cpu
  counters = File.read('/proc/stat').match(/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/)
  if counters.nil?
    alert 'cpu', :unknown, nil, "/proc/stat doesn't include a CPU line"
    return false
  end

  current = counters.captures.map(&:to_i)
  user, nice, system, idle = current

  if @old_cpu
    prev_user, prev_nice, prev_system, prev_idle = @old_cpu

    busy = (user + nice + system) - (prev_user + prev_nice + prev_system)
    elapsed = busy + idle - prev_idle
    processes = reverse_numeric_sort_with_header(`ps -eo pcpu,pid,comm`)

    report_pct :cpu, busy.to_f / elapsed, "user+nice+system\n\n#{processes}"
  end

  @old_cpu = current
end

#linux_load ⇒ Object



191
192
193
194
195
196
197
198
199
200
# File 'lib/riemann/tools/health.rb', line 191

# Load check for Linux: takes the first field of /proc/loadavg, divides it by
# the core count and alerts with the state selected by the load thresholds.
def linux_load
  one_minute = File.read('/proc/loadavg').split(/\s+/).first
  per_core = one_minute.to_f / @cores

  thresholds = @limits[:load]
  state =
    if per_core > thresholds[:critical]
      :critical
    elsif per_core > thresholds[:warning]
      :warning
    else
      :ok
    end

  alert 'load', state, per_core, "1-minute load average/core is #{per_core}"
end

#linux_memory ⇒ Object



202
203
204
205
206
207
208
209
210
211
212
213
214
# File 'lib/riemann/tools/health.rb', line 202

# Memory check for Linux. Parses /proc/meminfo into a name => value table
# (values assumed to be in kB) and reports
# 1 - (MemFree + Buffers + Cached + evictable ZFS ARC) / MemTotal,
# with a sorted `ps` process listing in the description.
def linux_memory
  meminfo = {}
  File.read('/proc/meminfo').split("\n").each do |line|
    name, amount = line.split(/:?\s+/)
    # Assume kB...
    meminfo[name] = amount.to_i
  end

  reclaimable = meminfo['MemFree'] + meminfo['Buffers'] + meminfo['Cached'] + linux_zfs_arc_evictable_memory
  used_fraction = 1 - (reclaimable.to_f / meminfo['MemTotal'])
  processes = reverse_numeric_sort_with_header(`ps -eo pmem,pid,comm`)

  report_pct :memory, used_fraction, "used\n\n#{processes}"
end

#linux_running_in_container? ⇒ Boolean

Returns:

  • (Boolean)


164
165
166
167
# File 'lib/riemann/tools/health.rb', line 164

# Tells whether the process runs outside the initial PID namespace, by
# comparing the /proc/self/ns/pid link target with the inode number in
# PROC_PID_INIT_INO. The answer is computed once and memoized (a false
# answer included).
#
# @return [Boolean]
def linux_running_in_container?
  return @linux_running_in_container unless @linux_running_in_container.nil?

  @linux_running_in_container = File.readlink('/proc/self/ns/pid') != "pid:[#{PROC_PID_INIT_INO}]"
end

#linux_swap ⇒ Object



481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
# File 'lib/riemann/tools/health.rb', line 481

# Swap check for Linux: sums the size and used columns of every device in
# /proc/swaps (the first line is a header) and reports used / size.
# Nothing is reported when the total size is zero, i.e. no swap configured.
def linux_swap
  swap_size = 0.0
  swap_used = 0.0

  File.read('/proc/swaps').lines.drop(1).each do |entry|
    columns = entry.split(/\s+/) # filename, type, size, used, priority
    swap_size += columns[2].to_f
    swap_used += columns[3].to_f
  end

  report_pct :swap, swap_used / swap_size, 'used' unless swap_size.zero?
end

#linux_uptime ⇒ Object



459
460
461
462
463
# File 'lib/riemann/tools/health.rb', line 459

# Uptime check for Linux: passes the first field of /proc/uptime, as a
# Float, to #report_uptime.
def linux_uptime
  seconds_up, = File.read('/proc/uptime').split(/\s+/)

  report_uptime(seconds_up.to_f)
end

#linux_zfs_arc_evictable_memory ⇒ Object

On Linux, the ZFS ARC is reported as used, not as cached memory. github.com/openzfs/zfs/issues/10251

Gather ZFS ARC statistics about evictable memory. The available fields are listed here: github.com/openzfs/zfs/blob/master/include/sys/arc_impl.h



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
# File 'lib/riemann/tools/health.rb', line 222

# Returns the amount of evictable ZFS ARC memory in kB, so that #linux_memory
# can count it as reclaimable (on Linux the ARC is accounted as used memory).
#
# Sums the *_evictable_data and *_evictable_metadata counters of the anon,
# mru, mfu and uncached lists from /proc/spl/kstat/zfs/arcstats; counters
# absent from the file count as 0. Returns 0 when a file being read does not
# exist.
def linux_zfs_arc_evictable_memory
  # When the system is a container, it can access the hosts stats that
  # cause invalid memory usage reporting.  We should only remove
  # evictable memory from the ZFS ARC on the host system.
  return 0 if linux_running_in_container?

  arcstats = Hash.new(0)
  File.readlines('/proc/spl/kstat/zfs/arcstats').each do |line|
    name, _type, data = line.split(/\s+/)
    arcstats[name] = data.to_i
  end

  evictable = %w[anon mru mfu uncached].sum do |list|
    arcstats["#{list}_evictable_data"] + arcstats["#{list}_evictable_metadata"]
  end

  evictable / 1024 # We want kB...
rescue Errno::ENOENT
  0
end

#number_to_human_size(value, rounding = :round) ⇒ Object



532
533
534
535
536
537
# File 'lib/riemann/tools/health.rb', line 532

# Formats a byte count for display. Values below 1024 are returned as plain
# digits; larger ones are scaled to the largest power of 1024 that fits and
# printed with one decimal, the matching SI_UNITS letter and an "iB" suffix
# (e.g. "1.5kiB").
#
# @param value [Numeric] size in bytes
# @param rounding [Symbol] method used to round to one decimal, e.g. :round
#   or :floor
# @return [String]
def number_to_human_size(value, rounding = :round)
  if value < 1024
    value.to_s
  else
    exponent = Math.log(value, 1024).floor
    scaled = value.to_f / (1024**exponent)
    format('%<size>.1f%<unit>ciB', size: scaled.send(rounding, 1), unit: SI_UNITS[exponent])
  end
end

#openbsd_cpu ⇒ Object



264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# File 'lib/riemann/tools/health.rb', line 264

# CPU check for OpenBSD. Samples `sysctl kern.cp_time` (comma-separated
# counters: user, nice, system, interrupt, idle) and reports the busy share
# of the ticks that elapsed since the previous sample. The first call only
# records a baseline.
#
# When sysctl returns fewer than five values, an :unknown alert is emitted,
# the previous baseline is kept and false is returned (a failed sample used
# to be stored as nils and crash the following call). Two identical samples
# report 0.0 rather than NaN.
def openbsd_cpu
  sample = `sysctl -n kern.cp_time 2>/dev/null`.split(',').map(&:to_i) # OpenBSD separates with ,
  if sample.size < 5
    alert 'cpu', :unknown, nil, 'unable to get CPU stats from sysctl kern.cp_time'
    return false
  end
  u2, n2, s2, t2, i2 = sample

  if @old_cpu
    u1, n1, s1, t1, i1 = @old_cpu

    used = (u2 + n2 + s2 + t2) - (u1 + n1 + s1 + t1)
    total = used + i2 - i1
    fraction = total.zero? ? 0.0 : used.to_f / total

    report_pct :cpu, fraction,
               "user+nice+system+interrupt\n\n#{reverse_numeric_sort_with_header(`ps -axo pcpu,pid,comm`)}"
  end

  @old_cpu = [u2, n2, s2, t2, i2]
end

#openbsd_memory ⇒ Object



339
340
341
342
343
344
# File 'lib/riemann/tools/health.rb', line 339

# Memory check for OpenBSD: reports the ratio of two whitespace-separated
# fields of `vmstat` output (positions 28 and 29), with a sorted `ps` process
# listing in the description.
#
# Both fields are converted with #to_f before dividing; the divisor used to
# be left as a String, which made the division raise TypeError.
#
# NOTE(review): the fixed positions depend on the vmstat column layout of the
# host — confirm that 28/29 are the active and free memory columns.
def openbsd_memory
  meminfo = `vmstat 2>/dev/null`.chomp.split
  fraction = meminfo[28].to_f / meminfo[29].to_f # The ratio of active to free memory unlike the others :(

  report_pct :memory, fraction, "used\n\n#{reverse_numeric_sort_with_header(`ps -axo pmem,pid,comm`)}"
end

#report_int(service, value, report) ⇒ Object



138
139
140
141
142
143
144
145
146
147
148
# File 'lib/riemann/tools/health.rb', line 138

# Alerts on a count-like value. The state is :critical or :warning when the
# value reaches (>=) the matching threshold of @limits[service], :ok
# otherwise. Does nothing when value is nil or false.
#
# @param service [Symbol] key into @limits, also used as the service name
# @param value [Numeric, nil] measured value
# @param report [String] text appended to the value in the description
def report_int(service, value, report)
  return unless value

  thresholds = @limits[service]
  state =
    if value >= thresholds[:critical]
      :critical
    elsif value >= thresholds[:warning]
      :warning
    else
      :ok
    end

  alert service, state, value, "#{value} #{report}"
end

#report_pct(service, fraction, report) ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
# File 'lib/riemann/tools/health.rb', line 126

# Alerts on a 0..1 fraction. The state is :critical or :warning when the
# fraction strictly exceeds (>) the matching threshold of @limits[service],
# :ok otherwise. The description shows the fraction as a percentage with two
# decimals, followed by the report text. Does nothing when fraction is nil
# or false.
#
# @param service [Symbol] key into @limits, also used as the service name
# @param fraction [Numeric, nil] measured ratio
# @param report [String] text appended to the percentage in the description
def report_pct(service, fraction, report)
  return unless fraction

  thresholds = @limits[service]
  state =
    if fraction > thresholds[:critical]
      :critical
    elsif fraction > thresholds[:warning]
      :warning
    else
      :ok
    end

  alert service, state, fraction, "#{format('%.2f', fraction * 100)}% #{report}"
end

#report_uptime(uptime) ⇒ Object



150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/riemann/tools/health.rb', line 150

# Alerts on the system uptime. Here lower is worse: the state is :critical
# or :warning when the uptime is strictly below (<) the matching threshold
# of @limits[:uptime], :ok otherwise. The description is the uptime rendered
# by #uptime_to_human. Does nothing when uptime is nil or false.
#
# @param uptime [Numeric, nil] uptime in seconds
def report_uptime(uptime)
  return unless uptime

  description = uptime_to_human(uptime)
  thresholds = @limits[:uptime]
  state =
    if uptime < thresholds[:critical]
      :critical
    elsif uptime < thresholds[:warning]
      :warning
    else
      :ok
    end

  alert 'uptime', state, uptime, description
end

#sunos_cpu ⇒ Object



281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
# File 'lib/riemann/tools/health.rb', line 281

# CPU check for SunOS. Reads four fields of `mpstat -a` output (positions
# 29-32 of the whitespace-split text, treated as user, system, interrupt and
# idle) and reports the busy share of the difference from the previous
# sample. The first call only records a baseline.
#
# NOTE(review): if these mpstat columns are percentages rather than
# cumulative counters, differencing two samples is questionable and the
# third field may be "wt" rather than interrupt time — confirm against
# mpstat(1M) on the target system.
def sunos_cpu
  mpstats = `mpstat -a 2>/dev/null`.split
  u2, s2, t2, i2 = mpstats.values_at(29, 30, 31, 32).map(&:to_i)

  if @old_cpu
    u1, s1, t1, i1 = @old_cpu

    used = (u2 + s2 + t2) - (u1 + s1 + t1)
    total = used + i2 - i1
    fraction = if i2 == i1 && used.zero? # If the system is <1% used in both samples then total will be 0 + (99 - 99), avoid a div by 0
                 0
               else
                 used.to_f / total
               end

    report_pct :cpu, fraction,
               "user+system+interrupt\n\n#{reverse_numeric_sort_with_header(`ps -ao pcpu,pid,comm`)}"
  end

  @old_cpu = [u2, s2, t2, i2]
end

#sunos_memory ⇒ Object



346
347
348
349
350
351
352
# File 'lib/riemann/tools/health.rb', line 346

# Memory check for SunOS: reports (total - free) / total, where total is the
# "Memory" figure printed by prtconf multiplied by 1024 and free is field 32
# of the whitespace-split `vmstat` output.
#
# NOTE(review): the previous comment described the scaling as "reports in GB
# but vmstat is in MB" — confirm the actual units of both commands.
def sunos_memory
  vmstat_fields = `vmstat 2>/dev/null`.chomp.split
  installed = `prtconf | grep Memory`.split[2].to_f * 1024
  in_use = installed - vmstat_fields[32].to_f
  processes = reverse_numeric_sort_with_header(`ps -ao pmem,pid,comm`)

  report_pct :memory, in_use / installed, "used\n\n#{processes}"
end

#tick ⇒ Object



539
540
541
542
543
544
545
546
547
548
549
# File 'lib/riemann/tools/health.rb', line 539

# Runs one monitoring pass: resets the per-tick cache, then calls every
# enabled check in a fixed order (cpu, memory, disk, load, uptime, users,
# swap). Returns the result of the swap check, or nil when it is disabled.
def tick
  invalidate_cache

  checks = [
    [@cpu, @cpu_enabled],
    [@memory, @memory_enabled],
    [@disk, @disk_enabled],
    [@load, @load_enabled],
    [@uptime, @uptime_enabled],
    [@users, @users_enabled],
    [@swap, @swap_enabled],
  ]
  outcomes = checks.map { |check, enabled| check.call if enabled }
  outcomes.last
end

#uptime ⇒ Object



310
311
312
313
314
315
316
317
318
319
# File 'lib/riemann/tools/health.rb', line 310

# Returns the `uptime` command output parsed by #uptime_parser, memoized in
# @cached_data[:uptime] until the cache is reset.
#
# When the parser raises Racc::ParseError, a critical 'uptime' event
# carrying the parser message is reported and an empty Hash is cached and
# returned instead.
def uptime
  cached = @cached_data[:uptime]
  return cached if cached

  @cached_data[:uptime] = uptime_parser.parse(`uptime`)
rescue Racc::ParseError => e
  report(
    service: 'uptime',
    description: "Error parsing uptime: #{e.message}",
    state: 'critical',
  )
  @cached_data[:uptime] = {}
end

#uptime_parser ⇒ Object



306
307
308
# File 'lib/riemann/tools/health.rb', line 306

# Returns the UptimeParser instance used by #uptime, creating it on first
# use and reusing it afterwards.
def uptime_parser
  return @uptime_parser if @uptime_parser

  @uptime_parser = UptimeParser.new
end

#uptime_to_human(value) ⇒ Object



501
502
503
504
505
506
507
508
509
510
511
512
# File 'lib/riemann/tools/health.rb', line 501

# Renders an uptime in seconds as "[N day(s) ]HH:MM". The day part is left
# out when there are none, hours are space-padded to two characters and
# minutes zero-padded; leftover seconds are dropped.
#
# @param value [Numeric] uptime in seconds (truncated with #to_i)
# @return [String] e.g. "2 days 12:30" or " 1:01"
def uptime_to_human(value)
  days, remainder = value.to_i.divmod(86_400)
  hrs, remainder = remainder.divmod(3600)
  mins = remainder / 60

  clock = format('%<hrs>2d:%<mins>02d', hrs: hrs, mins: mins)
  return clock if days.zero?

  "#{days} day#{'s' if days > 1} #{clock}"
end

#users ⇒ Object



465
466
467
468
469
# File 'lib/riemann/tools/health.rb', line 465

# Users check: passes the :users entry of the parsed `uptime` output to
# #report_int, labelled "user" for exactly one and "users" otherwise.
def users
  count = uptime[:users]
  label = count == 1 ? 'user' : 'users'

  report_int(:users, count, label)
end