Module: MonitorServers

Defined in:
lib/switchtower/ext/monitor.rb

Constant Summary collapse

LONG_TIME_FORMAT =
"%Y-%m-%d %H:%M:%S"
SHORT_TIME_FORMAT =
"%H:%M:%S"

Instance Method Summary collapse

Instance Method Details

#date_column(operation, *args) ⇒ Object

A helper method for encapsulating the behavior of the date/time column in a report.



10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# File 'lib/switchtower/ext/monitor.rb', line 10

# Helper that encapsulates the date/time column of a report.
#
# operation - :init returns a fresh state Hash with the column :width (the
#             length of the current time rendered with LONG_TIME_FORMAT),
#             :last (nil) and :rows (0).
#             :show takes that state Hash as the first extra argument,
#             returns the current time right-aligned to state[:width], and
#             updates state[:last] and state[:rows].
#
# Raises RuntimeError for any other operation.
def date_column(operation, *args)
  if operation == :init
    initial = { :width => Time.now.strftime(LONG_TIME_FORMAT).length,
                :last => nil,
                :rows => 0 }
    return initial
  end

  raise "unknown operation #{operation.inspect}" unless operation == :show

  state = args.first
  now = Time.now

  # The long form is used on every tenth row, and whenever the day of the
  # month differs from the one recorded for the previous row. The row check
  # comes first, so state[:last] is not consulted on row 0.
  use_long = state[:rows] % 10 == 0 || now.day != state[:last].day
  pattern = use_long ? LONG_TIME_FORMAT : SHORT_TIME_FORMAT
  stamp = now.strftime(pattern)

  state[:last] = now
  state[:rows] += 1

  format("%*s", state[:width], stamp)
end

#headers(*args) ⇒ Object

A helper method for formatting table headers in a report.



31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# File 'lib/switchtower/ext/monitor.rb', line 31

# Helper that prints one row of table headers to standard output.
#
# args - alternating header text and column width, e.g.
#        headers("-", 5, "web", 10). A header of "-" or " " is a filler
#        column: that character is repeated to the full width. Any other
#        header is printed followed by a space and a run of dashes that
#        brings the column up to its width. Every column is followed by two
#        spaces, and the row ends with a newline.
def headers(*args)
  args.each_slice(2) do |label, width|
    if ["-", " "].include?(label)
      print label * width, "  "
      next
    end

    print label
    # one cell of the width is taken by the space between label and dashes
    fill = width - label.length - 1
    print " ", "-" * fill if fill > 0
    print "  "
  end
  puts
end

#load(options = {}) ⇒ Object

Monitor the load of the servers tied to the current task.



48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/switchtower/ext/monitor.rb', line 48

# Monitor the load of the servers tied to the current task.
#
# Runs `uptime` on every server of the current task in a loop and prints one
# row per pass: the date column followed by each server's load averages
# joined with "/". The header row is reprinted every 40 rows. The loop ends
# when the process receives SIGINT.
#
# options - Hash of settings:
#           :delay - seconds to wait between passes (default 30).
#
# Raises RuntimeError if a server writes to stderr or if its `uptime` output
# contains no recognizable load-average section.
def load(options={})
  servers = current_task.servers.sort
  names = servers.map { |s| s.match(/^(\w+)/)[1] }
  time = date_column(:init)
  load_column_width = "0.00".length * 3 + 2

  puts "connecting..."
  connect!

  # Extracts the load figures from a line of `uptime` output. The label and
  # the separator vary by platform: Linux prints "load average: a, b, c",
  # the BSDs print "load averages: a, b, c" and macOS prints
  # "load averages: a b c". Accept either spelling, and split on whitespace
  # with an optional preceding comma.
  parser = Proc.new do |text|
    match = text.match(/averages?: (.*)$/)
    raise "error: unable to parse uptime output #{text.inspect}" unless match
    match[1].split(/,?\s+/)
  end
  delay = (options[:delay] || 30).to_i

  running = true
  trap("INT") { running = false; puts "[stopping]" }

  # THE HEADER
  header = Proc.new do
    puts
    headers("-", time[:width], *names.map { |n| [n, load_column_width] }.flatten)
  end

  while running
    uptimes = {}
    run "uptime" do |ch, stream, data|
      raise "error: #{data}" if stream == :err
      uptimes[ch[:host]] = parser[data.strip]
    end

    # redisplay the header every 40 rows
    header.call if time[:rows] % 40 == 0

    print(date_column(:show, time), "  ")
    servers.each { |server| print(uptimes[server].join("/"), "  ") }
    puts

    # sleep this way, so that CTRL-C works immediately
    delay.times { sleep 1; break unless running }
  end
end

#put_asset(name, to) ⇒ Object



214
215
216
# File 'lib/switchtower/ext/monitor.rb', line 214

# Reads the file "assets/<name>" located next to this source file and hands
# its contents, together with the destination `to`, to `put`.
#
# name - file name inside the assets directory.
# to   - destination passed through unchanged as the second argument of `put`.
def put_asset(name, to)
  asset_path = "#{File.dirname(__FILE__)}/assets/#{name}"
  contents = File.read(asset_path)
  put(contents, to)
end

#requests_per_second(*logs) ⇒ Object

Monitor the number of requests per second being logged on the various servers.



90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'lib/switchtower/ext/monitor.rb', line 90

# Monitor the number of requests per second being logged on the various
# servers.
#
# Uploads the request-counter.rb asset to /tmp on the servers, tails the
# given logs through it in a background thread, and prints one row of
# requests-per-second figures per sample: one value per server plus a total,
# for each requested statistic. The header is reprinted every 40 rows.
#
# logs - paths of the log files to tail on each server. If the last argument
#        is a Hash it is removed and used as options:
#          :request_pattern - pattern passed to the remote counter script
#                             (default "Completed in [0-9]")
#          :sample_size     - seconds per sample (default 5)
#          :stats           - statistics to show: 0 is the latest sample,
#                             1, 5 and 15 are the trailing-minute windows
#                             (default [0, 1, 5, 15])
#          :format          - format string for each number (default "%4.1f")
#
# Raises RuntimeError if :stats contains a value other than 0, 1, 5 or 15.
def requests_per_second(*logs)
  # extract our configurable options from the arguments
  options = logs.last.is_a?(Hash) ? logs.pop : {}
  request_pattern = options[:request_pattern] || "Completed in [0-9]"
  sample_size = options[:sample_size] || 5
  stats_to_show = options[:stats] || [0, 1, 5, 15]
  num_format = options[:format] || "%4.1f"

  # set up the date column formatter, and get the list of servers
  time = date_column(:init)
  servers = current_task.servers.sort

  # initialize various helper variables we'll be using
  mutex = Mutex.new
  count = Hash.new(0)
  running = false
  channels = {}

  windows = Hash.new { |h,k|
    h[k] = {
      1  => [], # last 1 minute
      5  => [], # last 5 minutes
      15 => []  # last 15 minutes
    }
  }

  # number of samples that make up each window
  minute_1 = 60 / sample_size
  minute_5 = 300 / sample_size
  minute_15 = 900 / sample_size

  # set up (but don't start) the runner thread, which accumulates request
  # counts from the servers.
  runner = Thread.new do Thread.stop
    running = true
    run("echo 0 && tail -F #{logs.join(" ")} | ruby /tmp/request-counter.rb '#{request_pattern}'") do |ch, stream, out|
      channels[ch[:host]] ||= ch
      if stream == :err
        # stderr carries diagnostics, not counts: report the text and skip
        # it so that it never reaches the to_i below. The stream is left
        # running.
        puts "#{ch[:host]}: #{out}"
        next
      end
      mutex.synchronize { count[ch[:host]] += out.to_i }
    end
    running = false
  end

  # store our helper script on the servers. This script reduces the amount
  # of traffic caused by tailing busy logs across the network, and also reduces
  # the amount of work the client has to do.
  put_asset "request-counter.rb", "/tmp/request-counter.rb"

  # let the runner thread get started
  runner.wakeup
  sleep 0.01 while !running

  # trap interrupt for graceful shutdown
  trap("INT") { puts "[stopping]"; channels.values.each { |ch| ch.close; ch[:status] = 0 } }

  # compute the stuff we need to know for displaying the header
  num_len = (num_format % 1).length
  column_width = num_len * (servers.length + 1) + servers.length
  abbvs = servers.map { |server| server.match(/^(\w+)/)[1][0,num_len] }
  col_header = abbvs.map { |v| "%-*s" % [num_len, v] }.join("/")

  # write both rows of the header
  stat_columns = stats_to_show.map { |n|
      case n
      when 0 then "#{sample_size} sec"
      when 1 then "1 min"
      when 5 then "5 min"
      when 15 then "15 min"
      else raise "unknown statistic #{n.inspect}"
      end
    }

  header = Proc.new do
    puts
    headers(" ", time[:width], *stat_columns.map { |v| [v, column_width] }.flatten)
    headers("-", time[:width], *([col_header, column_width] * stats_to_show.length))
  end
  
  while running
    # sleep for the specified sample size (5s by default)
    (sample_size * 2).times { sleep(0.5); break unless running }
    break unless running

    # lock the counters and compute our stats at this point in time
    mutex.synchronize do
      totals = Hash.new { |h,k| h[k] = Hash.new(0) }

      # for each server... The counters are reset after every sample, so a
      # server that logged nothing during this one has no entry in `count`.
      # Walk every known server anyway (count[k] reads as 0 for it) so that
      # quiet samples are recorded in its windows and its minute averages
      # are still computed and displayed.
      (servers + count.keys).uniq.each do |k|
        # push the latest sample onto the tracking queues
        windows[k][1] = windows[k][1].push(count[k]).last(minute_1)
        windows[k][5] = windows[k][5].push(count[k]).last(minute_5)
        windows[k][15] = windows[k][15].push(count[k]).last(minute_15)

        # compute the stats for this server (k)
        totals[k][0] = count[k].to_f / sample_size
        totals[k][1] = windows[k][1].inject(0) { |n,i| n + i } / (windows[k][1].length * sample_size).to_f
        totals[k][5] = windows[k][5].inject(0) { |n,i| n + i } / (windows[k][5].length * sample_size).to_f
        totals[k][15] = windows[k][15].inject(0) { |n,i| n + i } / (windows[k][15].length * sample_size).to_f

        # add those stats to the totals per category
        totals[:total][0] += totals[k][0]
        totals[:total][1] += totals[k][1]
        totals[:total][5] += totals[k][5]
        totals[:total][15] += totals[k][15]
      end

      # redisplay the header every 40 rows
      header.call if time[:rows] % 40 == 0

      # show the stats
      print(date_column(:show, time))
      stats_to_show.each do |stat|
        print "  "
        servers.each { |server| print "#{num_format}/" % totals[server][stat] }
        print(num_format % totals[:total][stat])
      end
      puts

      # reset the sample counter. The runner thread's block shares this
      # local variable, so it starts filling the new hash.
      count = Hash.new(0)
    end
  end
end