Class: Bosh::Director::ProblemScanner

Inherits:

Object

Object
Bosh::Director::ProblemScanner

show all

Defined in: lib/bosh/director/problem_scanner.rb

Constant Summary collapse

AGENT_TIMEOUT = seconds

Instance Attribute Summary collapse

#event_log ⇒ Object readonly

Returns the value of attribute event_log.
#logger ⇒ Object readonly

Returns the value of attribute logger.

Instance Method Summary collapse

#begin_stage(stage_name, n_steps) ⇒ Object
#initialize(deployment) ⇒ ProblemScanner constructor

A new instance of ProblemScanner.
#problem_found(type, resource, data = {}) ⇒ Object
#reset(vms = nil) ⇒ Object
#scan_disk(disk) ⇒ Object
#scan_disks ⇒ Object
#scan_vm(vm) ⇒ Object
#scan_vms(vms = nil) ⇒ Object
#track_and_log(task, log = true) ⇒ Object

Constructor Details

#initialize(deployment) ⇒ `ProblemScanner`

Returns a new instance of ProblemScanner.

Parameters:

deployment (Object) —

Deployment record to scan (not a name string: the methods below call `name` on the stored object)

# File 'lib/bosh/director/problem_scanner.rb', line 13

# Builds a scanner bound to one deployment.
#
# @param deployment [Object] the deployment to scan; stored untouched in
#   @deployment (other methods in this class call +name+ on it)
def initialize(deployment)
  @deployment = deployment
  @instance_manager = Api::InstanceManager.new

  # Scanner-local state. @problem_lock is taken by #problem_found and
  # #scan_vm; @agent_disks starts empty (presumably filled by the disk-owner
  # helpers, which are not visible here -- TODO confirm).
  @agent_disks = {}
  @problem_lock = Mutex.new

  # temp: both collaborators are taken from the global Config
  @event_log = Config.event_log
  @logger = Config.logger
end

Instance Attribute Details

#event_log ⇒ `Object` (readonly)

Returns the value of attribute event_log.



8
9
10

# File 'lib/bosh/director/problem_scanner.rb', line 8

# Reader for @event_log, which #initialize assigns from Config.event_log.
#
# @return [Object] the stored event log
def event_log
  @event_log
end

#logger ⇒ `Object` (readonly)

Returns the value of attribute logger.



8
9
10

# File 'lib/bosh/director/problem_scanner.rb', line 8

# Reader for @logger, which #initialize assigns from Config.logger.
#
# @return [Object] the stored logger
def logger
  @logger
end

Instance Method Details

#begin_stage(stage_name, n_steps) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 25

# Opens a new stage on the event log and writes the stage name to the logger
# at info level.
#
# @param stage_name [String] stage title; passed to the event log and logged
# @param n_steps [Integer] step count forwarded to event_log.begin_stage
# @return [Object] whatever logger.info returns
def begin_stage(stage_name, n_steps)
  event_log.begin_stage(stage_name, n_steps)
  logger.info(stage_name)
end

#problem_found(type, resource, data = {}) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 191

# Records a problem for a resource: creates a new open DeploymentProblem, or
# bumps the counter of the single matching open one.
#
# The whole lookup-and-write sequence runs while holding @problem_lock.
#
# @param type [Symbol, #to_s] problem type; stored as a string
# @param resource [#id] the affected record; only its id is used
# @param data [Hash] payload stored on the problem record
# @return [Object] whatever the final logger.info call returns
# @raise [CloudcheckTooManySimilarProblems] when more than one open problem
#   already matches this deployment / type / resource id
def problem_found(type, resource, data = {})
  @problem_lock.synchronize do
    open_matches = Models::DeploymentProblem.filter(
      :deployment_id => deployment.id,
      :type => type.to_s,
      :resource_id => resource.id,
      :state => "open"
    ).all

    if open_matches.size > 1
      raise CloudcheckTooManySimilarProblems,
            "More than one problem of type `#{type}' " \
            "exists for resource #{type} #{resource.id}"
    end

    if open_matches.empty?
      created = Models::DeploymentProblem.create(
        :type => type.to_s,
        :resource_id => resource.id,
        :state => "open",
        :deployment_id => deployment.id,
        :data => data,
        :counter => 1
      )

      logger.info("Created problem #{created.id} (#{created.type})")
    else
      # This assumes we are running with the deployment lock acquired,
      # so there is no possible update conflict.
      existing = open_matches.first
      existing.data = data
      existing.last_seen_at = Time.now
      existing.counter += 1
      existing.save
      logger.info("Updated problem #{existing.id} (#{existing.type}), " \
                  "count is now #{existing.counter}")
    end
  end
end

#reset(vms = nil) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 37

# Closes open problems for this deployment.
#
# With no argument every open problem of the deployment is closed. With a
# list, only open problems whose resource_id equals the id of the VM behind
# each listed instance are closed.
#
# Instances that currently have no VM are skipped: there is no VM id to match
# on, and dereferencing the missing VM used to abort the whole reset with a
# NoMethodError. (#scan_disk guards the same association in the same way.)
#
# @param vms [Array<Array>, nil] list of [job, index] pairs, or nil for all
# @return [Object] the given list when +vms+ is passed, otherwise the result
#   of the dataset update
def reset(vms=nil)
  if vms
    vms.each do |job, index|
      instance = @instance_manager.find_by_name(@deployment.name, job, index)
      vm = instance && instance.vm
      next if vm.nil?

      Models::DeploymentProblem.where(deployment: deployment,
                                      :resource_id => vm.id,
                                      :state => "open").update(state: "closed")
    end
  else
    Models::DeploymentProblem.where(state: "open", deployment: deployment).update(state: "closed")
  end
end

#scan_disk(disk) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 104

# Checks a single persistent disk record and reports any problem found.
#
# @param disk [Object] disk record; this method reads +active+, +id+,
#   +disk_cid+ and +instance+ from it
# @return [Symbol] :inactive when the disk is not active,
#   :mount_info_mismatch when the recorded owners disagree with the record,
#   :ok otherwise (including disks with no associated VM)
def scan_disk(disk)
  # Inactive disks are reported immediately; nothing else is checked.
  unless disk.active
    logger.info("Found inactive disk: #{disk.id}")
    problem_found(:inactive_disk, disk)
    return :inactive
  end

  disk_cid = disk.disk_cid
  owner_instance = disk.instance
  owner_vm = owner_instance && owner_instance.vm
  expected_vm_cid = owner_vm ? owner_vm.cid : nil

  if expected_vm_cid.nil?
    # With the db dependencies this should not happen.
    logger.warn("Disk #{disk_cid} is not associated to any VM. " \
                "Skipping scan")
    return :ok
  end

  owner_vms = get_disk_owners(disk_cid) || []

  # Healthy only when exactly one VM reports the disk and it is the VM the
  # record points at.
  return :ok if owner_vms.size == 1 && owner_vms.first == expected_vm_cid

  # The active disk is not mounted, is mounted more than once, or is mounted
  # on a VM that differs from the record.
  logger.info("Found problem in mount info: " \
              "active disk #{disk_cid} mounted on " \
              "#{owner_vms.join(', ')}")
  problem_found(:mount_info_mismatch, disk, :owner_vms => owner_vms)
  :mount_info_mismatch
end

#scan_disks ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 50

# Scans every persistent disk whose instance belongs to this deployment and
# reports a per-outcome summary through the event log.
#
# All persistent disks are loaded (with their instance eagerly fetched) and
# filtered in Ruby by the instance's deployment_id.
#
# @return [Object] whatever the final #track_and_log call returns
def scan_disks
  deployment_disks = Models::PersistentDisk.eager(:instance).all.select do |disk|
    owner = disk.instance
    owner && owner.deployment_id == deployment.id
  end

  begin_stage("Scanning #{deployment_disks.size} persistent disks", 2)

  # Outcome symbol => number of disks with that outcome.
  tally = Hash.new(0)
  track_and_log("Looking for inactive disks") do
    deployment_disks.each { |disk| tally[scan_disk(disk)] += 1 }
  end

  summary = "#{tally[:ok]} OK, " \
            "#{tally[:inactive]} inactive, " \
            "#{tally[:mount_info_mismatch]} mount-info mismatch"
  track_and_log(summary)
end

#scan_vm(vm) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 139

# Checks one VM by querying its agent and classifies the outcome.
#
# Problems are recorded here (via #problem_found) only for the :missing and
# :unresponsive outcomes; this method itself records nothing for
# :out_of_sync / :unbound (the predicate helpers are not visible here and may
# do so -- TODO confirm).
#
# @param vm [Object] VM record; this method reads +instance+, +agent_id+ and
#   +cid+ from it
# @return [Symbol] :ok, :out_of_sync, :unbound, :missing or :unresponsive
def scan_vm(vm)
  # Options handed to the agent client. The :retry_methods entry presumably
  # disables retries for get_state -- TODO confirm against AgentClient.
  agent_options = {
      :timeout => AGENT_TIMEOUT,
      :retry_methods => {:get_state => 0}
  }

  instance = nil
  mounted_disk_cid = nil
  # Read the instance and its recorded persistent disk while holding
  # @problem_lock (the same lock #problem_found takes).
  @problem_lock.synchronize do
    instance = vm.instance
    mounted_disk_cid = instance.persistent_disk_cid if instance
  end

  agent = AgentClient.with_defaults(vm.agent_id, agent_options)
  begin
    state = agent.get_state

    # Gather mounted disk info (used by scan_disk).
    begin
      disk_list = agent.list_disk
      # Only the first reported disk is considered.
      mounted_disk_cid = disk_list.first
    rescue Bosh::Director::RpcTimeout => e
      # list_disk timed out: forget the recorded disk so no owner is registered.
      mounted_disk_cid = nil
    rescue RuntimeError => e
      # For old agents that don't implement list_disk we assume the disk is
      # mounted, i.e. mounted_disk_cid keeps the value read from the instance.
      logger.info("agent.list_disk failed on agent #{vm.agent_id}")
    end
    add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

    return :out_of_sync if is_out_of_sync_vm?(vm, instance, state)
    return :unbound if is_unbound_instance_vm?(vm, instance, state)
    :ok
  rescue Bosh::Director::RpcTimeout
    # Reached when get_state (or anything else in the body above that is not
    # covered by the inner rescue) raises RpcTimeout.
    #
    # We add the disk to avoid a duplicate problem when fetching the agent
    # status times out (unresponsive_agent and mount_info_mismatch).
    add_disk_owner(mounted_disk_cid, vm.cid) if mounted_disk_cid

    begin
      unless cloud.has_vm?(vm.cid)
        logger.info("Missing VM #{vm.cid}")
        problem_found(:missing_vm, vm)
        return :missing
      end
    rescue Bosh::Clouds::NotImplemented
      # The cloud cannot answer has_vm?; fall through and report the agent as
      # unresponsive instead.
    end

    logger.info("Found unresponsive agent #{vm.agent_id}")
    problem_found(:unresponsive_agent, vm)
    :unresponsive
  end
end

#scan_vms(vms = nil) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 70

# Scans VMs of this deployment concurrently and reports a per-outcome summary
# through the event log.
#
# With no argument every VM of the deployment is scanned. With a list, only
# the VMs behind the listed instances are scanned. Instances that currently
# have no VM are logged and skipped: previously a nil was pushed into the scan
# list and #scan_vm failed on it inside the thread pool, aborting the scan.
#
# @param vms [Array<Array>, nil] list of [job, index] pairs, or nil for all
# @return [Object] whatever the final #track_and_log call returns
def scan_vms(vms=nil)
  if vms
    targets = []
    vms.each do |job, index|
      instance = @instance_manager.find_by_name(@deployment.name, job, index)
      vm = instance && instance.vm
      if vm
        targets << vm
      else
        logger.info("Instance #{job}/#{index} has no VM, skipping scan")
      end
    end
  else
    targets = Models::Vm.eager(:instance).filter(deployment: deployment).all
  end

  begin_stage("Scanning #{targets.size} VMs", 2)

  # Outcome symbol => number of VMs with that outcome. Updated from pool
  # threads, hence the dedicated mutex.
  results = Hash.new(0)
  lock = Mutex.new

  track_and_log("Checking VM states") do
    ThreadPool.new(:max_threads => Config.max_threads).wrap do |pool|
      targets.each do |vm|
        pool.process do
          scan_result = scan_vm(vm)
          lock.synchronize { results[scan_result] += 1 }
        end
      end
    end
  end

  track_and_log("#{results[:ok]} OK, " \
                "#{results[:unresponsive]} unresponsive, " \
                "#{results[:missing]} missing, " \
                "#{results[:unbound]} unbound, " \
                "#{results[:out_of_sync]} out of sync")
end

#track_and_log(task, log = true) ⇒ `Object`

# File 'lib/bosh/director/problem_scanner.rb', line 30

# Runs a task under event-log tracking, optionally writing the task name to
# the logger, and hands the event-log ticker to the caller's block if one was
# given.
#
# @param task [String] task description passed to event_log.track
# @param log [Boolean] when truthy, the task is also logged at info level
# @yieldparam ticker [Object] the object event_log.track yields
# @return [Object] whatever event_log.track returns
def track_and_log(task, log = true)
  event_log.track(task) do |ticker|
    if log
      logger.info(task)
    end

    # Without a caller block there is nothing further to run in this step.
    next unless block_given?

    yield ticker
  end
end

Class: Bosh::Director::ProblemScanner

Constant Summary collapse

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(deployment) ⇒ ProblemScanner

Instance Attribute Details

#event_log ⇒ Object (readonly)

#logger ⇒ Object (readonly)

Instance Method Details

#begin_stage(stage_name, n_steps) ⇒ Object

#problem_found(type, resource, data = {}) ⇒ Object

#reset(vms = nil) ⇒ Object

#scan_disk(disk) ⇒ Object

#scan_disks ⇒ Object

#scan_vm(vm) ⇒ Object

#scan_vms(vms = nil) ⇒ Object

#track_and_log(task, log = true) ⇒ Object

#initialize(deployment) ⇒ `ProblemScanner`

#event_log ⇒ `Object` (readonly)

#logger ⇒ `Object` (readonly)

#begin_stage(stage_name, n_steps) ⇒ `Object`

#problem_found(type, resource, data = {}) ⇒ `Object`

#reset(vms = nil) ⇒ `Object`

#scan_disk(disk) ⇒ `Object`

#scan_disks ⇒ `Object`

#scan_vm(vm) ⇒ `Object`

#scan_vms(vms = nil) ⇒ `Object`

#track_and_log(task, log = true) ⇒ `Object`