Class: ScbiMapreduce::WorkManager

Inherits:

EventMachine::Connection

Object
EventMachine::Connection
ScbiMapreduce::WorkManager

show all

Includes:: EM::P::ObjectProtocol

Defined in:: lib/scbi_mapreduce/work_manager.rb

Overview

require ‘json’

Class Method Summary collapse

Instance Method Summary collapse

#checkpointable_job_received(obj) ⇒ Object
#error_received(worker_error, obj) ⇒ Object
#goto_checkpoint ⇒ Object

loads a checkpoint.
#initialize(*args) ⇒ WorkManager constructor

A new instance of WorkManager.
#load_user_checkpoint(checkpoint) ⇒ Object

If this method returns -1, an automatic checkpoint restore is performed.
#next_work ⇒ Object
#post_init ⇒ Object
#print_running_jobs ⇒ Object
#read_until_checkpoint(checkpoint) ⇒ Object
#receive_object(obj) ⇒ Object
#remove_checkpoint ⇒ Object
#save_checkpoint ⇒ Object
#save_user_checkpoint ⇒ Object
#send_initial_config ⇒ Object
#send_next_work ⇒ Object

send next work to worker.
#send_stuck_work ⇒ Object
#stop_work_manager ⇒ Object
#too_many_errors_received ⇒ Object
#trash_checkpointed_work ⇒ Object
#unbind ⇒ Object

A worker has disconnected.
#work_received(obj) ⇒ Object
#worker_initial_config ⇒ Object

Constructor Details

#initialize(*args) ⇒ `WorkManager`

Returns a new instance of WorkManager.

# File 'lib/scbi_mapreduce/work_manager.rb', line 473

# Creates the handler by forwarding every argument, unchanged, to the
# superclass constructor. No additional state is set up here.
def initialize(*args)
  super
  #puts "WORK MANAGER INITIALIZE NEWWWWWWWWWW, ONE per worker"
end

Class Method Details

.checkpoint ⇒ `Object`



168
169
170

# File 'lib/scbi_mapreduce/work_manager.rb', line 168

# Reader for the class-level checkpoint value held in @@checkpoint.
def self.checkpoint
  @@checkpoint
end

.controlled_exit ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 356

# Requests an orderly shutdown: logs the request and raises the class-level
# @@want_to_exit flag. Nothing is stopped here; only the flag is recorded.
def self.controlled_exit
  $SERVER_LOG.info 'Controlled exit. Workers will be noticed in next round'
  @@want_to_exit = true
end

.end_work_manager ⇒ `Object`



92
93
94

# File 'lib/scbi_mapreduce/work_manager.rb', line 92

# Class-level hook with an empty default body. stop_work_manager invokes it
# through self.class right after calling EM.stop.
def self.end_work_manager

end

.get_checkpoint ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 198

# Reads the checkpoint number stored in CHECKPOINT_FILE.
#
# Returns the file content converted with to_i, or 0 when the file does not
# exist or any error is raised while reading it.
def self.get_checkpoint
  # File.exist? is used instead of File.exists?: the latter was removed in
  # Ruby 3.2, and the resulting NoMethodError would be swallowed by the rescue
  # below, making every stored checkpoint read as 0.
  return 0 unless File.exist?(CHECKPOINT_FILE)

  File.read(CHECKPOINT_FILE).chomp.to_i
rescue StandardError
  # best effort: an unreadable checkpoint is treated as "no checkpoint"
  0
end

.init_work_manager ⇒ `Object`



88
89
90

# File 'lib/scbi_mapreduce/work_manager.rb', line 88

# Class-level hook with an empty default body. Its caller is not part of the
# code shown here.
def self.init_work_manager

end

.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs, exit_on_many_errors, chunk_size) ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 136

# Resets the class-level bookkeeping and stores the processing options in
# class variables, logging each option.
#
# checkpointing       - stored in @@checkpointing; when truthy the saved
#                       checkpoint is loaded through get_checkpoint
# keep_order          - stored in @@keep_order
# retry_stuck_jobs    - stored in @@retry_stuck_jobs
# exit_on_many_errors - stored in @@exit_on_many_errors
# chunk_size          - stored in @@chunk_size (used with Integer#times elsewhere)
def self.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs, exit_on_many_errors, chunk_size)
  # counters start from zero
  @@count = @@chunk_count = @@workers = @@max_workers = @@error_count = 0
  @@want_to_exit = false
  @@running_jobs = []

  # processing options
  @@checkpointing = checkpointing
  @@keep_order = keep_order
  @@retry_stuck_jobs = retry_stuck_jobs
  @@exit_on_many_errors = exit_on_many_errors

  # TODO - Implement a dynamic chunk_size
  @@chunk_size = chunk_size

  $SERVER_LOG.info "Processing in chunks of #{@@chunk_size} objects"
  $SERVER_LOG.info "Checkpointing: #{@@checkpointing}"
  $SERVER_LOG.info "Keeping output order: #{@@keep_order}"
  $SERVER_LOG.info "Retrying stuck jobs: #{@@retry_stuck_jobs}"
  $SERVER_LOG.info "Exiting on too many errors: #{@@exit_on_many_errors}"

  # the stored checkpoint is only consulted when checkpointing is enabled
  @@checkpoint = @@checkpointing ? get_checkpoint : 0
  $SERVER_LOG.info "Detected checkpoint at #{@@checkpoint}" if @@checkpointing
end

Instance Method Details

#checkpointable_job_received(obj) ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 404

# Handles a result returned by a worker when sent jobs are tracked in
# @@running_jobs.
#
# The result is matched to a tracked job by job_identifier and that job is
# marked as received. Results are handed to work_received only for the leading
# run of received jobs in @@running_jobs, so a job that has not come back yet
# holds back the jobs queued after it.
#
# obj - object returned by the worker; it must respond to job_identifier and
#       data.
def checkpointable_job_received(obj)

  # find the received object among the sent jobs
  received_job=@@running_jobs.find{|o| o.job_identifier==obj.job_identifier}

  # only continue if a matching job was previously sent and is still tracked
  if received_job

    # change this job's status to received
    received_job.received!(obj.data)



    # # if there are sufficient jobs, count pending ones
    # if (@@running_jobs.count>=PENDING_TO_SAVE)

    # count received jobs waiting to be written, stopping at the first job
    # whose status is not :received
    pending_to_save=0
    @@running_jobs.each do |job|
      if job.status==:received
        pending_to_save += 1
      else
        break
      end
    end

    # save when at least PENDING_TO_SAVE jobs are waiting, or when every
    # tracked job is waiting
    if (pending_to_save>=PENDING_TO_SAVE) || (pending_to_save==@@running_jobs.count)
      # save pending jobs and write to disk
      to_remove = 0
      
      # the current checkpoint file is moved away before results are delivered;
      # a new one is written below once the saved jobs have been removed
      if @@checkpointing
        remove_checkpoint
      end
      
      # deliver the leading received jobs, in queue order
      @@running_jobs.each do |job|
        if job.status==:received
          # puts "Sent to save: #{job.inspect}"
          work_received(job.data)
          job.status=:saved
          to_remove += 1
        else
          break
        end
      end

      # if some objects were saved, remove them from the running_jobs
      if to_remove > 0
        to_remove.times do |i|
          o=@@running_jobs.shift

          # puts "Job removed #{o.inspect}"
          o=nil
        end

        # print_running_jobs

        # no new checkpoint is written once a controlled exit was requested
        if @@checkpointing && !@@want_to_exit

          save_checkpoint
        end
      end
    end
    # end
  else
    # no tracked job matches this result
    $SERVER_LOG.warn "Job already processed #{obj.inspect}"
  end
end

#error_received(worker_error, obj) ⇒ `Object`



108
109
110

# File 'lib/scbi_mapreduce/work_manager.rb', line 108

# Hook with an empty default body. receive_object calls it when a worker
# reports an error.
#
# worker_error - the error object received from the worker
# obj          - the data of the object that was being processed
def error_received(worker_error, obj)

end

#goto_checkpoint ⇒ `Object`

loads a checkpoint

# File 'lib/scbi_mapreduce/work_manager.rb', line 300

# Restores the position recorded in @@checkpoint, if it is greater than 0.
#
# The value returned by load_user_checkpoint selects what happens:
#   -1        automatic restore: the first (@@checkpoint - 1) chunks are read
#             through next_work and discarded, then WorkManagerData.job_id is
#             set to @@checkpoint
#   positive  the returned value becomes WorkManagerData.job_id
#   0         nothing is restored
# @@checkpoint is reset to 0 afterwards.
def goto_checkpoint
  return unless @@checkpoint > 0

  $SERVER_LOG.info "Skipping until checkpoint #{@@checkpoint}"
  restored = load_user_checkpoint(@@checkpoint)

  if restored == -1
    # automatic restore: consume and drop the chunks that were already done
    1.upto(@@checkpoint - 1) do |chunk_number|
      $SERVER_LOG.info "Automatic trashing Chunk #{chunk_number}"
      @@chunk_size.times { next_work }
    end

    $SERVER_LOG.info "Automatic checkpoint finished"
    WorkManagerData.job_id = @@checkpoint
  elsif restored > 0
    # the user code restored the checkpoint itself
    WorkManagerData.job_id = restored
  elsif restored == 0
    $SERVER_LOG.info "Automatic checkpoint not done"
  end

  @@checkpoint = 0
end

#load_user_checkpoint(checkpoint) ⇒ `Object`

If this method returns -1, an automatic checkpoint restore is performed. Return 0 to skip checkpoint restoration, or return the restored checkpoint number to resume from that point.



123
124
125

# File 'lib/scbi_mapreduce/work_manager.rb', line 123

# Default checkpoint loader: always answers -1, which goto_checkpoint treats
# as a request for the automatic restore. The argument is not used here.
def load_user_checkpoint(checkpoint)
  -1
end

#next_work ⇒ `Object`



96
97
98

# File 'lib/scbi_mapreduce/work_manager.rb', line 96

# Hook that supplies the next object to process. The default yields nothing:
# it returns nil, which send_next_work treats as "no more data".
def next_work
  nil
end

#post_init ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 337

# Runs for each new handler instance: updates the worker counters, performs
# the one-time setup when the first worker arrives, then sends the initial
# configuration followed by the first piece of work.
def post_init
  @@workers += 1
  @@max_workers += 1

  # one-time setup, triggered by the first connected worker
  if @@workers == 1
    # start time; stop_work_manager later replaces it with the elapsed time
    @@total_seconds = Time.now
    $SERVER_LOG.info "First worker connected"

    if @@checkpointing
      $SERVER_LOG.info "Checking for checkpoint"
      goto_checkpoint
    end
  end

  $SERVER_LOG.info "#{@@workers} workers connected"

  send_initial_config
  send_next_work
end

#print_running_jobs ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 226

# Writes the inspect output of every tracked job to the debug log, one job
# per line.
def print_running_jobs
  listing = @@running_jobs.map(&:inspect).join("\n")
  $SERVER_LOG.debug("Running Jobs:\n#{listing}")
end

#read_until_checkpoint(checkpoint) ⇒ `Object`



116
117
118

# File 'lib/scbi_mapreduce/work_manager.rb', line 116

# Placeholder with an empty body; the checkpoint argument is not used and no
# code shown here calls this method.
def read_until_checkpoint(checkpoint)

end

#receive_object(obj) ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 362

# Handles an object sent back by a worker, then sends that worker its next
# piece of work.
#
# obj - either an error report (an Exception that also responds to worker_id,
#       object and original_exception) or a processed job responding to data.
def receive_object(obj)

  # check if response is an error
  if obj.is_a?(Exception)
    $SERVER_LOG.error("Error in worker #{obj.worker_id} while processing object #{obj.object.inspect}\n" + obj.original_exception.message + ":\n" + obj.original_exception.backtrace.join("\n"))

    @@error_count += 1

    error_received(obj,obj.object.data)

    # too many errors: more than 100 objects sent and the error count is at
    # least 80% of that figure
    if (@@count>100) && (@@error_count >= @@count*0.8)

      # notify the user code
      res=too_many_errors_received

      # force exit if configured to, or if too_many_errors_received returns true
      if @@exit_on_many_errors || res
        $SERVER_LOG.error("Want to exit due to too many errors")
        # controlled_exit is a class method, so it has to be reached through
        # self.class; calling it on the instance raises NoMethodError
        self.class.controlled_exit
      end
    end

  else
    # results go through the job-tracking path whenever any feature that
    # relies on @@running_jobs is enabled; otherwise they are delivered directly
    if @@checkpointing || @@keep_order || @@retry_stuck_jobs
      # print_running_jobs
      checkpointable_job_received(obj)
    else
      work_received(obj.data)
    end
  end

  # free mem
  obj=nil
  send_next_work

end

#remove_checkpoint ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 172

# Moves CHECKPOINT_FILE to OLD_CHECKPOINT_FILE when a checkpoint file exists;
# does nothing otherwise.
def remove_checkpoint
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2 and would
  # raise NoMethodError here
  FileUtils.mv(CHECKPOINT_FILE, OLD_CHECKPOINT_FILE) if File.exist?(CHECKPOINT_FILE)
end

#save_checkpoint ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 179

# Writes the current checkpoint to CHECKPOINT_FILE and then calls
# save_user_checkpoint.
#
# The stored value is the job_identifier of the first tracked job, or
# WorkManagerData.job_id when no job is being tracked.
def save_checkpoint
  # work out the value before touching the file, so an error here cannot
  # leave a truncated checkpoint behind
  checkpoint_value =
    if @@running_jobs.empty?
      WorkManagerData.job_id
    else
      @@running_jobs.first.job_identifier
    end

  $SERVER_LOG.info "Saving checkpoint: #{checkpoint_value}"

  # block form closes the file even if the write raises
  File.open(CHECKPOINT_FILE, 'w') do |checkpoint_file|
    checkpoint_file.puts checkpoint_value
  end

  save_user_checkpoint
end

#save_user_checkpoint ⇒ `Object`



127
128

# File 'lib/scbi_mapreduce/work_manager.rb', line 127

# Hook with an empty default body. save_checkpoint calls it after the
# checkpoint file has been written.
def save_user_checkpoint
end

#send_initial_config ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 214

# Sends the worker its initial configuration: the result of
# worker_initial_config wrapped under :initial_config, or the symbol
# :no_initial_config when that result is nil.
def send_initial_config
  settings = worker_initial_config
  payload = settings.nil? ? :no_initial_config : { :initial_config => settings }

  send_object(payload)
end

#send_next_work ⇒ `Object`

send next work to worker

# File 'lib/scbi_mapreduce/work_manager.rb', line 254

# Sends the next piece of work to the worker.
#
# In order of precedence: sends :quit when an exit was requested, re-sends a
# stuck job when send_stuck_work finds one, otherwise gathers up to
# @@chunk_size objects from next_work and sends them as a WorkManagerData.
# Sends :quit when next_work has nothing left to offer.
def send_next_work

  # if we need to exit, send quit to workers
  if @@want_to_exit
    send_object(:quit)

  # send_stuck_work returns true when it has already sent something
  elsif !send_stuck_work

    objs=[]

    # prepare new data, stopping at the first nil returned by next_work
    @@chunk_size.times do
      obj=next_work
      break if obj.nil?

      objs << obj
    end

    # if new data was collected, send it
    if objs.count>0
      @@count += objs.count
      @@chunk_count += 1

      work_data=WorkManagerData.new(objs)
      send_object(work_data)

      # The job has to be tracked whenever receive_object routes results
      # through checkpointable_job_received, which includes checkpointing.
      # Without @@checkpointing in this condition, a checkpoint-only run finds
      # no tracked job on reception and its results never reach work_received.
      if @@checkpointing || @@keep_order || @@retry_stuck_jobs
        # do not remove data, to be able to send it again
        # work_data.data=nil
        @@running_jobs.push work_data
        # print_running_jobs
      end
    else
      # otherwise, send a quit value indicating no more data available
      send_object(:quit)
    end
  end
end

#send_stuck_work ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 231

# Re-sends the first stuck job, if retrying is enabled and any tracked job
# reports stuck?.
#
# Returns true when a job was re-sent, false otherwise.
def send_stuck_work
  return false unless @@retry_stuck_jobs

  stuck = @@running_jobs.select(&:stuck?)
  return false if stuck.empty?

  listing = stuck.map(&:inspect).join("\n")
  $SERVER_LOG.info("Stuck Jobs:\n#{listing}")

  # only the first stuck job is re-sent per call
  candidate = stuck.first
  send_object(candidate)
  candidate.sent!
  $SERVER_LOG.info("Sending stuck work #{candidate.inspect}")

  true
end

#stop_work_manager ⇒ `Object`

# File 'lib/scbi_mapreduce/work_manager.rb', line 493

# Stops EventMachine, calls the class-level end_work_manager hook and logs
# the run statistics.
#
# Note that @@total_seconds, which holds the start time until now, is
# overwritten with the elapsed time.
def stop_work_manager
  EM.stop
  $SERVER_LOG.info  "Exiting server"

  self.class.end_work_manager

  @@total_seconds = Time.now - @@total_seconds
  elapsed = @@total_seconds.to_f
  objects_per_second = format("%.2f", @@count / elapsed)
  connections_per_second = format("%.2f", @@chunk_count / elapsed)

  $SERVER_LOG.info  "Total processed: #{@@count} objects in #{@@total_seconds} seconds"
  $SERVER_LOG.info  "Processing rate: #{objects_per_second} objects per second"
  $SERVER_LOG.info  "Connection rate: #{connections_per_second} connections per second"

  $SERVER_LOG.info  "Number of errors: #{@@error_count}"
  $SERVER_LOG.info  "Chunk size: #{@@chunk_size}"
  $SERVER_LOG.info  "Total connected workers: #{@@max_workers}"
end

#too_many_errors_received ⇒ `Object`



112
113
114

# File 'lib/scbi_mapreduce/work_manager.rb', line 112

# Hook called by receive_object once the error threshold is reached. A truthy
# return value makes receive_object request an exit; the default returns nil.
def too_many_errors_received
  nil
end

#trash_checkpointed_work ⇒ `Object`



130
131
132

# File 'lib/scbi_mapreduce/work_manager.rb', line 130

# Placeholder with an empty body. Its only mention in the code shown here is
# a commented-out call inside goto_checkpoint.
def trash_checkpointed_work

end

#unbind ⇒ `Object`

A worker has disconnected

# File 'lib/scbi_mapreduce/work_manager.rb', line 479

# Called when a worker has disconnected: decrements the worker count and,
# once no workers remain, stops the work manager.
def unbind
  @@workers -= 1
  #puts @@running_jobs.to_json

  $SERVER_LOG.info  "Worker disconnected. #{@@workers} kept running"

  # keep going while other workers remain
  return unless @@workers.zero?

  # no more workers left, shutdown EM and stop server
  $SERVER_LOG.info  "All workers finished"
  stop_work_manager
end

#work_received(obj) ⇒ `Object`



100
101
102

# File 'lib/scbi_mapreduce/work_manager.rb', line 100

# Hook with an empty default body that receives the data of a finished job.
# It is called from receive_object (directly, with obj.data) and from
# checkpointable_job_received (with job.data, in queue order).
def work_received(obj)

end

#worker_initial_config ⇒ `Object`



104
105
106

# File 'lib/scbi_mapreduce/work_manager.rb', line 104

# Hook that supplies the configuration sent to each worker by
# send_initial_config. The default returns nil, which makes
# send_initial_config send :no_initial_config instead.
def worker_initial_config
  nil
end

Class: ScbiMapreduce::WorkManager

Overview

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(*args) ⇒ WorkManager

Class Method Details

.checkpoint ⇒ Object

.controlled_exit ⇒ Object

.end_work_manager ⇒ Object

.get_checkpoint ⇒ Object

.init_work_manager ⇒ Object

.init_work_manager_internals(checkpointing, keep_order, retry_stuck_jobs, exit_on_many_errors, chunk_size) ⇒ Object

Instance Method Details

#checkpointable_job_received(obj) ⇒ Object

#error_received(worker_error, obj) ⇒ Object

#goto_checkpoint ⇒ Object

#load_user_checkpoint(checkpoint) ⇒ Object

#next_work ⇒ Object

#post_init ⇒ Object

#print_running_jobs ⇒ Object

#read_until_checkpoint(checkpoint) ⇒ Object

#receive_object(obj) ⇒ Object

#remove_checkpoint ⇒ Object

#save_checkpoint ⇒ Object

#save_user_checkpoint ⇒ Object

#send_initial_config ⇒ Object

#send_next_work ⇒ Object

#send_stuck_work ⇒ Object

#stop_work_manager ⇒ Object

#too_many_errors_received ⇒ Object

#trash_checkpointed_work ⇒ Object

#unbind ⇒ Object

#work_received(obj) ⇒ Object

#worker_initial_config ⇒ Object