Class: OodCore::Job::Adapters::Torque

Inherits:
OodCore::Job::Adapter show all
Defined in:
lib/ood_core/job/adapters/torque.rb,
lib/ood_core/job/adapters/torque/batch.rb,
lib/ood_core/job/adapters/torque/attributes.rb

Overview

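Adapter that manages batch jobs through a Torque resource manager (submit, query, hold, release and delete). It also maintains all the Torque attributes available (see ATTR).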

Defined Under Namespace

Classes: Batch, FFI

Constant Summary collapse

STATE_MAP =

Mapping of state characters for PBS

{
  # Keys are the single-character job states reported by the batch server;
  # values are the adapter's state symbols. Characters missing from this
  # table are up to the caller to handle (e.g. via `fetch` with a default).
  'Q' => :queued,
  'H' => :queued_held,    # held
  'T' => :queued_held,    # transiting (being moved to new location)
  'W' => :queued_held,    # waiting (waiting for its execution time)
  'R' => :running,
  'S' => :suspended,
  'E' => :running,        # exiting, but still running
  'C' => :completed
}.freeze # frozen so the shared lookup table cannot be mutated at runtime
ATTR =

Maintains a constant Hash of defined PBS attribute types. Includes:

Attribute names used by user commands
Additional job and general attribute names
Additional queue attribute names
Additional server attribute names
Additional node attribute names
{
  # Maps short lookup keys to the PBS/Torque attribute names they stand for.
  # Note that both `N:` and `name:` resolve to :Job_Name.

  # Attribute names used by user commands
  a:                 :Execution_Time,
  c:                 :Checkpoint,
  e:                 :Error_Path,
  f:                 :fault_tolerant,
  g:                 :group_list,
  h:                 :Hold_Types,
  j:                 :Join_Path,
  k:                 :Keep_Files,
  l:                 :Resource_List,
  m:                 :Mail_Points,
  o:                 :Output_Path,
  p:                 :Priority,
  q:                 :destination,
  r:                 :Rerunable,
  t:                 :job_array_request,
  array_id:          :job_array_id,
  u:                 :User_List,
  v:                 :Variable_List,
  A:                 :Account_Name,
  args:              :job_arguments,
  reservation_id:    :reservation_id,
  login_node_id:     :login_node_id,
  login_prop:        :login_property,
  external_nodes:    :external_nodes,
  multi_req_alps:    :multi_req_alps,
  M:                 :Mail_Users,
  N:                 :Job_Name,
  S:                 :Shell_Path_List,
  depend:            :depend,
  inter:             :interactive,
  stagein:           :stagein,
  stageout:          :stageout,
  jobtype:           :jobtype,
  submit_host:       :submit_host,
  init_work_dir:     :init_work_dir,

  # Additional job and general attribute names
  ctime:             :ctime,
  exechost:          :exec_host,
  execport:          :exec_port,
  mtime:             :mtime,
  qtime:             :qtime,
  session:           :session_id,
  euser:             :euser,
  egroup:            :egroup,
  hashname:          :hashname,
  hopcount:          :hop_count,
  security:          :security,
  sched_hint:        :sched_hint,
  substate:          :substate,
  name:              :Job_Name,
  owner:             :Job_Owner,
  used:              :resources_used,
  state:             :job_state,
  queue:             :queue,
  server:            :server,
  maxrun:            :max_running,
  maxreport:         :max_report,
  total:             :total_jobs,
  comment:           :comment,
  cookie:            :cookie,
  qrank:             :queue_rank,
  altid:             :alt_id,
  etime:             :etime,
  exitstat:          :exit_status,
  forwardx11:        :forward_x11,
  submit_args:       :submit_args,
  tokens:            :tokens,
  netcounter:        :net_counter,
  umask:             :umask,
  start_time:        :start_time,
  start_count:       :start_count,
  checkpoint_dir:    :checkpoint_dir,
  checkpoint_name:   :checkpoint_name,
  checkpoint_time:   :checkpoint_time,
  checkpoint_restart_status:  :checkpoint_restart_status,
  restart_name:      :restart_name,
  comp_time:         :comp_time,
  reported:          :reported,
  intcmd:            :inter_cmd,
  job_radix:         :job_radix,
  sister_list:       :sister_list,
  total_runtime:     :total_runtime,
  P:                 :proxy_user,
  node_exclusive:    :node_exclusive,
  exec_gpus:         :exec_gpus,
  exec_mics:         :exec_mics,
  J:                 :job_id,
  pagg:              :pagg_id,
  system_start_time: :system_start_time,
  gpu_flags:         :gpu_flags,

  # Additional queue attribute names

  # Additional server attribute names

  # Additional node attribute names
}.freeze # frozen so the shared lookup table cannot be mutated at runtime

Instance Method Summary collapse

Methods inherited from OodCore::Job::Adapter

#accounts, #cluster_info, #info_all_each, #info_where_owner_each, #job_name_illegal_chars, #nodes, #queues, #sanitize_job_name, #supports_job_arrays?

Constructor Details

#initialize(opts = {}) ⇒ Torque

This method is part of a private API. You should avoid using this method if possible, as it may be removed or be changed in the future.

Returns a new instance of Torque.

Parameters:

  • opts (#to_h) (defaults to: {})

    the options defining this adapter

Options Hash (opts):

See Also:



57
58
59
60
61
# File 'lib/ood_core/job/adapters/torque.rb', line 57

# Build the adapter from an options object.
#
# @api private
# @param opts [#to_h] the options defining this adapter
# @option opts [Object] :pbs the batch object every job operation is
#   delegated to (required; key may be given as a String or a Symbol)
# @raise [ArgumentError] if no :pbs entry is supplied
def initialize(opts = {})
  options = opts.to_h.symbolize_keys

  unless options.key?(:pbs)
    raise ArgumentError, "No pbs object specified. Missing argument: pbs"
  end

  @pbs = options[:pbs]
end

Instance Method Details

#delete(id) ⇒ void

This method returns an undefined value.

Delete the submitted job

Parameters:

  • id (#to_s)

    the id of the job

Raises:

See Also:



291
292
293
294
295
296
297
298
299
# File 'lib/ood_core/job/adapters/torque.rb', line 291

# Delete the submitted job.
#
# @param id [#to_s] the id of the job
# @return [void]
# @raise [JobAdapterError] if the batch object raises any other
#   Torque::Batch::Error
def delete(id)
  job_id = id.to_s
  @pbs.delete_job(job_id)
rescue Torque::FFI::UnkjobidError
  # The job id cannot be found: assume the job was deleted successfully.
  nil
rescue Torque::FFI::BadstateError
  # The job is exiting or completed: assume the deletion succeeded.
  nil
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#directive_prefix ⇒ Object



301
302
303
# File 'lib/ood_core/job/adapters/torque.rb', line 301

# Prefix that marks a scheduler directive line inside a job script.
#
# Torque's qsub treats lines starting with `#PBS` as directives by default.
# The previous value `#QSUB` is not that default, so such lines would be
# read as plain comments unless the site overrides the prefix.
#
# @return [String] the directive prefix
def directive_prefix
  '#PBS'
end

#hold(id) ⇒ void

This method returns an undefined value.

Put the submitted job on hold

Parameters:

  • id (#to_s)

    the id of the job

Raises:

See Also:



263
264
265
266
267
268
269
270
# File 'lib/ood_core/job/adapters/torque.rb', line 263

# Put the submitted job on hold.
#
# @param id [#to_s] the id of the job
# @return [void]
# @raise [JobAdapterError] if the batch object raises any other
#   Torque::Batch::Error
def hold(id)
  job_id = id.to_s
  @pbs.hold_job(job_id)
rescue Torque::FFI::UnkjobidError
  # The job id cannot be found: assume the hold succeeded.
  nil
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#info(id) ⇒ Info

Retrieve job info from the resource manager

Parameters:

  • id (#to_s)

    the id of the job

Returns:

  • (Info)

    information describing submitted job

Raises:

See Also:



218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# File 'lib/ood_core/job/adapters/torque.rb', line 218

# Retrieve job info from the resource manager.
#
# @param id [#to_s] the id of the job
# @return [Info] information describing submitted job
# @raise [JobAdapterError] if the batch object raises any other
#   Torque::Batch::Error
def info(id)
  id = id.to_s
  jobs = @pbs.get_job(id)

  # Anything other than exactly one entry is handed to the job-array parser.
  return parse_job_array(id, jobs) unless jobs.keys.length == 1

  # Single entry: flatten the one key/value pair into two arguments.
  parse_job_info(*jobs.flatten)
rescue Torque::FFI::UnkjobidError
  # The job id cannot be found: report the job as completed.
  Info.new(id: id, status: :completed)
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#info_all(attrs: nil) ⇒ Array<Info>

Retrieve info for all jobs from the resource manager

Returns:

  • (Array<Info>)

    information describing submitted jobs

Raises:

See Also:



187
188
189
190
191
192
193
# File 'lib/ood_core/job/adapters/torque.rb', line 187

# Retrieve info for all jobs from the resource manager.
#
# @param attrs [Object] accepted but not used by this implementation
# @return [Array<Info>] information describing submitted jobs
# @raise [JobAdapterError] if the batch object raises Torque::Batch::Error
def info_all(attrs: nil)
  jobs = @pbs.get_jobs
  jobs.map { |job_id, attributes| parse_job_info(job_id, attributes) }
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#info_where_owner(owner, attrs: nil) ⇒ Array<Info>

Retrieve info for all jobs for a given owner or owners from the resource manager

Parameters:

  • owner (#to_s, Array<#to_s>)

    the owner(s) of the jobs

Returns:

  • (Array<Info>)

    information describing submitted jobs

Raises:



200
201
202
203
204
205
206
207
208
209
210
211
# File 'lib/ood_core/job/adapters/torque.rb', line 200

# Retrieve info for all jobs for a given owner or owners from the resource
# manager.
#
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs
# @param attrs [Object] accepted but not used by this implementation
# @return [Array<Info>] information describing submitted jobs
# @raise [JobAdapterError] if the batch object raises Torque::Batch::Error
def info_where_owner(owner, attrs: nil)
  owners = Array.wrap(owner).map(&:to_s)

  # One equality filter on User_List carrying the comma-joined owner names.
  owner_filter = { name: "User_List", value: owners.join(","), op: :eq }
  selected = @pbs.select_jobs(attribs: [owner_filter])

  selected.map { |job_id, attributes| parse_job_info(job_id, attributes) }
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#release(id) ⇒ void

This method returns an undefined value.

Release the job that is on hold

Parameters:

  • id (#to_s)

    the id of the job

Raises:

See Also:



277
278
279
280
281
282
283
284
# File 'lib/ood_core/job/adapters/torque.rb', line 277

# Release the job that is on hold.
#
# @param id [#to_s] the id of the job
# @return [void]
# @raise [JobAdapterError] if the batch object raises any other
#   Torque::Batch::Error
def release(id)
  job_id = id.to_s
  @pbs.release_job(job_id)
rescue Torque::FFI::UnkjobidError
  # The job id cannot be found: assume the release succeeded.
  nil
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#status(id) ⇒ Status

Retrieve job status from resource manager

Parameters:

  • id (#to_s)

    the id of the job

Returns:

Raises:

See Also:



242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# File 'lib/ood_core/job/adapters/torque.rb', line 242

# Retrieve job status from resource manager.
#
# @param id [#to_s] the id of the job
# @return [Status] the greatest status among the entries returned for the id
# @raise [JobAdapterError] if the batch object raises any other
#   Torque::Batch::Error
def status(id)
  job_id = id.to_s
  # Only the job_state attribute is requested from the batch object.
  job_attrs = @pbs.get_job(job_id, filters: [:job_state]).values

  statuses = job_attrs.map do |attrs|
    # State characters missing from STATE_MAP become :undetermined.
    state = STATE_MAP.fetch(attrs[:job_state], :undetermined)
    OodCore::Job::Status.new(state: state)
  end

  # Several entries may come back for one id; report the maximum status.
  statuses.max
rescue Torque::FFI::UnkjobidError
  # The job id cannot be found: report the job as completed.
  Status.new(state: :completed)
rescue Torque::Batch::Error => error
  raise JobAdapterError, error.message
end

#submit(script, after: [], afterok: [], afternotok: [], afterany: []) ⇒ String

Submit a job with the attributes defined in the job template instance

Parameters:

  • script (Script)

    script object that describes the script and attributes for the submitted job

  • after (#to_s, Array<#to_s>) (defaults to: [])

    this job may be scheduled for execution at any point after dependent jobs have started execution

  • afterok (#to_s, Array<#to_s>) (defaults to: [])

    this job may be scheduled for execution only after dependent jobs have terminated with no errors

  • afternotok (#to_s, Array<#to_s>) (defaults to: [])

    this job may be scheduled for execution only after dependent jobs have terminated with errors

  • afterany (#to_s, Array<#to_s>) (defaults to: [])

    this job may be scheduled for execution after dependent jobs have terminated

Returns:

  • (String)

    the job id returned after successfully submitting a job

Raises:

See Also:



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# File 'lib/ood_core/job/adapters/torque.rb', line 77

# Submit a job with the attributes defined in the job template instance.
#
# The submission path depends on the type of `script.native`:
#
# * Hash (legacy): job options are collected into headers, resources and
#   environment variables and passed to `@pbs.submit_string`.
# * anything else (Array or nil): job options are turned into qsub-style
#   arguments and passed to `@pbs.submit`.
#
# The script object is never modified: the environment handed to the batch
# object in the Hash path is a fresh Hash, so the same script can be
# submitted more than once without its values being shell-escaped twice.
#
# @param script [Script] script object that describes the script and
#   attributes for the submitted job
# @param after [#to_s, Array<#to_s>] this job may be scheduled for execution
#   at any point after dependent jobs have started execution
# @param afterok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with no errors
# @param afternotok [#to_s, Array<#to_s>] this job may be scheduled for
#   execution only after dependent jobs have terminated with errors
# @param afterany [#to_s, Array<#to_s>] this job may be scheduled for
#   execution after dependent jobs have terminated
# @return [String] the job id returned after successfully submitting a job
# @raise [JobAdapterError] if the batch object raises Torque::Batch::Error
def submit(script, after: [], afterok: [], afternotok: [], afterany: [])
  after      = Array(after).map(&:to_s)
  afterok    = Array(afterok).map(&:to_s)
  afternotok = Array(afternotok).map(&:to_s)
  afterany   = Array(afterany).map(&:to_s)

  # Set dependencies
  depend = []
  depend << "after:#{after.join(':')}"           unless after.empty?
  depend << "afterok:#{afterok.join(':')}"       unless afterok.empty?
  depend << "afternotok:#{afternotok.join(':')}" unless afternotok.empty?
  depend << "afterany:#{afterany.join(':')}"     unless afterany.empty?

  # Set mailing options
  mail_points = ""
  mail_points += "b" if script.email_on_started
  mail_points += "e" if script.email_on_terminated

  # FIXME: Remove the Hash option once all Interactive Apps are
  # converted to Array format
  if script.native.is_a?(Hash)
    native = script.native

    # Set headers
    headers = {}
    headers.merge!(job_arguments: script.args.join(' ')) unless script.args.nil?
    headers.merge!(Hold_Types: :u) if script.submit_as_hold
    headers.merge!(Rerunable: script.rerunnable ? 'y' : 'n') unless script.rerunnable.nil?
    headers.merge!(init_work_dir: script.workdir) unless script.workdir.nil?
    headers.merge!(Mail_Users: script.email.join(',')) unless script.email.nil?
    headers.merge!(Mail_Points: mail_points) unless mail_points.empty?
    headers.merge!(Job_Name: script.job_name) unless script.job_name.nil?
    headers.merge!(Shell_Path_List: script.shell_path) unless script.shell_path.nil?
    # ignore input_path (not defined in Torque)
    headers.merge!(Output_Path: script.output_path) unless script.output_path.nil?
    headers.merge!(Error_Path: script.error_path) unless script.error_path.nil?
    # If error_path is not specified we join stdout & stderr (as this
    # mimics what the other resource managers do)
    headers.merge!(Join_Path: 'oe') if script.error_path.nil?
    headers.merge!(reservation_id: script.reservation_id) unless script.reservation_id.nil?
    headers.merge!(Priority: script.priority) unless script.priority.nil?
    headers.merge!(Execution_Time: script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")) unless script.start_time.nil?
    headers.merge!(Account_Name: script.accounting_id) unless script.accounting_id.nil?
    headers.merge!(depend: depend.join(','))       unless depend.empty?
    headers.merge!(job_array_request: script.job_array_request) unless script.job_array_request.nil?

    # Set resources
    resources = {}
    resources.merge!(walltime: seconds_to_duration(script.wall_time)) unless script.wall_time.nil?

    # Set native options (native is known to be a Hash in this branch);
    # native entries take precedence over the values derived above
    headers.merge!   native.fetch(:headers, {})
    resources.merge! native.fetch(:resources, {})

    # Set environment variables. Build a new Hash instead of merging into
    # and escaping script.job_environment in place, which would modify the
    # caller's script object.
    envvars = (script.job_environment || {}).merge(native.fetch(:envvars, {}))
    envvars = envvars.transform_values { |v| Shellwords.escape(v) }

    # Submit job
    @pbs.submit_string(script.content, queue: script.queue_name, headers: headers, resources: resources, envvars: envvars)
  else
    # Set qsub arguments
    args = []
    args.concat ["-F", script.args.join(" ")] unless script.args.nil?
    args.concat ["-h"] if script.submit_as_hold
    args.concat ["-r", script.rerunnable ? "y" : "n"] unless script.rerunnable.nil?
    args.concat ["-M", script.email.join(",")] unless script.email.nil?
    args.concat ["-m", mail_points] unless mail_points.empty?
    args.concat ["-N", script.job_name] unless script.job_name.nil?
    args.concat ["-S", script.shell_path] unless script.shell_path.nil?
    # ignore input_path (not defined in Torque)
    args.concat ["-o", script.output_path] unless script.output_path.nil?
    args.concat ["-e", script.error_path] unless script.error_path.nil?
    args.concat ["-W", "x=advres:#{script.reservation_id}"] unless script.reservation_id.nil?
    args.concat ["-q", script.queue_name] unless script.queue_name.nil?
    args.concat ["-p", script.priority] unless script.priority.nil?
    args.concat ["-a", script.start_time.localtime.strftime("%C%y%m%d%H%M.%S")] unless script.start_time.nil?
    args.concat ["-A", script.accounting_id] unless script.accounting_id.nil?
    args.concat ["-W", "depend=#{depend.join(",")}"] unless depend.empty?
    args.concat ["-l", "walltime=#{seconds_to_duration(script.wall_time)}"] unless script.wall_time.nil?
    args.concat ['-t', script.job_array_request] unless script.job_array_request.nil?
    args.concat ['-l', "qos=#{script.qos}"] unless script.qos.nil?
    args.concat ['-l', "gpus=#{script.gpus_per_node}"] unless script.gpus_per_node.nil?

    # Set environment variables: only the names go on the command line, the
    # values are handed over through the env: option below
    env = script.job_environment.to_h
    args.concat ["-v", env.keys.join(",")] unless env.empty?
    args.concat ["-V"] if script.copy_environment?

    # If error_path is not specified we join stdout & stderr (as this
    # mimics what the other resource managers do)
    args.concat ["-j", "oe"] if script.error_path.nil?

    # Set native options (appended last)
    args.concat script.native if script.native

    # Submit job
    @pbs.submit(script.content, args: args, env: env, chdir: script.workdir)
  end
rescue Torque::Batch::Error => e
  raise JobAdapterError, e.message
end