Module: HPC::SLURM
Class Method Summary
collapse
batch_dir_for_id, batch_options, batch_system_variables, cleanup_environment, coda, exec_cmd, execute, follow_job, header, hold_dependencies, job_queued, job_template, load_conda, load_modules, meta_data, prepare_environment, prepare_submision, rbbt_job_exec_cmd, run_job, sync_environment, wait_for_job
accumulate_rules, add_batch_deps, add_config_keys, add_rules_and_consolidate, chain_batches, check_chains, get_chains, get_job_dependencies, get_recursive_job_dependencies, job_batches, job_chains, job_dependencies, job_rules, job_rules, job_workload, merge_rules, orchestrate_job, orchestration_rules, parse_chains, pb, piggyback, prepare_for_execution, task_specific_rules, workflow_rules, workload
Class Method Details
.batch_system ⇒ Object
9
10
11
|
# File 'lib/rbbt/hpc/slurm.rb', line 9
# Name of the batch system this module drives.
#
# @return [String] always the literal "SLURM"
def self.batch_system
  'SLURM'
end
|
.batch_system_variables ⇒ Object
13
14
15
16
17
18
19
20
21
22
23
24
25
|
# File 'lib/rbbt/hpc/slurm.rb', line 13
# Shell snippet that derives memory limits and exports batch metadata.
#
# The returned script text:
# * counts processors listed in /proc/cpuinfo (TOTAL_PROCESORS);
# * computes MAX_MEMORY_DEFAULT from MemTotal in /proc/meminfo, scaled by
#   the ratio of $SLURM_CPUS_PER_TASK to the processor count
#   (presumably MB, given MemTotal is reported in kB -- confirm);
# * overrides MAX_MEMORY with $SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK
#   when SLURM_MEM_PER_CPU is set, and then with $SLURM_MEM_PER_NODE when
#   that is set (the later assignment wins);
# * exports MAX_MEMORY_DEFAULT, MAX_MEMORY, BATCH_JOB_ID and BATCH_SYSTEM.
#
# @return [String] shell code, newline terminated
def self.batch_system_variables
  <<~SHELL
    let TOTAL_PROCESORS="$(cat /proc/cpuinfo|grep ^processor |wc -l)"
    let MAX_MEMORY_DEFAULT="$(grep MemTotal /proc/meminfo|grep -o "[[:digit:]]*") / ( (1024 * $TOTAL_PROCESORS) / $SLURM_CPUS_PER_TASK )"
    MAX_MEMORY="$MAX_MEMORY_DEFAULT"
    [ ! -z $SLURM_MEM_PER_CPU ] && let MAX_MEMORY="$SLURM_MEM_PER_CPU * $SLURM_CPUS_PER_TASK"
    [ ! -z $SLURM_MEM_PER_NODE ] && MAX_MEMORY="$SLURM_MEM_PER_NODE"
    export MAX_MEMORY_DEFAULT
    export MAX_MEMORY
    export BATCH_JOB_ID=$SLURM_JOB_ID
    export BATCH_SYSTEM=#{batch_system}
  SHELL
end
|
.header(options = {}) ⇒ Object
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
# File 'lib/rbbt/hpc/slurm.rb', line 27
# Builds the `#SBATCH` preamble of a SLURM batch script.
#
# The options hash is duplicated before use, so the caller's hash is not
# modified. Recognised keys: :queue (emitted as --qos), :account,
# :partition, :task_cpus (--cpus-per-task), :time, :nodes, :workdir (read
# but not emitted), :exclusive, :highmem, :licenses, :constraint, :gres,
# :mem, :mem_per_cpu, :batch_dir and :batch_name (--job-name).
#
# @param options [Hash] batch options; :batch_dir is required because the
#   std.out / std.err paths are built from it
# @return [String] "#!/bin/bash" followed by one "#SBATCH --name=\"value\""
#   line per set parameter, newline terminated
def self.header(options = {})
  options = options.dup

  queue       = Misc.process_options options, :queue
  account     = Misc.process_options options, :account
  partition   = Misc.process_options options, :partition
  task_cpus   = Misc.process_options options, :task_cpus
  time        = Misc.process_options options, :time
  nodes       = Misc.process_options options, :nodes
  workdir     = Misc.process_options options, :workdir # consumed so it is removed from options; not emitted
  exclusive   = Misc.process_options options, :exclusive
  highmem     = Misc.process_options options, :highmem
  licenses    = Misc.process_options options, :licenses
  constraint  = Misc.process_options options, :constraint
  gres        = Misc.process_options options, :gres

  # :highmem is expressed as an extra "highmem" constraint AND-ed to any given one.
  constraint = [constraint, "highmem"].compact * "&" if highmem

  mem         = Misc.process_options options, :mem
  mem_per_cpu = Misc.process_options options, :mem_per_cpu

  batch_dir   = Misc.process_options options, :batch_dir
  batch_name  = Misc.process_options options, :batch_name

  fout = File.join(batch_dir, 'std.out')
  ferr = File.join(batch_dir, 'std.err')

  # Values without a ":" are treated as a time span and reformatted.
  # A missing :time is left as nil so that no --time line is emitted.
  time = Misc.format_seconds Misc.timespan(time) unless time.nil? || time.include?(":")

  sbatch_params = {
    "job-name"      => batch_name,
    "qos"           => queue,
    "account"       => account,
    "partition"     => partition,
    "output"        => fout,
    "error"         => ferr,
    "cpus-per-task" => task_cpus,
    "nodes"         => nodes,
    "time"          => time,
    "constraint"    => constraint,
    "exclusive"     => exclusive,
    "licenses"      => licenses,
    "gres"          => gres,
    "mem"           => mem,
    "mem-per-cpu"   => mem_per_cpu,
  }

  lines = ["#!/bin/bash"]

  sbatch_params.each do |name, value|
    # Unset, blank and explicitly disabled (false) parameters produce no line.
    next if value.nil? || value == "" || value == false

    case value
    when true
      # Boolean flags are emitted without a value.
      lines << "#SBATCH --#{name}"
    when Array
      # Array values repeat the directive once per element.
      value.each { |v| lines << "#SBATCH --#{name}=\"#{v}\"" }
    else
      lines << "#SBATCH --#{name}=\"#{value}\""
    end
  end

  lines.join("\n") + "\n"
end
|
.job_status(job = nil) ⇒ Object
148
149
150
151
152
153
154
155
156
157
158
|
# File 'lib/rbbt/hpc/slurm.rb', line 148
# Raw `squeue` output, for the whole queue or for a single job.
#
# Without a job the full `squeue` listing is returned and any error from
# the command propagates. With a job, `squeue --job <job>` is run and any
# StandardError it raises is turned into an empty string.
#
# @param job [Object, nil] job id to query, or nil for the full queue
# @return [String] text read from the squeue command ("" on a failed
#   per-job query)
def self.job_status(job = nil)
  return CMD.cmd("squeue").read if job.nil?

  begin
    CMD.cmd("squeue --job #{job}").read
  rescue
    ""
  end
end
|
.run_template(batch_dir, dry_run) ⇒ Object
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
# File 'lib/rbbt/hpc/slurm.rb', line 94
# Submits the prepared batch script in +batch_dir+ through `sbatch`.
#
# Steps, in order:
# 1. Returns nil right away if 'exit.status' already exists.
# 2. Logs the content of 'command.batch'.
# 3. If 'job.id' exists, returns the id stored there (as an Integer).
# 4. Builds a --dependency flag from 'dependencies.list' (afterok) and
#    'canfail_dependencies.list' (afterany), one job id per line.
# 5. Returns nil if 'std.out' already exists.
# 6. On a dry run prints instructions to STDERR and raises.
# 7. Otherwise removes sync/exit/stdout/stderr files, runs sbatch, stores
#    the first number found in its output in 'job.id' and returns it.
#
# @param batch_dir [String] directory holding the batch files
# @param dry_run [Boolean] when truthy, do not submit
# @return [Integer, nil] the job id, or nil when nothing was submitted
# @raise [HPC::BATCH_DRY_RUN] on a dry run, with batch_dir as the message
def self.run_template(batch_dir, dry_run)
  stdout_file       = File.join(batch_dir, 'std.out')
  stderr_file       = File.join(batch_dir, 'std.err')
  job_id_file       = File.join(batch_dir, 'job.id')
  deps_file         = File.join(batch_dir, 'dependencies.list')
  canfail_deps_file = File.join(batch_dir, 'canfail_dependencies.list')
  exit_file         = File.join(batch_dir, 'exit.status')
  sync_file         = File.join(batch_dir, 'sync.log')
  script_file       = File.join(batch_dir, 'command.batch')

  return if Open.exists?(exit_file)

  Log.info "Issuing SLURM file: #{script_file}"
  Log.debug Open.read(script_file)

  # A recorded job id is reused instead of submitting again.
  return Open.read(job_id_file).to_i if File.exist?(job_id_file)

  ok_deps      = File.exist?(deps_file) ? Open.read(deps_file).split("\n") : nil
  canfail_deps = File.exist?(canfail_deps_file) ? Open.read(canfail_deps_file).split("\n") : nil

  clauses = []
  clauses << "afterok:" + ok_deps * ":" if ok_deps && ok_deps.any?
  clauses << "afterany:" + canfail_deps * ":" if canfail_deps && canfail_deps.any?

  dependency_flag = clauses.empty? ? "" : '--dependency=' + clauses * ","
  submit_cmd = "sbatch #{dependency_flag} '#{script_file}'"

  return if File.exist?(stdout_file)

  if dry_run
    STDERR.puts Log.color(:magenta, "To execute run: ") + Log.color(:blue, "sbatch '#{script_file}'")
    STDERR.puts Log.color(:magenta, "To monitor progress run (needs local rbbt): ") + Log.color(:blue, "rbbt slurm tail '#{batch_dir}'")
    raise HPC::BATCH_DRY_RUN, batch_dir
  end

  # Clear leftovers from earlier attempts before submitting.
  [sync_file, exit_file, stdout_file, stderr_file].each { |stale| Open.rm stale }

  job_id = CMD.cmd(submit_cmd).read.scan(/\d+/).first.to_i
  Log.debug "SBATCH job id: #{job_id}"
  Open.write(job_id_file, job_id.to_s)
  job_id
end
|