Class: ComputeUnit::NvidiaGpu

Inherits:
Gpu show all
Defined in:
lib/compute_unit/gpus/nvidia_gpu.rb

Constant Summary collapse

VENDOR_ID =
'10de'
MAKE =
'Nvidia'
SUBTYPE =
'nvidia'
NVIDIA_SMI =
'/usr/bin/nvidia-smi'
NVIDIA_PROC_PATH =
ENV['NVIDIA_PROC_PATH'] || File.join(ComputeUnit::Device::PROC_PATH, 'driver', 'nvidia', 'gpus')

Constants inherited from Gpu

Gpu::DEVICE_CLASS, Gpu::DEVICE_CLASS_NAME

Constants inherited from ComputeBase

ComputeBase::CACHE_TIMEOUT

Constants inherited from Device

Device::PROC_PATH, Device::SYSFS_DEVICES_PATH

Instance Attribute Summary

Attributes inherited from Gpu

#bios, #name, #pci_loc, #use_opencl

Attributes inherited from ComputeBase

#compute_type, #index, #power_offset, #serial, #timestamp, #type, #uuid

Attributes inherited from Device

#device_class_id, #device_id, #device_path, #device_vendor_id, #make, #model, #subsystem_device_id, #subsystem_vendor_id, #vendor

Class Method Summary collapse

Instance Method Summary collapse

Methods inherited from Gpu

#asic_temp, attached_processes, #compute_type, #configured_core_voltage, #core_voltage, #fan_limit, #fan_max_limit, #fan_min_limit, found_devices, #hardware_info, #mem_info, #mem_temp, #memory_volt, #opencl_board_name, opencl_cache, #opencl_device, opencl_devices, opencl_devices_from_cache, opencl_devices_from_platform, #opencl_name, #opencl_units, #status, #status_info, #to_h, #vddgfx, #voltage_table

Methods inherited from ComputeBase

#attached_processes, compute_classes, #device_class_name, #expired_metadata?, #top_processes

Methods included from Logger

color, log_file, log_level, logger, #logger

Methods inherited from Device

#base_hwmon_path, device, device_class, device_lookup, device_vendor, #expired_metadata?, #generic_model, #hwmon_path, #lock_rom, logger, manual_device_database, manual_device_lookup, manual_vendor_lookup, manual_vendors, name_map, name_translation, pci_database, #read_file, #read_hwmon_data, #read_kernel_setting, read_kernel_setting, #rom_data, #rom_path, subsystem_device, subsystem_device_lookup, subsystem_vendor, subsystem_vendor_lookup, #sysfs_model_name, system_checksum, #to_h, #to_json, #unlock_rom, vendor_lookup, #write_hwmon_data, #write_kernel_setting, write_kernel_setting

Methods included from Utils

check_for_root, #root?, root?

Constructor Details

#initialize(device_path, opts = {}) ⇒ NvidiaGpu

Returns a new instance of NvidiaGpu.



13
14
15
16
17
18
19
20
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 13

# Builds an NvidiaGpu from the kernel information file for the device,
# merged with the caller-supplied options (options win over file values).
# The :pci_loc, :busid, :bios and :uuid entries are then derived and
# overwrite any value of the same key coming from the file or from opts.
#
# @param device_path [String] the device path of the gpu
# @param opts [Hash] extra attributes merged over the information file data
def initialize(device_path, opts = {})
  info = self.class.read_information_file(device_path).merge(opts)
  video_bios = info[:video_bios]

  info[:pci_loc] = device_path
  info[:busid] = info[:bus_location]
  # :bios is only set when the information file (or opts) provided a video bios
  info[:bios] = video_bios.upcase if video_bios
  info[:uuid] = info[:gpu_uuid]

  super(device_path, info)
end

Class Method Details

.blank_dataObject



41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 41

# Zeroed readings keyed by the same column names nvidia-smi reports.
# The hash is built on first use and the same instance is returned afterwards.
#
# @return [Hash] column name => placeholder value ('0' strings, pstate is 7)
def self.blank_data
  return @blank_data if @blank_data

  @blank_data = {
    'memory.used [MiB]' => '0',
    'memory.free [MiB]' => '0',
    'memory.total [MiB]' => '0',
    'utilization.gpu [%]' => '0',
    'temperature.gpu' => '0',
    'power.draw [W]' => '0',
    'power.limit [W]' => '0',
    'power.max_limit [W]' => '0',
    'pstate' => 7,
    'fan.speed [%]' => '0',
    'clocks.current.memory [MHz]' => '0',
    'clocks.current.sm [MHz]' => '0'
  }
end

.create_from_path(device_path, index, use_opencl = false) ⇒ Object

Parameters:

  • device_path (String)
    • the device path of the device

  • index (Integer)
    • the index of the device relative to other devices of the same class ie. GPU0



174
175
176
177
178
179
180
181
182
183
184
185
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 174

# Creates an instance for the device at device_path, looking up the PCI
# identifiers for that path and passing them to the constructor.
#
# @param device_path [String] the device path of the device
# @param index [Integer] the index of the device relative to other devices of the same class, i.e. GPU0
# @param use_opencl [Boolean] forwarded to the instance as :use_opencl
# @return [Object] whatever .new returns for the given path and attributes
def self.create_from_path(device_path, index, use_opencl = false)
  attributes = {}
  attributes[:device_class_id] = device_class(device_path)
  attributes[:device_id] = device(device_path)
  attributes[:device_vendor_id] = device_vendor(device_path)
  attributes[:subsystem_vendor_id] = subsystem_vendor(device_path)
  attributes[:subsystem_device_id] = subsystem_device(device_path)
  attributes[:use_opencl] = use_opencl
  attributes[:index] = index
  new(device_path, attributes)
end

.devicesArray

Returns a list of device paths of all devices specific to the vendor id.

Returns:

  • (Array)
    • returns a list of device paths of all devices specific to the vendor id



168
169
170
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 168

# Filters the GPU device paths down to those whose vendor id is VENDOR_ID.
#
# @return [Array] device paths of all devices specific to the vendor id
def self.devices
  gpu_paths = ComputeUnit::Gpu.devices
  gpu_paths.select { |path| device_vendor(path) == VENDOR_ID }
end

.find_all(use_opencl = false) ⇒ Array

Returns an array of GPU instances of NVIDIA type only.

Returns:

  • (Array)
    • returns an array of GPU instances of NVIDIA type only



188
189
190
191
192
193
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 188

# Creates an instance for every device path returned by .devices.
# Each instance's index is the position of its path within
# ComputeUnit::Gpu.found_devices (nil when the path is not in that list),
# not its position within .devices.
#
# @param use_opencl [Boolean] forwarded to .create_from_path
# @return [Array] gpu instances of NVIDIA type only
def self.find_all(use_opencl = false)
  # Look the list up once; it does not depend on the path being mapped.
  found = ComputeUnit::Gpu.found_devices
  devices.map do |device_path|
    create_from_path(device_path, found.index(device_path), use_opencl)
  end
end

.read_information_file(device_path) ⇒ Hash

:model=>"GeForce GTX 1070",

:irq=>"130",
:gpu_uuid=>"GPU-0116fb5c-66f4-1cba-c216-97f4600a8152",
:video_bios=>"86.04.50.40.4a",
:bus_type=>"PCIe",
:dma_size=>"47 bits",
:dma_mask=>"0x7fffffffffff",
:bus_location=>"0000:0d:00.0",
:device_minor=>"7"

Returns:

  • (Hash)
    • hash of card info given by the kernel



211
212
213
214
215
216
217
218
219
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 211

# Reads <NVIDIA_PROC_PATH>/<basename of device_path>/information and turns
# each "Label: value" line into a hash entry. Labels are downcased, spaces
# become underscores, and the result is used as a Symbol key.
#
# @param device_path [String] the device path of the gpu
# @return [Hash] hash of card info given by the kernel
def self.read_information_file(device_path)
  path = File.join(NVIDIA_PROC_PATH, File.basename(device_path), 'information')
  pairs = File.read(path).scan(/\n?([\w\s]*):\s+(.*)/)

  pairs.each_with_object({}) do |(label, value), info|
    info[label.downcase.tr(' ', '_').to_sym] = value
  end
end

Instance Method Details

#core_clockInteger

Returns the current core clock speed.

Returns:

  • (Integer)

    the current core clock speed



94
95
96
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 94

# Reads the 'clocks.current.sm [MHz]' entry from #meta.
#
# @return [Integer] the current core clock speed
def core_clock
  sm_clock = meta['clocks.current.sm [MHz]']
  sm_clock.to_i
end

#fanInteger

Returns the fan speed.

Returns:

  • (Integer)

    the fan speed



99
100
101
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 99

# Reads the 'fan.speed [%]' entry from #meta.
#
# @return [Integer] the fan speed
def fan
  fan_speed = meta['fan.speed [%]']
  fan_speed.to_i
end

#information_fileObject



160
161
162
163
164
165
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 160

# Path of this device's information file under NVIDIA_PROC_PATH.
# The path is computed once and reused on later calls.
#
# @return [String] <NVIDIA_PROC_PATH>/<basename of device_path>/information
def information_file
  return @information_file if @information_file

  @information_file = File.join(NVIDIA_PROC_PATH, File.basename(device_path), 'information')
end

#memory_clockInteger

Returns the current memory clock speed.

Returns:

  • (Integer)

    the current memory clock speed



89
90
91
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 89

# Reads the 'clocks.current.memory [MHz]' entry from #meta.
#
# @return [Integer] the current memory clock speed
def memory_clock
  mem_clock = meta['clocks.current.memory [MHz]']
  mem_clock.to_i
end

#memory_freeObject



152
153
154
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 152

# Returns the 'memory.free [MiB]' entry from #meta without any conversion.
#
# @return [Object] the raw value stored under that key
#   (presumably a String parsed from nvidia-smi output; confirm against callers)
def memory_free
  meta['memory.free [MiB]']
end

#memory_totalObject



144
145
146
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 144

# Returns the 'memory.total [MiB]' entry from #meta without any conversion.
#
# @return [Object] the raw value stored under that key
#   (presumably a String parsed from nvidia-smi output; confirm against callers)
def memory_total
  meta['memory.total [MiB]']
end

#memory_usedObject



148
149
150
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 148

# Returns the 'memory.used [MiB]' entry from #meta without any conversion.
#
# @return [Object] the raw value stored under that key
#   (presumably a String parsed from nvidia-smi output; confirm against callers)
def memory_used
  meta['memory.used [MiB]']
end

#metaHash

return cached data or fetch new data

Returns:

  • (Hash)

    the metadata from the nvidia-smi tool



32
33
34
35
36
37
38
39
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 32

# Returns cached data or fetches new data.
# When #expired_metadata? is true the cache is replaced with a fresh
# #metadata result; otherwise #metadata is only called if nothing is cached.
#
# @return [Hash] the metadata from the nvidia-smi tool
def meta
  if expired_metadata?
    logger.debug("Expired Nvidia Data for #{uuid} ")
    @meta = metadata
  else
    @meta ||= metadata
  end
end

#metadataHash

Note:

data returned from nvidia-smi

"memory.used [MiB]": "2578 MiB",

"memory.free [MiB]": "5534 MiB",
"memory.total [MiB]": "8112 MiB",
"utilization.gpu [%]": "100",
"temperature.gpu": "53",
"power.draw [W]": "129.21",
"power.limit [W]": "130.00",
"power.max_limit [W]": "217.00",
"pstate": 2,
"fan.speed [%]": "75"

Returns:

  • (Hash)

    "name": "GeForce GTX 1070 Ti", "vbios_version": "86.04.85.00.63", "uuid": "GPU-a583cb04-f9b5-68f3-50b9-2b4ba1c7d14e",



73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 73

# Queries nvidia-smi for this card (selected with -i <index>) and returns the
# first CSV row as a hash keyed by the stripped column headers.
# Falls back to the class-level blank data when the command exits with a
# failure status or when the output contains no data row, so callers always
# receive a Hash.
#
# @return [Hash] the readings reported by nvidia-smi, values as stripped strings
def metadata
  logger.debug("Calling #{NVIDIA_SMI}")
  fields = %w[
    gpu_name vbios_version uuid memory.used memory.free memory.total
    utilization.gpu temperature.gpu power.draw power.limit power.max_limit
    fan.speed pstate clocks.current.memory clocks.current.sm
  ].join(',')
  # stderr is folded into stdout so a failure message can be logged below
  data = `#{NVIDIA_SMI} --query-gpu=#{fields} -i #{index} --format=csv,nounits 2>&1`
  unless $CHILD_STATUS.success?
    # error code 15
    logger.error(data.delete("\n"))
    return self.class.blank_data
  end

  cards = CSV.parse(data, headers: true, header_converters: ->(f) { f.strip },
                          converters: ->(f) { f ? f.strip : nil }).map(&:to_h)
  # A header-only (or empty) response has no rows; do not hand back nil.
  cards.first || self.class.blank_data
end

#powerFloat

Returns the power being used by the gpu.

Returns:

  • (Float)

    the power being used by the gpu



104
105
106
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 104

# Adds #power_offset to the 'power.draw [W]' reading from #meta.
#
# @return [Float] the power being used by the gpu
def power
  draw_watts = meta['power.draw [W]'].strip.to_f
  draw_watts + power_offset
end

#power_limitObject



116
117
118
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 116

# Reads the 'power.limit [W]' entry from #meta.
#
# @return [Float] the stripped reading converted with to_f
def power_limit
  limit = meta['power.limit [W]']
  limit.strip.to_f
end

#power_limit=(value) ⇒ Object

Parameters:

  • value (Numeric)

    power in watts to set the gpu limit to

Raises:

  • (ArgumentError)


125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 125

# Sets the gpu power limit by running nvidia-smi with -pl for this card.
# A failed nvidia-smi call is logged as a warning, not raised.
#
# @param value [Numeric] power in watts to set the gpu limit to
# @return [Integer] the requested value converted with to_i
# @raise [ArgumentError] when value.to_i is not between 1 and #power_max_limit
def power_limit=(value)
  watts = value.to_i
  max_watts = power_max_limit
  # Only whole-watt values from 1 up to the reported maximum are accepted.
  unless watts.between?(1, max_watts.to_i)
    raise ArgumentError, "Power value #{watts} must be between 1 and #{max_watts}"
  end

  # NOTE(review): value is interpolated into a shell command unescaped and only
  # its to_i form is range checked; callers are expected to pass a Numeric.
  output = `#{NVIDIA_SMI} -i #{index} -pl #{value}`
  if $CHILD_STATUS.success?
    logger.info("GPU#{index} power set to #{value} Watts")
  else
    logger.warn("GPU#{index} failed setting power to #{value}\n#{output}")
  end
  watts
end

#power_max_limitObject



120
121
122
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 120

# Reads the 'power.max_limit [W]' entry from #meta.
#
# @return [Float] the stripped reading converted with to_f
def power_max_limit
  max_limit = meta['power.max_limit [W]']
  max_limit.strip.to_f
end

#pstateObject



112
113
114
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 112

# Returns the numeric performance state from the 'pstate' entry of #meta.
# The first run of digits is used, so a "P2" style value and a plain
# 2 / "2" all yield 2; a missing or digit-free value yields 0.
#
# @return [Integer] the performance state number
def pstate
  meta['pstate'].to_s[/\d+/].to_i
end

#reset_metadataObject



26
27
28
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 26

# Clears the cached nvidia-smi readings held in @meta.
#
# @return [nil]
def reset_metadata
  @meta = nil
end

#set_fan_limit(_value, _type = 'current') ⇒ Numeric

Returns - original passed in value after being set.

Parameters:

  • value (Numeric)
    • the fan limit that should be applied to the gpu as a percentage

Returns:

  • (Numeric)
    • original passed in value after being set

Raises:

  • (NotImplementedError)


140
141
142
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 140

# Setting a fan limit is not supported for Nvidia cards; always raises.
#
# @param _value [Numeric] the fan limit as a percentage (ignored)
# @param _type [String] ignored
# @raise [NotImplementedError] on every call
def set_fan_limit(_value, _type = 'current')
  raise NotImplementedError, 'Not implemented for Nvidia'
end

#set_mem_clock_and_vddc(_mem_clock, _mem_volt) ⇒ Object



195
196
197
198
199
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 195

# Does nothing for Nvidia cards beyond logging a warning, and only logs
# when #experimental_on? is true.
#
# @param _mem_clock [Object] ignored
# @param _mem_volt [Object] ignored
# @return [Object, nil] the logger call's result, or nil when experimental mode is off
def set_mem_clock_and_vddc(_mem_clock, _mem_volt)
  logger.warn('Feature not enabled for nvidia') if experimental_on?
end

#subtypeObject



22
23
24
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 22

# Returns the SUBTYPE constant for this gpu class.
#
# @return [String] the value of SUBTYPE
def subtype
  SUBTYPE
end

#tempObject



108
109
110
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 108

# Reads the 'temperature.gpu' entry from #meta.
#
# @return [Integer] the reading converted with to_i
def temp
  gpu_temp = meta['temperature.gpu']
  gpu_temp.to_i
end

#utilizationObject



156
157
158
# File 'lib/compute_unit/gpus/nvidia_gpu.rb', line 156

# Reads the 'utilization.gpu [%]' entry from #meta, drops the first '%'
# character if present and converts the rest with to_i.
#
# @return [Integer] the gpu utilization reading
def utilization
  reading = meta['utilization.gpu [%]']
  reading.sub('%', '').to_i
end