@@ -70,7 +70,9 @@ type cliOptions struct {
7070 allowIDs string
7171 denyIDs string
7272 sharedDevNum int
73- temperatureLimit int
73+ globalTempLimit int
74+ memoryTempLimit int
75+ gpuTempLimit int
7476 enableMonitoring bool
7577 wslScan bool
7678 healthManagement bool
@@ -402,13 +404,13 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
402404 return health
403405 }
404406
405- limit := float64 (dp .options .temperatureLimit )
406-
407407 // Temperatures for different areas
408- klog .V (4 ).Infof ("Temperatures: Memory=%.1fC , GPU=%.1fC , Global=%.1fC " ,
408+ klog .V (4 ).Infof ("Temperatures: Memory=%dC , GPU=%dC , Global=%dC " ,
409409 deviceTemps .Memory , deviceTemps .GPU , deviceTemps .Global )
410410
411- if deviceTemps .GPU > limit || deviceTemps .Global > limit || deviceTemps .Memory > limit {
411+ if deviceTemps .GPU > dp .options .gpuTempLimit ||
412+ deviceTemps .Global > dp .options .globalTempLimit ||
413+ deviceTemps .Memory > dp .options .memoryTempLimit {
412414 health = pluginapi .Unhealthy
413415 }
414416
@@ -784,7 +786,9 @@ func main() {
784786 flag .BoolVar (& opts .healthManagement , "health-management" , false , "enable GPU health management" )
785787 flag .BoolVar (& opts .wslScan , "wsl" , false , "scan for / use WSL devices" )
786788 flag .IntVar (& opts .sharedDevNum , "shared-dev-num" , 1 , "number of containers sharing the same GPU device" )
787- flag .IntVar (& opts .temperatureLimit , "temp-limit" , 100 , "temperature limit at which device is marked unhealthy" )
789+ flag .IntVar (& opts .globalTempLimit , "temp-limit" , 100 , "Global temperature limit at which device is marked unhealthy" )
790+ flag .IntVar (& opts .gpuTempLimit , "gpu-temp-limit" , 100 , "GPU temperature limit at which device is marked unhealthy" )
791+ flag .IntVar (& opts .memoryTempLimit , "memory-temp-limit" , 100 , "Memory temperature limit at which device is marked unhealthy" )
788792 flag .StringVar (& opts .preferredAllocationPolicy , "allocation-policy" , "none" , "modes of allocating GPU devices: balanced, packed and none" )
789793 flag .StringVar (& opts .allowIDs , "allow-ids" , "" , "comma-separated list of device IDs to allow (e.g. 0x49c5,0x49c6)" )
790794 flag .StringVar (& opts .denyIDs , "deny-ids" , "" , "comma-separated list of device IDs to deny (e.g. 0x49c5,0x49c6)" )
0 commit comments