@@ -17,6 +17,7 @@ package main
1717import (
1818 "flag"
1919 "fmt"
20+ "io/fs"
2021 "os"
2122 "path"
2223 "path/filepath"
@@ -32,7 +33,6 @@ import (
3233
3334 "github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
3435 "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
35- "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
3636 dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
3737)
3838
@@ -47,12 +47,14 @@ const (
4747 vendorString = "0x8086"
4848
4949 // Device plugin settings.
50- namespace = "gpu.intel.com"
51- deviceType = "i915"
50+ namespace = "gpu.intel.com"
51+ deviceTypeI915 = "i915"
52+ deviceTypeXe = "xe"
53+ deviceTypeDefault = deviceTypeI915
5254
5355 // telemetry resource settings.
54- monitorType = "i915_monitoring "
55- monitorID = "all"
56+ monitorSuffix = "_monitoring "
57+ monitorID = "all"
5658
5759 // Period of device scans.
5860 scanPeriod = 5 * time .Second
@@ -68,6 +70,10 @@ type cliOptions struct {
6870 resourceManagement bool
6971}
7072
// rmWithMultipleDriversErr is the error value scan returns when the resource
// manager is enabled and more than one DRM driver is detected. Scan matches
// it with errors.Is against the zero value rmWithMultipleDriversErr{}.
type rmWithMultipleDriversErr struct {
	error
}

// Error implements the error interface.
//
// The value is created as rmWithMultipleDriversErr{}, which leaves the
// embedded error nil; relying on the promoted Error method would dereference
// that nil interface and panic. A fixed message is returned in that case, and
// the embedded error's message is used when one has been set.
func (e rmWithMultipleDriversErr) Error() string {
	if e.error != nil {
		return e.error.Error()
	}

	return "resource manager does not support multiple DRM drivers"
}
76+
7177type preferredAllocationPolicyFunc func (* pluginapi.ContainerPreferredAllocationRequest ) []string
7278
7379// nonePolicy is used for allocating GPU devices randomly, while trying
@@ -283,7 +289,11 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
283289 if options .resourceManagement {
284290 var err error
285291
286- dp .resMan , err = rm .NewResourceManager (monitorID , namespace + "/" + deviceType )
292+ dp .resMan , err = rm .NewResourceManager (monitorID ,
293+ []string {
294+ namespace + "/" + deviceTypeI915 ,
295+ namespace + "/" + deviceTypeXe ,
296+ })
287297 if err != nil {
288298 klog .Errorf ("Failed to create resource manager: %+v" , err )
289299 return nil
@@ -345,13 +355,20 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio
345355func (dp * devicePlugin ) Scan (notifier dpapi.Notifier ) error {
346356 defer dp .scanTicker .Stop ()
347357
348- klog .V (1 ).Infof ("GPU '%s' resource share count = %d" , deviceType , dp .options .sharedDevNum )
358+ klog .V (1 ).Infof ("GPU (%s/%s) resource share count = %d" , deviceTypeI915 , deviceTypeXe , dp .options .sharedDevNum )
349359
350- previousCount := map [string ]int {deviceType : 0 , monitorType : 0 }
360+ previousCount := map [string ]int {
361+ deviceTypeI915 : 0 , deviceTypeXe : 0 ,
362+ deviceTypeXe + monitorSuffix : 0 ,
363+ deviceTypeI915 + monitorSuffix : 0 }
351364
352365 for {
353366 devTree , err := dp .scan ()
354367 if err != nil {
368+ if errors .Is (err , rmWithMultipleDriversErr {}) {
369+ return err
370+ }
371+
355372 klog .Warning ("Failed to scan: " , err )
356373 }
357374
@@ -426,81 +443,116 @@ func (dp *devicePlugin) devSpecForDrmFile(drmFile string) (devSpec pluginapi.Dev
426443 return
427444}
428445
446+ func (dp * devicePlugin ) filterOutInvalidCards (files []fs.DirEntry ) []fs.DirEntry {
447+ filtered := []fs.DirEntry {}
448+
449+ for _ , f := range files {
450+ if ! dp .isCompatibleDevice (f .Name ()) {
451+ continue
452+ }
453+
454+ _ , err := os .Stat (path .Join (dp .sysfsDir , f .Name (), "device/drm" ))
455+ if err != nil {
456+ continue
457+ }
458+
459+ filtered = append (filtered , f )
460+ }
461+
462+ return filtered
463+ }
464+
465+ func (dp * devicePlugin ) createDeviceSpecsFromDrmFiles (cardPath string ) []pluginapi.DeviceSpec {
466+ specs := []pluginapi.DeviceSpec {}
467+
468+ drmFiles , _ := os .ReadDir (path .Join (cardPath , "device/drm" ))
469+
470+ for _ , drmFile := range drmFiles {
471+ devSpec , devPath , devSpecErr := dp .devSpecForDrmFile (drmFile .Name ())
472+ if devSpecErr != nil {
473+ continue
474+ }
475+
476+ klog .V (4 ).Infof ("Adding %s to GPU %s" , devPath , filepath .Base (cardPath ))
477+
478+ specs = append (specs , devSpec )
479+ }
480+
481+ return specs
482+ }
483+
429484func (dp * devicePlugin ) scan () (dpapi.DeviceTree , error ) {
430485 files , err := os .ReadDir (dp .sysfsDir )
431486 if err != nil {
432487 return nil , errors .Wrap (err , "Can't read sysfs folder" )
433488 }
434489
435- var monitor [] pluginapi.DeviceSpec
490+ monitor := make ( map [ string ][] pluginapi.DeviceSpec , 0 )
436491
437492 devTree := dpapi .NewDeviceTree ()
438493 rmDevInfos := rm .NewDeviceInfoMap ()
439- tileCounts := [] uint64 {}
494+ devProps := newDeviceProperties ()
440495
441- for _ , f := range files {
442- var nodes []pluginapi.DeviceSpec
496+ for _ , f := range dp .filterOutInvalidCards (files ) {
497+ name := f .Name ()
498+ cardPath := path .Join (dp .sysfsDir , name )
443499
444- if ! dp .isCompatibleDevice (f .Name ()) {
500+ devProps .fetch (cardPath )
501+
502+ if devProps .isPfWithVfs {
445503 continue
446504 }
447505
448- cardPath := path . Join ( dp .sysfsDir , f . Name () )
506+ devSpecs := dp .createDeviceSpecsFromDrmFiles ( cardPath )
449507
450- drmFiles , err := os .ReadDir (path .Join (cardPath , "device/drm" ))
451- if err != nil {
452- return nil , errors .Wrap (err , "Can't read device folder" )
508+ if len (devSpecs ) == 0 {
509+ continue
453510 }
454511
455- isPFwithVFs := pluginutils .IsSriovPFwithVFs (path .Join (dp .sysfsDir , f .Name ()))
456- tileCounts = append (tileCounts , labeler .GetTileCount (dp .sysfsDir , f .Name ()))
457-
458- for _ , drmFile := range drmFiles {
459- devSpec , devPath , devSpecErr := dp .devSpecForDrmFile (drmFile .Name ())
460- if devSpecErr != nil {
461- continue
462- }
463-
464- if ! isPFwithVFs {
465- klog .V (4 ).Infof ("Adding %s to GPU %s" , devPath , f .Name ())
512+ mounts := []pluginapi.Mount {}
513+ if dp .bypathFound {
514+ mounts = dp .bypathMountsForPci (cardPath , name , dp .bypathDir )
515+ }
466516
467- nodes = append (nodes , devSpec )
468- }
517+ deviceInfo := dpapi .NewDeviceInfo (pluginapi .Healthy , devSpecs , mounts , nil , nil )
469518
470- if dp .options .enableMonitoring {
471- klog .V (4 ).Infof ("Adding %s to GPU %s/%s" , devPath , monitorType , monitorID )
519+ for i := 0 ; i < dp .options .sharedDevNum ; i ++ {
520+ devID := fmt .Sprintf ("%s-%d" , name , i )
521+ devTree .AddDevice (devProps .driver (), devID , deviceInfo )
472522
473- monitor = append (monitor , devSpec )
474- }
523+ rmDevInfos [devID ] = rm .NewDeviceInfo (devSpecs , mounts , nil )
475524 }
476525
477- if len (nodes ) > 0 {
478- mounts := []pluginapi.Mount {}
479- if dp .bypathFound {
480- mounts = dp .bypathMountsForPci (cardPath , f .Name (), dp .bypathDir )
481- }
482-
483- deviceInfo := dpapi .NewDeviceInfo (pluginapi .Healthy , nodes , mounts , nil , nil )
484-
485- for i := 0 ; i < dp .options .sharedDevNum ; i ++ {
486- devID := fmt .Sprintf ("%s-%d" , f .Name (), i )
487- // Currently only one device type (i915) is supported.
488- // TODO: check model ID to differentiate device models.
489- devTree .AddDevice (deviceType , devID , deviceInfo )
526+ if dp .options .enableMonitoring {
527+ res := devProps .monitorResource ()
528+ klog .V (4 ).Infof ("For %s/%s, adding nodes: %+v" , res , monitorID , devSpecs )
490529
491- rmDevInfos [devID ] = rm .NewDeviceInfo (nodes , mounts , nil )
492- }
530+ monitor [res ] = append (monitor [res ], devSpecs ... )
493531 }
494532 }
495- // all Intel GPUs are under single monitoring resource
533+
534+ // all Intel GPUs are under single monitoring resource per KMD
496535 if len (monitor ) > 0 {
497- deviceInfo := dpapi .NewDeviceInfo (pluginapi .Healthy , monitor , nil , nil , nil )
498- devTree .AddDevice (monitorType , monitorID , deviceInfo )
536+ for resourceName , devices := range monitor {
537+ deviceInfo := dpapi .NewDeviceInfo (pluginapi .Healthy , devices , nil , nil , nil )
538+ devTree .AddDevice (resourceName , monitorID , deviceInfo )
539+ }
499540 }
500541
501542 if dp .resMan != nil {
502- dp .resMan .SetDevInfos (rmDevInfos )
503- dp .resMan .SetTileCountPerCard (tileCounts )
543+ if devProps .drmDriverCount () <= 1 {
544+ dp .resMan .SetDevInfos (rmDevInfos )
545+
546+ if tileCount , err := devProps .maxTileCount (); err == nil {
547+ dp .resMan .SetTileCountPerCard (tileCount )
548+ }
549+ } else {
550+ klog .Warning ("Plugin with RM doesn't support multiple DRM drivers:" , devProps .drmDrivers )
551+
552+ err := rmWithMultipleDriversErr {}
553+
554+ return nil , err
555+ }
504556 }
505557
506558 return devTree , nil
@@ -521,7 +573,7 @@ func main() {
521573 )
522574
523575 flag .StringVar (& prefix , "prefix" , "" , "Prefix for devfs & sysfs paths" )
524- flag .BoolVar (& opts .enableMonitoring , "enable-monitoring" , false , "whether to enable 'i915_monitoring ' (= all GPUs) resource" )
576+ flag .BoolVar (& opts .enableMonitoring , "enable-monitoring" , false , "whether to enable '*_monitoring ' (= all GPUs) resource" )
525577 flag .BoolVar (& opts .resourceManagement , "resource-manager" , false , "fractional GPU resource management" )
526578 flag .IntVar (& opts .sharedDevNum , "shared-dev-num" , 1 , "number of containers sharing the same GPU device" )
527579 flag .StringVar (& opts .preferredAllocationPolicy , "allocation-policy" , "none" , "modes of allocating GPU devices: balanced, packed and none" )
0 commit comments