Skip to content

Commit e600fe9

Browse files
committed
gpu: add support for the upcoming xe-driver
Plugin can support both i915 and xe drivers dynamically. But having both drivers on same node with RM is not possible. Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
1 parent d5cb53a commit e600fe9

File tree

9 files changed

+672
-158
lines changed

9 files changed

+672
-158
lines changed

.github/workflows/lib-e2e.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ jobs:
2525
- name: e2e-gpu
2626
runner: gpu
2727
images: intel-gpu-plugin intel-gpu-initcontainer
28+
targetjob: e2e-gpu SKIP=Resource:xe
2829
- name: e2e-iaa-spr
2930
targetjob: e2e-iaa
3031
runner: simics-spr

cmd/gpu_plugin/device_props.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
// Copyright 2024 Intel Corporation. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"slices"
19+
20+
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
21+
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
22+
"k8s.io/klog/v2"
23+
)
24+
25+
// DeviceProperties collects what fetch learns about the DRM cards it is
// pointed at: per-card values are overwritten on every fetch, while the
// driver set and tile counts accumulate across calls.
type DeviceProperties struct {
	// currentDriver is the driver name recorded by the most recent fetch.
	currentDriver string
	// drmDrivers is the set of every driver name recorded so far.
	drmDrivers map[string]bool
	// tileCounts has one entry per fetched card, in fetch order.
	tileCounts []uint64
	// isPfWithVfs is the SR-IOV "PF with VFs" result for the most recently
	// fetched card.
	isPfWithVfs bool
}
31+
32+
// invalidTileCountErr is returned by maxTileCount when no single tile count
// can be reported, either because no cards were recorded or because the
// recorded cards disagree.
//
// The embedded error is an optional cause. It is normally left nil
// (invalidTileCountErr{}), so Error is defined explicitly below instead of
// relying on the promoted method, which would dereference the nil interface.
type invalidTileCountErr struct {
	error
}

// Error implements the error interface. It returns the cause's message when
// one is embedded and a fixed description otherwise, so the zero value is
// always safe to print.
func (e invalidTileCountErr) Error() string {
	if e.error != nil {
		return e.error.Error()
	}

	return "invalid tile count"
}
35+
36+
func newDeviceProperties() *DeviceProperties {
37+
return &DeviceProperties{
38+
drmDrivers: make(map[string]bool),
39+
}
40+
}
41+
42+
func (d *DeviceProperties) fetch(cardPath string) {
43+
d.isPfWithVfs = pluginutils.IsSriovPFwithVFs(cardPath)
44+
45+
d.tileCounts = append(d.tileCounts, labeler.GetTileCount(cardPath))
46+
47+
driverName, err := pluginutils.ReadDeviceDriver(cardPath)
48+
if err != nil {
49+
klog.Warningf("card (%s) doesn't have driver, using default: %s", cardPath, deviceTypeDefault)
50+
51+
driverName = deviceTypeDefault
52+
}
53+
54+
d.currentDriver = driverName
55+
d.drmDrivers[d.currentDriver] = true
56+
}
57+
58+
func (d *DeviceProperties) drmDriverCount() int {
59+
return len(d.drmDrivers)
60+
}
61+
62+
func (d *DeviceProperties) driver() string {
63+
return d.currentDriver
64+
}
65+
66+
func (d *DeviceProperties) monitorResource() string {
67+
return d.currentDriver + monitorSuffix
68+
}
69+
70+
func (d *DeviceProperties) maxTileCount() (uint64, error) {
71+
if len(d.tileCounts) == 0 {
72+
return 0, invalidTileCountErr{}
73+
}
74+
75+
minCount := slices.Min(d.tileCounts)
76+
maxCount := slices.Max(d.tileCounts)
77+
78+
if minCount != maxCount {
79+
klog.Warningf("Node's GPUs are heterogenous (min: %d, max: %d tiles)", minCount, maxCount)
80+
81+
return 0, invalidTileCountErr{}
82+
}
83+
84+
return maxCount, nil
85+
}

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 107 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ package main
1717
import (
1818
"flag"
1919
"fmt"
20+
"io/fs"
2021
"os"
2122
"path"
2223
"path/filepath"
@@ -32,7 +33,6 @@ import (
3233

3334
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/gpu_plugin/rm"
3435
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/labeler"
35-
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
3636
dpapi "github.com/intel/intel-device-plugins-for-kubernetes/pkg/deviceplugin"
3737
)
3838

@@ -47,12 +47,14 @@ const (
4747
vendorString = "0x8086"
4848

4949
// Device plugin settings.
50-
namespace = "gpu.intel.com"
51-
deviceType = "i915"
50+
namespace = "gpu.intel.com"
51+
deviceTypeI915 = "i915"
52+
deviceTypeXe = "xe"
53+
deviceTypeDefault = deviceTypeI915
5254

5355
// telemetry resource settings.
54-
monitorType = "i915_monitoring"
55-
monitorID = "all"
56+
monitorSuffix = "_monitoring"
57+
monitorID = "all"
5658

5759
// Period of device scans.
5860
scanPeriod = 5 * time.Second
@@ -68,6 +70,10 @@ type cliOptions struct {
6870
resourceManagement bool
6971
}
7072

73+
// rmWithMultipleDriversErr is returned by scan when resource management is
// enabled and cards bound to more than one DRM driver are found. Scan
// treats it as fatal and returns it to its caller.
//
// The embedded error is an optional cause. It is normally left nil
// (rmWithMultipleDriversErr{}), so Error is defined explicitly below instead
// of relying on the promoted method, which would dereference the nil
// interface as soon as the error is printed.
type rmWithMultipleDriversErr struct {
	error
}

// Error implements the error interface. It returns the cause's message when
// one is embedded and a fixed description otherwise, so the zero value is
// always safe to print.
func (e rmWithMultipleDriversErr) Error() string {
	if e.error != nil {
		return e.error.Error()
	}

	return "resource manager doesn't support multiple DRM drivers"
}
76+
7177
type preferredAllocationPolicyFunc func(*pluginapi.ContainerPreferredAllocationRequest) []string
7278

7379
// nonePolicy is used for allocating GPU devices randomly, while trying
@@ -283,7 +289,11 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
283289
if options.resourceManagement {
284290
var err error
285291

286-
dp.resMan, err = rm.NewResourceManager(monitorID, namespace+"/"+deviceType)
292+
dp.resMan, err = rm.NewResourceManager(monitorID,
293+
[]string{
294+
namespace + "/" + deviceTypeI915,
295+
namespace + "/" + deviceTypeXe,
296+
})
287297
if err != nil {
288298
klog.Errorf("Failed to create resource manager: %+v", err)
289299
return nil
@@ -345,13 +355,20 @@ func (dp *devicePlugin) GetPreferredAllocation(rqt *pluginapi.PreferredAllocatio
345355
func (dp *devicePlugin) Scan(notifier dpapi.Notifier) error {
346356
defer dp.scanTicker.Stop()
347357

348-
klog.V(1).Infof("GPU '%s' resource share count = %d", deviceType, dp.options.sharedDevNum)
358+
klog.V(1).Infof("GPU (%s/%s) resource share count = %d", deviceTypeI915, deviceTypeXe, dp.options.sharedDevNum)
349359

350-
previousCount := map[string]int{deviceType: 0, monitorType: 0}
360+
previousCount := map[string]int{
361+
deviceTypeI915: 0, deviceTypeXe: 0,
362+
deviceTypeXe + monitorSuffix: 0,
363+
deviceTypeI915 + monitorSuffix: 0}
351364

352365
for {
353366
devTree, err := dp.scan()
354367
if err != nil {
368+
if errors.Is(err, rmWithMultipleDriversErr{}) {
369+
return err
370+
}
371+
355372
klog.Warning("Failed to scan: ", err)
356373
}
357374

@@ -426,81 +443,116 @@ func (dp *devicePlugin) devSpecForDrmFile(drmFile string) (devSpec pluginapi.Dev
426443
return
427444
}
428445

446+
func (dp *devicePlugin) filterOutInvalidCards(files []fs.DirEntry) []fs.DirEntry {
447+
filtered := []fs.DirEntry{}
448+
449+
for _, f := range files {
450+
if !dp.isCompatibleDevice(f.Name()) {
451+
continue
452+
}
453+
454+
_, err := os.Stat(path.Join(dp.sysfsDir, f.Name(), "device/drm"))
455+
if err != nil {
456+
continue
457+
}
458+
459+
filtered = append(filtered, f)
460+
}
461+
462+
return filtered
463+
}
464+
465+
func (dp *devicePlugin) createDeviceSpecsFromDrmFiles(cardPath string) []pluginapi.DeviceSpec {
466+
specs := []pluginapi.DeviceSpec{}
467+
468+
drmFiles, _ := os.ReadDir(path.Join(cardPath, "device/drm"))
469+
470+
for _, drmFile := range drmFiles {
471+
devSpec, devPath, devSpecErr := dp.devSpecForDrmFile(drmFile.Name())
472+
if devSpecErr != nil {
473+
continue
474+
}
475+
476+
klog.V(4).Infof("Adding %s to GPU %s", devPath, filepath.Base(cardPath))
477+
478+
specs = append(specs, devSpec)
479+
}
480+
481+
return specs
482+
}
483+
429484
func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
430485
files, err := os.ReadDir(dp.sysfsDir)
431486
if err != nil {
432487
return nil, errors.Wrap(err, "Can't read sysfs folder")
433488
}
434489

435-
var monitor []pluginapi.DeviceSpec
490+
monitor := make(map[string][]pluginapi.DeviceSpec, 0)
436491

437492
devTree := dpapi.NewDeviceTree()
438493
rmDevInfos := rm.NewDeviceInfoMap()
439-
tileCounts := []uint64{}
494+
devProps := newDeviceProperties()
440495

441-
for _, f := range files {
442-
var nodes []pluginapi.DeviceSpec
496+
for _, f := range dp.filterOutInvalidCards(files) {
497+
name := f.Name()
498+
cardPath := path.Join(dp.sysfsDir, name)
443499

444-
if !dp.isCompatibleDevice(f.Name()) {
500+
devProps.fetch(cardPath)
501+
502+
if devProps.isPfWithVfs {
445503
continue
446504
}
447505

448-
cardPath := path.Join(dp.sysfsDir, f.Name())
506+
devSpecs := dp.createDeviceSpecsFromDrmFiles(cardPath)
449507

450-
drmFiles, err := os.ReadDir(path.Join(cardPath, "device/drm"))
451-
if err != nil {
452-
return nil, errors.Wrap(err, "Can't read device folder")
508+
if len(devSpecs) == 0 {
509+
continue
453510
}
454511

455-
isPFwithVFs := pluginutils.IsSriovPFwithVFs(path.Join(dp.sysfsDir, f.Name()))
456-
tileCounts = append(tileCounts, labeler.GetTileCount(dp.sysfsDir, f.Name()))
457-
458-
for _, drmFile := range drmFiles {
459-
devSpec, devPath, devSpecErr := dp.devSpecForDrmFile(drmFile.Name())
460-
if devSpecErr != nil {
461-
continue
462-
}
463-
464-
if !isPFwithVFs {
465-
klog.V(4).Infof("Adding %s to GPU %s", devPath, f.Name())
512+
mounts := []pluginapi.Mount{}
513+
if dp.bypathFound {
514+
mounts = dp.bypathMountsForPci(cardPath, name, dp.bypathDir)
515+
}
466516

467-
nodes = append(nodes, devSpec)
468-
}
517+
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devSpecs, mounts, nil, nil)
469518

470-
if dp.options.enableMonitoring {
471-
klog.V(4).Infof("Adding %s to GPU %s/%s", devPath, monitorType, monitorID)
519+
for i := 0; i < dp.options.sharedDevNum; i++ {
520+
devID := fmt.Sprintf("%s-%d", name, i)
521+
devTree.AddDevice(devProps.driver(), devID, deviceInfo)
472522

473-
monitor = append(monitor, devSpec)
474-
}
523+
rmDevInfos[devID] = rm.NewDeviceInfo(devSpecs, mounts, nil)
475524
}
476525

477-
if len(nodes) > 0 {
478-
mounts := []pluginapi.Mount{}
479-
if dp.bypathFound {
480-
mounts = dp.bypathMountsForPci(cardPath, f.Name(), dp.bypathDir)
481-
}
482-
483-
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, nodes, mounts, nil, nil)
484-
485-
for i := 0; i < dp.options.sharedDevNum; i++ {
486-
devID := fmt.Sprintf("%s-%d", f.Name(), i)
487-
// Currently only one device type (i915) is supported.
488-
// TODO: check model ID to differentiate device models.
489-
devTree.AddDevice(deviceType, devID, deviceInfo)
526+
if dp.options.enableMonitoring {
527+
res := devProps.monitorResource()
528+
klog.V(4).Infof("For %s/%s, adding nodes: %+v", res, monitorID, devSpecs)
490529

491-
rmDevInfos[devID] = rm.NewDeviceInfo(nodes, mounts, nil)
492-
}
530+
monitor[res] = append(monitor[res], devSpecs...)
493531
}
494532
}
495-
// all Intel GPUs are under single monitoring resource
533+
534+
// all Intel GPUs are under single monitoring resource per KMD
496535
if len(monitor) > 0 {
497-
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, monitor, nil, nil, nil)
498-
devTree.AddDevice(monitorType, monitorID, deviceInfo)
536+
for resourceName, devices := range monitor {
537+
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, devices, nil, nil, nil)
538+
devTree.AddDevice(resourceName, monitorID, deviceInfo)
539+
}
499540
}
500541

501542
if dp.resMan != nil {
502-
dp.resMan.SetDevInfos(rmDevInfos)
503-
dp.resMan.SetTileCountPerCard(tileCounts)
543+
if devProps.drmDriverCount() <= 1 {
544+
dp.resMan.SetDevInfos(rmDevInfos)
545+
546+
if tileCount, err := devProps.maxTileCount(); err == nil {
547+
dp.resMan.SetTileCountPerCard(tileCount)
548+
}
549+
} else {
550+
klog.Warning("Plugin with RM doesn't support multiple DRM drivers:", devProps.drmDrivers)
551+
552+
err := rmWithMultipleDriversErr{}
553+
554+
return nil, err
555+
}
504556
}
505557

506558
return devTree, nil
@@ -521,7 +573,7 @@ func main() {
521573
)
522574

523575
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
524-
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable 'i915_monitoring' (= all GPUs) resource")
576+
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource")
525577
flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management")
526578
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
527579
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")

0 commit comments

Comments
 (0)