mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
xpum sidecar: allow xelinks that are not tied to subdevices
With single-tile GPUs, xelinks are no longer advertised as being on subdevices. Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
parent
53310c2e03
commit
e34e93bd64
@ -55,15 +55,16 @@ type xpuManagerTopologyMatrixCell struct {
|
||||
}
|
||||
|
||||
type xpuManagerSidecar struct {
|
||||
getMetricsData func() []byte
|
||||
tmpDirPrefix string
|
||||
dstFilePath string
|
||||
labelNamespace string
|
||||
url string
|
||||
interval uint64
|
||||
startDelay uint64
|
||||
xpumPort uint64
|
||||
laneCount uint64
|
||||
getMetricsData func() []byte
|
||||
tmpDirPrefix string
|
||||
dstFilePath string
|
||||
labelNamespace string
|
||||
url string
|
||||
interval uint64
|
||||
startDelay uint64
|
||||
xpumPort uint64
|
||||
laneCount uint64
|
||||
allowSubdevicelessLinks bool
|
||||
}
|
||||
|
||||
func (e *invalidEntryErr) Error() string {
|
||||
@ -108,7 +109,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
|
||||
return resBody
|
||||
}
|
||||
|
||||
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
|
||||
func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) {
|
||||
cell := createInvalidTopologyCell()
|
||||
|
||||
for _, label := range labels {
|
||||
@ -118,7 +119,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT
|
||||
klog.V(5).Info(name, " ", strVal)
|
||||
|
||||
// xelinks should be on subdevices, unless links that are not on subdevices are explicitly allowed
|
||||
if name == "local_on_subdevice" && strVal != "true" {
|
||||
if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" {
|
||||
return cell, &invalidEntryErr{}
|
||||
}
|
||||
|
||||
@ -193,7 +194,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI
|
||||
continue
|
||||
}
|
||||
|
||||
cell, err := processMetricsLabels(metric.Label)
|
||||
cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks)
|
||||
if err == nil {
|
||||
klog.V(5).Info("topology entry: ", cell)
|
||||
topologyInfos = append(topologyInfos, cell)
|
||||
@ -367,6 +368,7 @@ func main() {
|
||||
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
|
||||
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
|
||||
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
|
||||
flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)")
|
||||
klog.InitFlags(nil)
|
||||
|
||||
flag.Parse()
|
||||
|
@ -23,10 +23,11 @@ import (
|
||||
)
|
||||
|
||||
type testCase struct {
|
||||
name string
|
||||
metricsData []string
|
||||
expectedLabels []string
|
||||
minLaneCount int
|
||||
name string
|
||||
metricsData []string
|
||||
expectedLabels []string
|
||||
minLaneCount int
|
||||
allowSubdeviceless bool
|
||||
}
|
||||
|
||||
func createTestCases() []testCase {
|
||||
@ -59,12 +60,25 @@ func createTestCases() []testCase {
|
||||
metricsData: []string{
|
||||
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
|
||||
`# TYPE xpum_topology_link gauge`,
|
||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
|
||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
|
||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
|
||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
|
||||
"",
|
||||
},
|
||||
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
|
||||
},
|
||||
{
|
||||
name: "Xelinks not on sub devices when it's allowed",
|
||||
minLaneCount: 4,
|
||||
metricsData: []string{
|
||||
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
|
||||
`# TYPE xpum_topology_link gauge`,
|
||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
|
||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
|
||||
"",
|
||||
},
|
||||
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"},
|
||||
allowSubdeviceless: true,
|
||||
},
|
||||
{
|
||||
name: "Xelinks without lan counts",
|
||||
minLaneCount: 4,
|
||||
@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) {
|
||||
for _, tc := range tcs {
|
||||
print("Testcase (labeling): ", tc.name, "\n")
|
||||
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
||||
|
||||
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
|
||||
|
||||
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
|
||||
|
||||
labels := xms.createLabels(topologyInfos)
|
||||
@ -224,6 +241,8 @@ func TestIterate(t *testing.T) {
|
||||
print("Testcase (iterate): ", tc.name, "\n")
|
||||
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
||||
|
||||
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
|
||||
|
||||
root, err := os.MkdirTemp("", "test_new_xms")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary directory: %+v", err)
|
||||
|
Loading…
Reference in New Issue
Block a user