diff --git a/cmd/xpumanager_sidecar/main.go b/cmd/xpumanager_sidecar/main.go index bf608f6f..b28d4320 100644 --- a/cmd/xpumanager_sidecar/main.go +++ b/cmd/xpumanager_sidecar/main.go @@ -55,15 +55,16 @@ type xpuManagerTopologyMatrixCell struct { } type xpuManagerSidecar struct { - getMetricsData func() []byte - tmpDirPrefix string - dstFilePath string - labelNamespace string - url string - interval uint64 - startDelay uint64 - xpumPort uint64 - laneCount uint64 + getMetricsData func() []byte + tmpDirPrefix string + dstFilePath string + labelNamespace string + url string + interval uint64 + startDelay uint64 + xpumPort uint64 + laneCount uint64 + allowSubdevicelessLinks bool } func (e *invalidEntryErr) Error() string { @@ -108,7 +109,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte { return resBody } -func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) { +func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) { cell := createInvalidTopologyCell() for _, label := range labels { @@ -118,7 +119,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT klog.V(5).Info(name, " ", strVal) // xelinks should always be on subdevices - if name == "local_on_subdevice" && strVal != "true" { + if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" { return cell, &invalidEntryErr{} } @@ -193,7 +194,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI continue } - cell, err := processMetricsLabels(metric.Label) + cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks) if err == nil { klog.V(5).Info("topology entry: ", cell) topologyInfos = append(topologyInfos, cell) @@ -367,6 +368,7 @@ func main() { flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination") flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink") flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels") + flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)") klog.InitFlags(nil) flag.Parse() diff --git a/cmd/xpumanager_sidecar/main_test.go b/cmd/xpumanager_sidecar/main_test.go index 6c8785d1..5c36c273 100644 --- a/cmd/xpumanager_sidecar/main_test.go +++ b/cmd/xpumanager_sidecar/main_test.go @@ -23,10 +23,11 @@ import ( ) type testCase struct { - name string - metricsData []string - expectedLabels []string - minLaneCount int + name string + metricsData []string + expectedLabels []string + minLaneCount int + allowSubdeviceless bool } func createTestCases() []testCase { @@ -59,12 +60,25 @@ func createTestCases() []testCase { metricsData: []string{ `# HELP xpum_topology_link Connection type fo two GPU tiles`, `# TYPE xpum_topology_link gauge`, - `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`, - `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`, "", }, expectedLabels: []string{"xpumanager.intel.com/xe-links="}, }, + { + name: "Xelinks not on sub devices when it's allowed", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"}, + allowSubdeviceless: true, + }, { name: "Xelinks without lan counts", minLaneCount: 4, @@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) { for _, tc := range tcs { print("Testcase (labeling): ", tc.name, "\n") xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount) + + xms.allowSubdevicelessLinks = tc.allowSubdeviceless + topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n"))) labels := xms.createLabels(topologyInfos) @@ -224,6 +241,8 @@ func TestIterate(t *testing.T) { print("Testcase (iterate): ", tc.name, "\n") xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount) + xms.allowSubdevicelessLinks = tc.allowSubdeviceless + root, err := os.MkdirTemp("", "test_new_xms") if err != nil { t.Fatalf("can't create temporary directory: %+v", err)