xpum sidecar: allow xelinks that are not tied to subdevices

With single-tile GPUs, xelinks are no longer advertised as being on
subdevices. Add an opt-in "allow-subdeviceless-links" flag so that such links are accepted.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2023-06-16 10:59:47 +03:00
parent 53310c2e03
commit e34e93bd64
2 changed files with 39 additions and 18 deletions

View File

@ -55,15 +55,16 @@ type xpuManagerTopologyMatrixCell struct {
} }
type xpuManagerSidecar struct { type xpuManagerSidecar struct {
getMetricsData func() []byte getMetricsData func() []byte
tmpDirPrefix string tmpDirPrefix string
dstFilePath string dstFilePath string
labelNamespace string labelNamespace string
url string url string
interval uint64 interval uint64
startDelay uint64 startDelay uint64
xpumPort uint64 xpumPort uint64
laneCount uint64 laneCount uint64
allowSubdevicelessLinks bool
} }
func (e *invalidEntryErr) Error() string { func (e *invalidEntryErr) Error() string {
@ -108,7 +109,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
return resBody return resBody
} }
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) { func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) {
cell := createInvalidTopologyCell() cell := createInvalidTopologyCell()
for _, label := range labels { for _, label := range labels {
@ -118,7 +119,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT
klog.V(5).Info(name, " ", strVal) klog.V(5).Info(name, " ", strVal)
// xelinks should always be on subdevices // xelinks should always be on subdevices
if name == "local_on_subdevice" && strVal != "true" { if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" {
return cell, &invalidEntryErr{} return cell, &invalidEntryErr{}
} }
@ -193,7 +194,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI
continue continue
} }
cell, err := processMetricsLabels(metric.Label) cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks)
if err == nil { if err == nil {
klog.V(5).Info("topology entry: ", cell) klog.V(5).Info("topology entry: ", cell)
topologyInfos = append(topologyInfos, cell) topologyInfos = append(topologyInfos, cell)
@ -367,6 +368,7 @@ func main() {
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination") flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink") flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels") flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)")
klog.InitFlags(nil) klog.InitFlags(nil)
flag.Parse() flag.Parse()

View File

@ -23,10 +23,11 @@ import (
) )
type testCase struct { type testCase struct {
name string name string
metricsData []string metricsData []string
expectedLabels []string expectedLabels []string
minLaneCount int minLaneCount int
allowSubdeviceless bool
} }
func createTestCases() []testCase { func createTestCases() []testCase {
@ -59,12 +60,25 @@ func createTestCases() []testCase {
metricsData: []string{ metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`, `# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`, `# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`, `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`, `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
"", "",
}, },
expectedLabels: []string{"xpumanager.intel.com/xe-links="}, expectedLabels: []string{"xpumanager.intel.com/xe-links="},
}, },
{
name: "Xelinks not on sub devices when it's allowed",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"},
allowSubdeviceless: true,
},
{ {
name: "Xelinks without lan counts", name: "Xelinks without lan counts",
minLaneCount: 4, minLaneCount: 4,
@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) {
for _, tc := range tcs { for _, tc := range tcs {
print("Testcase (labeling): ", tc.name, "\n") print("Testcase (labeling): ", tc.name, "\n")
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount) xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n"))) topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
labels := xms.createLabels(topologyInfos) labels := xms.createLabels(topologyInfos)
@ -224,6 +241,8 @@ func TestIterate(t *testing.T) {
print("Testcase (iterate): ", tc.name, "\n") print("Testcase (iterate): ", tc.name, "\n")
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount) xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
root, err := os.MkdirTemp("", "test_new_xms") root, err := os.MkdirTemp("", "test_new_xms")
if err != nil { if err != nil {
t.Fatalf("can't create temporary directory: %+v", err) t.Fatalf("can't create temporary directory: %+v", err)