xpum sidecar: allow xelinks that are not tied to subdevices

With single-tile GPUs, xelinks are no longer advertised as
being on subdevices.

Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2023-06-16 10:59:47 +03:00
parent 53310c2e03
commit e34e93bd64
2 changed files with 39 additions and 18 deletions

View File

@ -55,15 +55,16 @@ type xpuManagerTopologyMatrixCell struct {
}
type xpuManagerSidecar struct {
getMetricsData func() []byte
tmpDirPrefix string
dstFilePath string
labelNamespace string
url string
interval uint64
startDelay uint64
xpumPort uint64
laneCount uint64
getMetricsData func() []byte
tmpDirPrefix string
dstFilePath string
labelNamespace string
url string
interval uint64
startDelay uint64
xpumPort uint64
laneCount uint64
allowSubdevicelessLinks bool
}
func (e *invalidEntryErr) Error() string {
@ -108,7 +109,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
return resBody
}
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) {
cell := createInvalidTopologyCell()
for _, label := range labels {
@ -118,7 +119,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT
klog.V(5).Info(name, " ", strVal)
// xelinks should be on subdevices, unless subdeviceless links are explicitly allowed
if name == "local_on_subdevice" && strVal != "true" {
if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" {
return cell, &invalidEntryErr{}
}
@ -193,7 +194,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI
continue
}
cell, err := processMetricsLabels(metric.Label)
cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks)
if err == nil {
klog.V(5).Info("topology entry: ", cell)
topologyInfos = append(topologyInfos, cell)
@ -367,6 +368,7 @@ func main() {
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)")
klog.InitFlags(nil)
flag.Parse()

View File

@ -23,10 +23,11 @@ import (
)
type testCase struct {
name string
metricsData []string
expectedLabels []string
minLaneCount int
name string
metricsData []string
expectedLabels []string
minLaneCount int
allowSubdeviceless bool
}
func createTestCases() []testCase {
@ -59,12 +60,25 @@ func createTestCases() []testCase {
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
},
{
name: "Xelinks not on sub devices when it's allowed",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"},
allowSubdeviceless: true,
},
{
name: "Xelinks without lan counts",
minLaneCount: 4,
@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) {
for _, tc := range tcs {
print("Testcase (labeling): ", tc.name, "\n")
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
labels := xms.createLabels(topologyInfos)
@ -224,6 +241,8 @@ func TestIterate(t *testing.T) {
print("Testcase (iterate): ", tc.name, "\n")
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
root, err := os.MkdirTemp("", "test_new_xms")
if err != nil {
t.Fatalf("can't create temporary directory: %+v", err)