mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
xpum sidecar: allow xelinks that are not tied to subdevices
With single-tile GPUs, xelinks are no longer advertised as being on subdevices. Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
parent
53310c2e03
commit
e34e93bd64
@ -55,15 +55,16 @@ type xpuManagerTopologyMatrixCell struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type xpuManagerSidecar struct {
|
type xpuManagerSidecar struct {
|
||||||
getMetricsData func() []byte
|
getMetricsData func() []byte
|
||||||
tmpDirPrefix string
|
tmpDirPrefix string
|
||||||
dstFilePath string
|
dstFilePath string
|
||||||
labelNamespace string
|
labelNamespace string
|
||||||
url string
|
url string
|
||||||
interval uint64
|
interval uint64
|
||||||
startDelay uint64
|
startDelay uint64
|
||||||
xpumPort uint64
|
xpumPort uint64
|
||||||
laneCount uint64
|
laneCount uint64
|
||||||
|
allowSubdevicelessLinks bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func (e *invalidEntryErr) Error() string {
|
func (e *invalidEntryErr) Error() string {
|
||||||
@ -108,7 +109,7 @@ func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
|
|||||||
return resBody
|
return resBody
|
||||||
}
|
}
|
||||||
|
|
||||||
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
|
func processMetricsLabels(labels []*io_prometheus_client.LabelPair, allowNonSubdeviceLinks bool) (xpuManagerTopologyMatrixCell, error) {
|
||||||
cell := createInvalidTopologyCell()
|
cell := createInvalidTopologyCell()
|
||||||
|
|
||||||
for _, label := range labels {
|
for _, label := range labels {
|
||||||
@ -118,7 +119,7 @@ func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerT
|
|||||||
klog.V(5).Info(name, " ", strVal)
|
klog.V(5).Info(name, " ", strVal)
|
||||||
|
|
||||||
// xelinks should always be on subdevices
|
// xelinks must be on subdevices unless subdevice-less links are explicitly allowed
|
||||||
if name == "local_on_subdevice" && strVal != "true" {
|
if !allowNonSubdeviceLinks && name == "local_on_subdevice" && strVal != "true" {
|
||||||
return cell, &invalidEntryErr{}
|
return cell, &invalidEntryErr{}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -193,7 +194,7 @@ func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyI
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
cell, err := processMetricsLabels(metric.Label)
|
cell, err := processMetricsLabels(metric.Label, xms.allowSubdevicelessLinks)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
klog.V(5).Info("topology entry: ", cell)
|
klog.V(5).Info("topology entry: ", cell)
|
||||||
topologyInfos = append(topologyInfos, cell)
|
topologyInfos = append(topologyInfos, cell)
|
||||||
@ -367,6 +368,7 @@ func main() {
|
|||||||
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
|
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
|
||||||
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
|
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
|
||||||
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
|
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
|
||||||
|
flag.BoolVar(&xms.allowSubdevicelessLinks, "allow-subdeviceless-links", false, "allow xelinks that are not tied to subdevices (=1 tile GPUs)")
|
||||||
klog.InitFlags(nil)
|
klog.InitFlags(nil)
|
||||||
|
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
@ -23,10 +23,11 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type testCase struct {
|
type testCase struct {
|
||||||
name string
|
name string
|
||||||
metricsData []string
|
metricsData []string
|
||||||
expectedLabels []string
|
expectedLabels []string
|
||||||
minLaneCount int
|
minLaneCount int
|
||||||
|
allowSubdeviceless bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func createTestCases() []testCase {
|
func createTestCases() []testCase {
|
||||||
@ -59,12 +60,25 @@ func createTestCases() []testCase {
|
|||||||
metricsData: []string{
|
metricsData: []string{
|
||||||
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
|
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
|
||||||
`# TYPE xpum_topology_link gauge`,
|
`# TYPE xpum_topology_link gauge`,
|
||||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
|
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
|
||||||
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
|
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
|
||||||
"",
|
"",
|
||||||
},
|
},
|
||||||
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
|
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "Xelinks not on sub devices when it's allowed",
|
||||||
|
minLaneCount: 4,
|
||||||
|
metricsData: []string{
|
||||||
|
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
|
||||||
|
`# TYPE xpum_topology_link gauge`,
|
||||||
|
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0",lane_count="4"} 1`,
|
||||||
|
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1",lane_count="4"} 1`,
|
||||||
|
"",
|
||||||
|
},
|
||||||
|
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-1.1"},
|
||||||
|
allowSubdeviceless: true,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "Xelinks without lan counts",
|
name: "Xelinks without lan counts",
|
||||||
minLaneCount: 4,
|
minLaneCount: 4,
|
||||||
@ -208,6 +222,9 @@ func TestLabeling(t *testing.T) {
|
|||||||
for _, tc := range tcs {
|
for _, tc := range tcs {
|
||||||
print("Testcase (labeling): ", tc.name, "\n")
|
print("Testcase (labeling): ", tc.name, "\n")
|
||||||
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
||||||
|
|
||||||
|
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
|
||||||
|
|
||||||
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
|
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
|
||||||
|
|
||||||
labels := xms.createLabels(topologyInfos)
|
labels := xms.createLabels(topologyInfos)
|
||||||
@ -224,6 +241,8 @@ func TestIterate(t *testing.T) {
|
|||||||
print("Testcase (iterate): ", tc.name, "\n")
|
print("Testcase (iterate): ", tc.name, "\n")
|
||||||
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
||||||
|
|
||||||
|
xms.allowSubdevicelessLinks = tc.allowSubdeviceless
|
||||||
|
|
||||||
root, err := os.MkdirTemp("", "test_new_xms")
|
root, err := os.MkdirTemp("", "test_new_xms")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("can't create temporary directory: %+v", err)
|
t.Fatalf("can't create temporary directory: %+v", err)
|
||||||
|
Loading…
Reference in New Issue
Block a user