xpu-manager sidecar to provide XeLink details to node labels

Fetches XeLink topology information from xpu-manager's REST
interface and stores it as labels under NFD's features.d directory.
NFD then assigns the labels to the node. On exit, the sidecar
removes the label file from disk.

Co-authored-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
Tuomas Katila 2022-12-20 09:43:10 +02:00
parent 1380d24ee9
commit 3922aa111e
8 changed files with 768 additions and 2 deletions

View File

@ -122,6 +122,7 @@ jobs:
- intel-idxd-config-initcontainer
- intel-dlb-plugin
- intel-dlb-initcontainer
- intel-xpumanager-sidecar
# Demo images
- crypto-perf

View File

@ -0,0 +1,63 @@
## This is a generated file, do not edit directly. Edit build/docker/templates/intel-xpumanager-sidecar.Dockerfile.in instead.
##
## Copyright 2022 Intel Corporation. All Rights Reserved.
##
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
## http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
###
## CMD names the directory under cmd/ that is built (see the builder stage,
## which re-declares it with a bare "ARG CMD" to bring it into scope).
ARG CMD=xpumanager_sidecar
## FINAL_BASE can be used to configure the base image of the final image.
##
## This is used in two ways:
## 1) make <image-name> BUILDER=<docker|buildah>
## 2) docker build ... -f <image-name>.Dockerfile
##
## The project default is 1), which sets FINAL_BASE=gcr.io/distroless/static
## (see build-image.sh).
## Option 2), which uses the default FINAL_BASE below, is primarily used to build Red Hat Certified OpenShift Operator container images, which must be UBI based.
## The Red Hat build tool does not allow additional image build parameters.
ARG FINAL_BASE=registry.access.redhat.com/ubi8-micro:latest
###
##
## GOLANG_BASE can be used to make the build reproducible by choosing an
## image by its hash:
## GOLANG_BASE=golang@sha256:9d64369fd3c633df71d7465d67d43f63bb31192193e671742fa1c26ebc3a6210
##
## This is used on release branches before tagging a stable version.
## The main branch defaults to using the latest Golang base image.
ARG GOLANG_BASE=golang:1.19-bullseye
###
## Builder stage: compiles cmd/${CMD} and stages the binary and the license
## files under /install_root, which the final stage copies to /.
FROM ${GOLANG_BASE} as builder
ARG DIR=/intel-device-plugins-for-kubernetes
ARG GO111MODULE=on
ARG BUILDFLAGS="-ldflags=-w -s"
ARG GOLICENSES_VERSION
## EP is the install path of the binary inside the image; it matches the
## ENTRYPOINT of the final stage.
ARG EP=/usr/local/bin/intel_xpumanager_sidecar
ARG CMD
WORKDIR ${DIR}
COPY . .
## Build with CGO disabled and install the resulting binary as /install_root${EP}.
RUN (cd cmd/${CMD}; GO111MODULE=${GO111MODULE} CGO_ENABLED=0 go install "${BUILDFLAGS}") && install -D /go/bin/${CMD} /install_root${EP}
## Stage the project LICENSE. Dependency licenses are collected with go-licenses
## unless a licenses/$CMD directory already exists in the source tree, in which
## case its contents are copied instead.
RUN install -D ${DIR}/LICENSE /install_root/licenses/intel-device-plugins-for-kubernetes/LICENSE \
&& if [ ! -d "licenses/$CMD" ] ; then \
GO111MODULE=on go run github.com/google/go-licenses@${GOLICENSES_VERSION} save "./cmd/$CMD" \
--save_path /install_root/licenses/$CMD/go-licenses ; \
else mkdir -p /install_root/licenses/$CMD/go-licenses/ && cd licenses/$CMD && cp -r * /install_root/licenses/$CMD/go-licenses/ ; fi
###
## Final stage: only the contents of /install_root from the builder stage
## (the binary and the license files) are copied into the runtime image.
FROM ${FINAL_BASE}
COPY --from=builder /install_root /
ENTRYPOINT ["/usr/local/bin/intel_xpumanager_sidecar"]
## Image metadata labels.
LABEL vendor='Intel®'
LABEL version='devel'
LABEL release='1'
LABEL name='intel-xpumanager-sidecar'
LABEL summary='Intel® xpumanager sidecar'
LABEL description='The xpumanager sidecar creates NFD labels from xpumanager topology information'

View File

@ -0,0 +1,8 @@
#define _ENTRYPOINT_ /usr/local/bin/intel_xpumanager_sidecar
ARG CMD=xpumanager_sidecar
#include "default_plugin.docker"
LABEL name='intel-xpumanager-sidecar'
LABEL summary='Intel® xpumanager sidecar'
LABEL description='The xpumanager sidecar creates NFD labels from xpumanager topology information'

View File

@ -0,0 +1,411 @@
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"bufio"
"bytes"
"context"
"flag"
"fmt"
"io"
"net/http"
"os"
"os/signal"
"path"
"reflect"
"strconv"
"syscall"
"time"
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
"k8s.io/klog/v2"
io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/prometheus/common/expfmt"
)
// invalidEntryErr is returned by processMetricsLabels when a metric's labels
// do not describe a usable topology entry.
type invalidEntryErr struct{}
const (
// labelMaxLength is the maximum length passed to
// pluginutils.SplitAtLastAlphaNum when splitting the link list into label values.
labelMaxLength = 63
// xeLinkLabelName is the label name placed after "<namespace>/"; additional
// labels get a numeric suffix starting from 2.
xeLinkLabelName = "xe-links"
// pureXeLinkMetricValue is the xpum_topology_link metric value an entry must
// have to be treated as an xelink.
pureXeLinkMetricValue = 1
// labelControlChar is passed to pluginutils.SplitAtLastAlphaNum together with
// labelMaxLength. NOTE(review): its exact role is defined by that helper,
// which is not visible in this file.
labelControlChar = "Z"
)
// xpuManagerTopologyMatrixCell describes one link entry parsed from the labels
// of an xpum_topology_link metric. Every field is initialized to -1 by
// createInvalidTopologyCell and must be >= 0 for the cell to be valid.
type xpuManagerTopologyMatrixCell struct {
LocalDeviceID int // from the "local_device_id" label
LocalSubdeviceID int // from the "local_subdevice_id" label
RemoteDeviceID int // from the "remote_device_id" label
RemoteSubdeviceID int // from the "remote_subdevice_id" label
LaneCount int // from the "lane_count" (or "lan_count") label
}
// xpuManagerSidecar holds the configuration and the metrics source used to
// turn xpumanager topology metrics into a label file.
type xpuManagerSidecar struct {
// getMetricsData returns the raw metrics text. createXPUManagerSidecar sets
// it to getMetricsDataFromXPUM; tests replace it with canned data.
getMetricsData func() []byte
// tmpDirPrefix is the directory under which writeLabels creates its
// temporary directory.
tmpDirPrefix string
// dstFilePath is the final location of the label file.
dstFilePath string
// labelNamespace is the part of each label name before the "/".
labelNamespace string
// url is the metrics endpoint queried by getMetricsDataFromXPUM.
url string
// interval is the pause between iterations, in seconds.
interval uint64
// startDelay is the pause before the first iteration, in seconds.
startDelay uint64
// xpumPort is the xpumanager port used to build url.
xpumPort uint64
// laneCount is the minimum lane count a link needs to be included in labels.
laneCount uint64
}
// Error implements the error interface; the message is the same for every
// invalid entry.
func (e *invalidEntryErr) Error() string {
	const msg = "metrics entry was invalid for our use"

	return msg
}
// getMetricsDataFromXPUM fetches the raw metrics text from xms.url.
//
// It returns nil, after logging the reason, when the request cannot be
// created or performed, when the server answers with a status other than
// 200 OK, or when the body cannot be read. On success the returned data
// always ends with a new-line.
func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
	client := &http.Client{
		Timeout: 5 * time.Second,
	}

	ctx := context.Background()

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, xms.url, http.NoBody)
	if err != nil {
		klog.Error(err.Error())
		return nil
	}

	res, err := client.Do(req)
	if err != nil {
		klog.Error(err.Error())
		return nil
	}
	// Registered right after the error check so every return path below
	// releases the connection.
	defer res.Body.Close()

	// An error response must not be handed to the metrics parser as if it
	// were metrics data.
	if res.StatusCode != http.StatusOK {
		klog.Errorf("unexpected response status from %s: %s", xms.url, res.Status)
		return nil
	}

	resBody, err := io.ReadAll(res.Body)
	if err != nil {
		klog.Error(err.Error())
		return nil
	}

	// Seems /metrics doesn't add new-line at the end of the last entry
	// and without this there is an error from TextParser
	resBody = append(resBody, "\n"...)

	return resBody
}
// processMetricsLabels builds a topology cell from the label pairs of a single
// metric.
//
// An entry whose "local_on_subdevice" label is anything other than "true" is
// rejected immediately. Labels with non-integer values are ignored. Both
// "lane_count" and "lan_count" populate the lane count. An *invalidEntryErr is
// returned when the entry is rejected or when any cell field was left unset.
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
	cell := createInvalidTopologyCell()

	for _, pair := range labels {
		key, raw := pair.GetName(), pair.GetValue()

		klog.V(5).Info(key, " ", raw)

		// xelinks should always be on subdevices
		if key == "local_on_subdevice" && raw != "true" {
			return cell, &invalidEntryErr{}
		}

		num, convErr := strconv.Atoi(raw)
		if convErr != nil {
			// Not a numeric label, so it cannot be one of the cell fields.
			continue
		}

		switch key {
		case "local_device_id":
			cell.LocalDeviceID = num
		case "local_subdevice_id":
			cell.LocalSubdeviceID = num
		case "remote_device_id":
			cell.RemoteDeviceID = num
		case "remote_subdevice_id":
			cell.RemoteSubdeviceID = num
		case "lane_count", "lan_count":
			cell.LaneCount = num
		}
	}

	if isValidTopologyCell(&cell) {
		return cell, nil
	}

	return cell, &invalidEntryErr{}
}
// GetTopologyFromXPUMMetrics parses metrics text and returns one topology cell
// for every "xpum_topology_link" gauge or untyped metric whose value equals
// pureXeLinkMetricValue and whose labels form a valid cell.
//
// nil is returned when the text cannot be parsed.
func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyInfos []xpuManagerTopologyMatrixCell) {
	var parser expfmt.TextParser

	families, err := parser.TextToMetricFamilies(bytes.NewReader(data))
	if err != nil {
		klog.Error(err.Error())
		return nil
	}

	for familyName, family := range families {
		klog.V(4).Info("parsing family: " + familyName)

		if familyName != "xpum_topology_link" {
			klog.V(5).Info("... skipping")
			continue
		}

		for _, metric := range family.Metric {
			// Stays at -1 for unsupported metric types, which never matches
			// the xelink value below.
			linkType := -1.0

			klog.V(5).Info(metric)

			switch {
			case metric.Gauge != nil:
				klog.V(5).Info("metric is of type gauge")
				linkType = *metric.Gauge.Value
			case metric.Untyped != nil:
				klog.V(5).Info("metric is of type untyped")
				linkType = *metric.Untyped.Value
			default:
				klog.Warningf("Unknown/unsupported metric type: %v", metric)
			}

			if linkType != pureXeLinkMetricValue {
				klog.V(5).Info("... not xelink")
				continue
			}

			cell, cellErr := processMetricsLabels(metric.Label)
			if cellErr != nil {
				continue
			}

			klog.V(5).Info("topology entry: ", cell)
			topologyInfos = append(topologyInfos, cell)
		}
	}

	klog.V(5).Info("topology entries: ", len(topologyInfos))

	return topologyInfos
}
// iterate runs one fetch/parse/label cycle and rewrites the label file only
// when its contents would differ from what is already at dstFilePath.
func (xms *xpuManagerSidecar) iterate() {
	rawMetrics := xms.getMetricsData()
	cells := xms.GetTopologyFromXPUMMetrics(rawMetrics)
	labels := xms.createLabels(cells)

	if xms.compareLabels(labels) {
		klog.V(2).Info("labels have not changed")
		return
	}

	xms.writeLabels(labels)
}
// writeLabels writes the labels, one per line, to a file in a fresh temporary
// directory under tmpDirPrefix and then renames that file to dstFilePath, so
// the destination is only ever replaced by a completely written file.
//
// Any failure is logged and leaves dstFilePath untouched. The temporary
// directory is removed before returning.
//
// TODO: Move this function under internal/pluginutils.
func (xms *xpuManagerSidecar) writeLabels(labels []string) {
	root, err := os.MkdirTemp(xms.tmpDirPrefix, "xpumsidecar")
	if err != nil {
		klog.Errorf("can't create temporary directory: %+v", err)
		return
	}

	defer os.RemoveAll(root)

	filePath := path.Join(root, "xpum-sidecar-labels.txt")

	file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
	if err != nil {
		klog.Error(err.Error())
		// Without a file there is nothing to write or rename.
		return
	}

	writer := bufio.NewWriter(file)

	for _, label := range labels {
		// A bufio.Writer remembers its first error and reports it again
		// from Flush, so the error is checked once below.
		_, _ = writer.WriteString(label + "\n")
	}

	if err = writer.Flush(); err != nil {
		file.Close()
		klog.Errorf("Failed to write tmp label file: %+v", err)

		return
	}

	if err = file.Close(); err != nil {
		klog.Errorf("Failed to close tmp label file: %+v", err)
		return
	}

	// move tmp file to dst file
	if err = os.Rename(filePath, xms.dstFilePath); err != nil {
		klog.Errorf("Failed to rename tmp file to dst file: %+v", err)
		return
	}

	for _, label := range labels {
		klog.V(2).Infof("%v\n", label)
	}
}
// compareLabels returns true, if the labels at dstFilePath are equal to given labels.
//
// The file is read line by line and compared, in order, against labels. false
// is returned when the file cannot be opened or read, so that the caller
// rewrites it.
func (xms *xpuManagerSidecar) compareLabels(labels []string) bool {
	file, err := os.OpenFile(xms.dstFilePath, os.O_RDONLY, 0644)
	if err != nil {
		return false
	}
	// This runs on every polling interval; without the close each call
	// would leak a file descriptor.
	defer file.Close()

	// Kept non-nil so an empty file compares equal to an empty, non-nil slice.
	fileLabels := []string{}

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		fileLabels = append(fileLabels, scanner.Text())
	}

	// A read error means fileLabels may be incomplete; treat as "different".
	if scanner.Err() != nil {
		return false
	}

	return reflect.DeepEqual(fileLabels, labels)
}
// createInvalidTopologyCell returns a cell with every field set to -1, the
// marker isValidTopologyCell uses for "not yet filled in".
func createInvalidTopologyCell() xpuManagerTopologyMatrixCell {
	return xpuManagerTopologyMatrixCell{
		LocalDeviceID:     -1,
		LocalSubdeviceID:  -1,
		RemoteDeviceID:    -1,
		RemoteSubdeviceID: -1,
		LaneCount:         -1,
	}
}
// isValidTopologyCell reports whether every field of the cell holds a
// non-negative value.
func isValidTopologyCell(cell *xpuManagerTopologyMatrixCell) bool {
	fields := [...]int{
		cell.LaneCount,
		cell.LocalDeviceID,
		cell.LocalSubdeviceID,
		cell.RemoteDeviceID,
		cell.RemoteSubdeviceID,
	}

	for _, value := range fields {
		if value < 0 {
			return false
		}
	}

	return true
}
// createLabels turns topology cells into "<namespace>/xe-links[N]=<value>"
// label lines.
//
// Cells with fewer lanes than xms.laneCount are dropped. Each remaining link
// is written as "<dev>.<subdev>-<dev>.<subdev>" with the endpoint that has the
// lower device ID first, and links are joined with "_" in first-seen order; a
// link string is emitted only once. The joined list is split with
// pluginutils.SplitAtLastAlphaNum: the first part goes to "xe-links", the
// following parts to "xe-links2", "xe-links3", and so on.
func (xms *xpuManagerSidecar) createLabels(topologyInfos []xpuManagerTopologyMatrixCell) []string {
	minLanes := int(xms.laneCount)
	seen := map[string]int{}
	joined := ""

	endpoint := func(device, subdevice int) string {
		return strconv.Itoa(device) + "." + strconv.Itoa(subdevice)
	}

	for _, cell := range topologyInfos {
		if cell.LaneCount < minLanes {
			continue
		}

		local := endpoint(cell.LocalDeviceID, cell.LocalSubdeviceID)
		remote := endpoint(cell.RemoteDeviceID, cell.RemoteSubdeviceID)

		link := remote + "-" + local
		if cell.LocalDeviceID < cell.RemoteDeviceID {
			link = local + "-" + remote
		}

		occurrences, known := seen[link]
		if !known {
			// Every earlier link is recorded in seen, so a non-empty map
			// means this is not the first link in the list.
			if len(seen) > 0 {
				joined += "_"
			}

			joined += link
		}

		occurrences++
		seen[link] = occurrences

		// A link is expected to be reported at most once from each end.
		if occurrences > 2 {
			klog.Warningf("Duplicate links found for: %s (lane count: %d)", link, cell.LaneCount)
		}
	}

	parts := pluginutils.SplitAtLastAlphaNum(joined, labelMaxLength, labelControlChar)
	labelPrefix := xms.labelNamespace + "/" + xeLinkLabelName
	labels := []string{}

	for idx, part := range parts {
		if idx == 0 {
			labels = append(labels, labelPrefix+"="+part)
			continue
		}

		// Continuation labels are numbered from 2.
		labels = append(labels, labelPrefix+strconv.Itoa(idx+1)+"="+part)
	}

	return labels
}
// createXPUManagerSidecar returns a zero-configured sidecar whose metrics
// source is its own getMetricsDataFromXPUM method.
func createXPUManagerSidecar() *xpuManagerSidecar {
	sidecar := new(xpuManagerSidecar)
	sidecar.getMetricsData = sidecar.getMetricsDataFromXPUM

	return sidecar
}
// main parses the command line, waits for the startup delay, and then
// refreshes the label file every interval seconds until SIGINT or SIGTERM is
// received. On exit the label file is removed.
func main() {
	xms := createXPUManagerSidecar()

	flag.Uint64Var(&xms.interval, "interval", 10, "interval for topology fetching and label writing (seconds, >= 1)")
	flag.Uint64Var(&xms.startDelay, "startup-delay", 10, "startup delay for first topology fetching and label writing (seconds, >= 0)")
	flag.Uint64Var(&xms.xpumPort, "xpum-port", 29999, "xpumanager port number")
	flag.StringVar(&xms.tmpDirPrefix, "tmp-dir-prefix", "/etc/kubernetes/node-feature-discovery/features.d/", "location prefix for a temporary directory (used to store in-flight label file)")
	flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
	flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
	flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")

	klog.InitFlags(nil)
	flag.Parse()

	if xms.interval == 0 {
		klog.Fatal("zero interval won't work, set it to at least 1")
	}

	xms.url = fmt.Sprintf("http://127.0.0.1:%d/metrics", xms.xpumPort)

	keepIterating := true

	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt, syscall.SIGTERM)

	// Wait out the startup delay, but stop right away when asked to instead
	// of sleeping through the signal and running a needless iteration.
	select {
	case <-time.After(time.Duration(xms.startDelay) * time.Second):
	case <-c:
		klog.V(2).Info("Interrupt received")

		keepIterating = false
	}

	for keepIterating {
		xms.iterate()

		timeout := time.After(time.Duration(xms.interval) * time.Second)

		select {
		case <-timeout:
			continue
		case <-c:
			klog.V(2).Info("Interrupt received")

			keepIterating = false
		}
	}

	klog.V(2).Info("Removing label file")

	// The file may never have been written (e.g. interrupted during the
	// startup delay); a missing file is not a cleanup failure.
	if err := os.Remove(xms.dstFilePath); err != nil && !os.IsNotExist(err) {
		klog.Errorf("Failed to cleanup label file: %+v", err)
	}

	klog.V(2).Info("Stopping sidecar")
}

View File

@ -0,0 +1,243 @@
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
"os"
"path/filepath"
"reflect"
"strings"
"testing"
)
// testCase is one table entry shared by TestLabeling and TestIterate.
type testCase struct {
// name identifies the case in the test output.
name string
// metricsData holds the metrics text, one line per element; the lines are
// joined with "\n" before being parsed.
metricsData []string
// expectedLabels are the label lines the sidecar must produce, in order.
expectedLabels []string
// minLaneCount is assigned to the sidecar's laneCount.
minLaneCount int
}
// createTestCases returns the table of metrics inputs and the label lines
// expected from them. All expected labels use the "xpumanager.intel.com"
// namespace that createFakeXMS configures.
func createTestCases() []testCase {
return []testCase{
{
// Input that is not valid metrics text still yields a single label
// with an empty value.
name: "Garbage metrics",
minLaneCount: 4,
metricsData: []string{
`xpum_some_other_data{with_some_label]]]]}`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
},
{
// Topology entries whose metric value is not 1 are not xelinks.
name: "No xelinks reported",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_some_other_data{with_some_label="foo"} 42`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 0`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 4`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
},
{
// Entries with local_on_subdevice="false" are rejected even when the
// metric value is 1.
name: "Xelinks not on sub devices",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
},
{
// An entry without a lane count label is incomplete and is dropped.
name: "Xelinks without lan counts",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1.0`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links="},
},
{
name: "One xelink",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1.0`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0"},
},
{
name: "One xelink with non xelink",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{local_device_id="99",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 0`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1.0`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0"},
},
{
// The endpoint with the lower device ID is written first, regardless
// of which side reported the link.
name: "Cross linked subdevs",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
`xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="1",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.1_0.1-1.0"},
},
{
// Mixes the "lan_count" and "lane_count" label spellings; both are
// accepted.
name: "One to many",
minLaneCount: 4,
metricsData: []string{
`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="2", lane_count="4"} 1`,
`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-2.2_0.0-3.0"},
},
{
name: "Many to many",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="1",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="3",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="0",remote_subdevice_id="1", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="2",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-2.0_1.0-3.0_0.1-3.1_1.1-2.1"},
},
{
// With a minimum of 8 lanes only the two 8-lane links remain.
name: "Too few lanes",
minLaneCount: 8,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="1",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="3",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="2",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
"",
},
expectedLabels: []string{"xpumanager.intel.com/xe-links=1.0-3.0_0.1-3.1"},
},
{
// Enough links that the list no longer fits one label value and is
// continued in xe-links2 and xe-links3.
name: "Multi line label",
minLaneCount: 4,
metricsData: []string{
`# HELP xpum_topology_link Connection type fo two GPU tiles`,
`# TYPE xpum_topology_link gauge`,
`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="1",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="3",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="2",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="4",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="4",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="5",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="5",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="6",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="6",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="7",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="7",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="8",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`,
`xpum_topology_link{local_device_id="8",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="9",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`,
`xpum_topology_link{local_device_id="9",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
"",
},
expectedLabels: []string{
"xpumanager.intel.com/xe-links=0.0-2.0_1.0-3.0_0.1-3.1_1.1-2.1_2.0-4.0_3.0-4.1_0.1-5.0_1.1-5.1",
"xpumanager.intel.com/xe-links2=Z_2.0-6.0_3.0-6.1_0.1-7.0_1.1-7.1_2.0-8.0_3.0-8.1_0.1-9.0_1.1-9",
"xpumanager.intel.com/xe-links3=Z.1",
},
},
}
}
// createFakeXMS builds a sidecar that serves the given metrics lines (joined
// with "\n") instead of querying xpumanager, using minLaneCount as the lane
// threshold and "xpumanager.intel.com" as the label namespace.
func (tc *testCase) createFakeXMS(data []string, minLaneCount int) *xpuManagerSidecar {
	payload := []byte(strings.Join(data, "\n"))

	fake := createXPUManagerSidecar()
	fake.laneCount = uint64(minLaneCount)
	fake.labelNamespace = "xpumanager.intel.com"
	fake.getMetricsData = func() []byte {
		return payload
	}

	return fake
}
// TestLabeling checks, for every table entry, that parsing the metrics text
// and building labels from it yields exactly the expected label lines.
//
// Each entry runs as a named subtest so a failure is attributed to its case.
func TestLabeling(t *testing.T) {
	for _, tc := range createTestCases() {
		t.Run(tc.name, func(t *testing.T) {
			xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)

			topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))

			labels := xms.createLabels(topologyInfos)
			if !reflect.DeepEqual(labels, tc.expectedLabels) {
				t.Errorf("got %v expected %v", labels, tc.expectedLabels)
			}
		})
	}
}
// TestIterate runs one full iteration per table entry against a private
// temporary directory and checks that the written label file holds the
// expected labels.
//
// Each entry runs as a subtest so that its deferred cleanup fires before the
// next entry starts, rather than piling up until the whole test returns.
func TestIterate(t *testing.T) {
	for _, tc := range createTestCases() {
		t.Run(tc.name, func(t *testing.T) {
			xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)

			root, err := os.MkdirTemp("", "test_new_xms")
			if err != nil {
				t.Fatalf("can't create temporary directory: %+v", err)
			}
			// dirs/files need to be removed for the next test
			defer os.RemoveAll(root)

			xms.tmpDirPrefix = root
			xms.dstFilePath = filepath.Join(root, "labels.txt")

			xms.iterate()

			if !xms.compareLabels(tc.expectedLabels) {
				t.Errorf("output file didn't have expected labels")
			}
		})
	}
}

View File

@ -0,0 +1,34 @@
# Strategic-merge patch for the intel-xpumanager DaemonSet: adds the XeLink
# sidecar container and replaces the resource limits of the xpumd container.
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app: intel-xpumanager
name: intel-xpumanager
spec:
template:
spec:
volumes:
# Host directory that receives the label file; same path as the
# sidecar's default -dst-file-path directory.
- name: features-d
hostPath:
path: "/etc/kubernetes/node-feature-discovery/features.d/"
containers:
- name: xelink-sidecar
image: intel/intel-xpumanager-sidecar:devel
imagePullPolicy: Always
args:
# Verbosity 2 makes the sidecar log the labels it writes.
- -v=2
volumeMounts:
- name: features-d
mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
# NOTE(review): runs as root, presumably to be able to write into the
# hostPath directory above - confirm.
runAsUser: 0
- name: xpumd
resources:
limits:
# Replace the upstream limits map instead of merging into it.
$patch: replace
gpu.intel.com/i915_monitoring: 1

View File

@ -0,0 +1,6 @@
# Kustomization that takes the upstream xpumanager DaemonSet and applies the
# sidecar patch from kustom/kustom_xpumanager.yaml in the monitoring namespace.
resources:
# XeLink topology information is only available from >= 1.x.y release
# NOTE(review): "1.x.y" above is a placeholder for the minimum xpumanager
# release; the base below pins v1.2.0_golden.
- https://raw.githubusercontent.com/intel/xpumanager/v1.2.0_golden/deployment/kubernetes/daemonset-intel-xpum.yaml
namespace: monitoring
patchesStrategicMerge:
- kustom/kustom_xpumanager.yaml

4
go.mod
View File

@ -12,6 +12,8 @@ require (
github.com/onsi/ginkgo/v2 v2.6.0
github.com/onsi/gomega v1.24.1
github.com/pkg/errors v0.9.1
github.com/prometheus/client_model v0.3.0
github.com/prometheus/common v0.37.0
golang.org/x/sys v0.4.0
golang.org/x/text v0.6.0
google.golang.org/grpc v1.51.0
@ -63,8 +65,6 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/opencontainers/go-digest v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/spf13/cobra v1.6.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect