mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
xpu-manager sidecar to provide XeLink details to node labels
Fetches XeLink topology information from xpu-manager's REST interface and stores it as labels under NFD's features.d directory. NFD then assigns the labels to the node. On exit, the sidecar removes the label file from disk. Co-authored-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com> Signed-off-by: Tuomas Katila <tuomas.katila@intel.com>
This commit is contained in:
parent
1380d24ee9
commit
3922aa111e
1
.github/workflows/ci.yaml
vendored
1
.github/workflows/ci.yaml
vendored
@ -122,6 +122,7 @@ jobs:
|
||||
- intel-idxd-config-initcontainer
|
||||
- intel-dlb-plugin
|
||||
- intel-dlb-initcontainer
|
||||
- intel-xpumanager-sidecar
|
||||
|
||||
# Demo images
|
||||
- crypto-perf
|
||||
|
63
build/docker/intel-xpumanager-sidecar.Dockerfile
Normal file
63
build/docker/intel-xpumanager-sidecar.Dockerfile
Normal file
@ -0,0 +1,63 @@
|
||||
## This is a generated file, do not edit directly. Edit build/docker/templates/intel-xpumanager-sidecar.Dockerfile.in instead.
|
||||
##
|
||||
## Copyright 2022 Intel Corporation. All Rights Reserved.
|
||||
##
|
||||
## Licensed under the Apache License, Version 2.0 (the "License");
|
||||
## you may not use this file except in compliance with the License.
|
||||
## You may obtain a copy of the License at
|
||||
##
|
||||
## http://www.apache.org/licenses/LICENSE-2.0
|
||||
##
|
||||
## Unless required by applicable law or agreed to in writing, software
|
||||
## distributed under the License is distributed on an "AS IS" BASIS,
|
||||
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
## See the License for the specific language governing permissions and
|
||||
## limitations under the License.
|
||||
###
|
||||
ARG CMD=xpumanager_sidecar
|
||||
## FINAL_BASE can be used to configure the base image of the final image.
|
||||
##
|
||||
## This is used in two ways:
|
||||
## 1) make <image-name> BUILDER=<docker|buildah>
|
||||
## 2) docker build ... -f <image-name>.Dockerfile
|
||||
##
|
||||
## The project default is 1) which sets FINAL_BASE=gcr.io/distroless/static
|
||||
## (see build-image.sh).
|
||||
## 2) and the default FINAL_BASE is primarily used to build Redhat Certified Openshift Operator container images that must be UBI based.
|
||||
## The RedHat build tool does not allow additional image build parameters.
|
||||
ARG FINAL_BASE=registry.access.redhat.com/ubi8-micro:latest
|
||||
###
|
||||
##
|
||||
## GOLANG_BASE can be used to make the build reproducible by choosing an
|
||||
## image by its hash:
|
||||
## GOLANG_BASE=golang@sha256:9d64369fd3c633df71d7465d67d43f63bb31192193e671742fa1c26ebc3a6210
|
||||
##
|
||||
## This is used on release branches before tagging a stable version.
|
||||
## The main branch defaults to using the latest Golang base image.
|
||||
ARG GOLANG_BASE=golang:1.19-bullseye
|
||||
###
|
||||
FROM ${GOLANG_BASE} as builder
|
||||
ARG DIR=/intel-device-plugins-for-kubernetes
|
||||
ARG GO111MODULE=on
|
||||
ARG BUILDFLAGS="-ldflags=-w -s"
|
||||
ARG GOLICENSES_VERSION
|
||||
ARG EP=/usr/local/bin/intel_xpumanager_sidecar
|
||||
ARG CMD
|
||||
WORKDIR ${DIR}
|
||||
COPY . .
|
||||
RUN (cd cmd/${CMD}; GO111MODULE=${GO111MODULE} CGO_ENABLED=0 go install "${BUILDFLAGS}") && install -D /go/bin/${CMD} /install_root${EP}
|
||||
RUN install -D ${DIR}/LICENSE /install_root/licenses/intel-device-plugins-for-kubernetes/LICENSE \
|
||||
&& if [ ! -d "licenses/$CMD" ] ; then \
|
||||
GO111MODULE=on go run github.com/google/go-licenses@${GOLICENSES_VERSION} save "./cmd/$CMD" \
|
||||
--save_path /install_root/licenses/$CMD/go-licenses ; \
|
||||
else mkdir -p /install_root/licenses/$CMD/go-licenses/ && cd licenses/$CMD && cp -r * /install_root/licenses/$CMD/go-licenses/ ; fi
|
||||
###
|
||||
FROM ${FINAL_BASE}
|
||||
COPY --from=builder /install_root /
|
||||
ENTRYPOINT ["/usr/local/bin/intel_xpumanager_sidecar"]
|
||||
LABEL vendor='Intel®'
|
||||
LABEL version='devel'
|
||||
LABEL release='1'
|
||||
LABEL name='intel-xpumanager-sidecar'
|
||||
LABEL summary='Intel® xpumanager sidecar'
|
||||
LABEL description='The xpumanager sidecar creates NFD labels from xpumanager topology information'
|
@ -0,0 +1,8 @@
|
||||
#define _ENTRYPOINT_ /usr/local/bin/intel_xpumanager_sidecar
|
||||
ARG CMD=xpumanager_sidecar
|
||||
|
||||
#include "default_plugin.docker"
|
||||
|
||||
LABEL name='intel-xpumanager-sidecar'
|
||||
LABEL summary='Intel® xpumanager sidecar'
|
||||
LABEL description='The xpumanager sidecar creates NFD labels from xpumanager topology information'
|
411
cmd/xpumanager_sidecar/main.go
Normal file
411
cmd/xpumanager_sidecar/main.go
Normal file
@ -0,0 +1,411 @@
|
||||
// Copyright 2022 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path"
|
||||
"reflect"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
|
||||
"k8s.io/klog/v2"
|
||||
|
||||
io_prometheus_client "github.com/prometheus/client_model/go"
|
||||
"github.com/prometheus/common/expfmt"
|
||||
)
|
||||
|
||||
const (
	// labelMaxLength is the maximum chunk length handed to
	// pluginutils.SplitAtLastAlphaNum when the link list is split into labels.
	labelMaxLength = 63
	// xeLinkLabelName is the label name appended to the label namespace.
	xeLinkLabelName = "xe-links"
	// pureXeLinkMetricValue is the xpum_topology_link sample value that
	// GetTopologyFromXPUMMetrics accepts as an xelink.
	pureXeLinkMetricValue = 1
	// labelControlChar is the control character handed to
	// pluginutils.SplitAtLastAlphaNum.
	labelControlChar = "Z"
)

type (
	// invalidEntryErr is returned by processMetricsLabels for metric entries
	// that cannot be turned into a topology cell.
	invalidEntryErr struct{}

	// xpuManagerTopologyMatrixCell holds the link endpoints and lane count
	// parsed from the labels of one xpum_topology_link sample.
	xpuManagerTopologyMatrixCell struct {
		LocalDeviceID     int
		LocalSubdeviceID  int
		RemoteDeviceID    int
		RemoteSubdeviceID int
		LaneCount         int
	}

	// xpuManagerSidecar carries the sidecar configuration (filled from the
	// command line flags in main) and the metrics source.
	xpuManagerSidecar struct {
		// getMetricsData returns the raw metrics text; replaceable in tests.
		getMetricsData func() []byte
		// tmpDirPrefix is the directory under which the in-flight label file is created.
		tmpDirPrefix string
		// dstFilePath is the final location of the label file.
		dstFilePath string
		// labelNamespace prefixes every generated label name.
		labelNamespace string
		// url is the metrics endpoint queried by getMetricsDataFromXPUM.
		url string
		// interval and startDelay are given in seconds.
		interval   uint64
		startDelay uint64
		// xpumPort is the xpumanager port used to build url.
		xpumPort uint64
		// laneCount is the minimum lane count a link needs to be labeled.
		laneCount uint64
	}
)

// Error implements the error interface with a fixed message.
func (*invalidEntryErr) Error() string {
	const msg = "metrics entry was invalid for our use"

	return msg
}
|
||||
|
||||
func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte {
|
||||
client := &http.Client{
|
||||
Timeout: 5 * time.Second,
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodGet, xms.url, http.NoBody)
|
||||
if err != nil {
|
||||
klog.Error(err.Error())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
res, err := client.Do(req)
|
||||
if err != nil {
|
||||
klog.Error(err.Error())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
resBody, err := io.ReadAll(res.Body)
|
||||
|
||||
defer res.Body.Close()
|
||||
|
||||
if err != nil {
|
||||
klog.Error(err.Error())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Seems /metrics doesn't add new-line at the end of the last entry
|
||||
// and without this there is an error from TextParser
|
||||
resBody = append(resBody, "\n"...)
|
||||
|
||||
return resBody
|
||||
}
|
||||
|
||||
func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) {
|
||||
cell := createInvalidTopologyCell()
|
||||
|
||||
for _, label := range labels {
|
||||
name := label.GetName()
|
||||
strVal := label.GetValue()
|
||||
|
||||
klog.V(5).Info(name, " ", strVal)
|
||||
|
||||
// xelinks should always be on subdevices
|
||||
if name == "local_on_subdevice" && strVal != "true" {
|
||||
return cell, &invalidEntryErr{}
|
||||
}
|
||||
|
||||
valInt, err := strconv.Atoi(strVal)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
switch name {
|
||||
case "local_device_id":
|
||||
cell.LocalDeviceID = valInt
|
||||
case "local_subdevice_id":
|
||||
cell.LocalSubdeviceID = valInt
|
||||
case "remote_device_id":
|
||||
cell.RemoteDeviceID = valInt
|
||||
case "remote_subdevice_id":
|
||||
cell.RemoteSubdeviceID = valInt
|
||||
case "lane_count":
|
||||
fallthrough
|
||||
case "lan_count":
|
||||
cell.LaneCount = valInt
|
||||
}
|
||||
}
|
||||
|
||||
if !isValidTopologyCell(&cell) {
|
||||
return cell, &invalidEntryErr{}
|
||||
}
|
||||
|
||||
return cell, nil
|
||||
}
|
||||
|
||||
func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyInfos []xpuManagerTopologyMatrixCell) {
|
||||
reader := bytes.NewReader(data)
|
||||
|
||||
var parser expfmt.TextParser
|
||||
families, err := parser.TextToMetricFamilies(reader)
|
||||
|
||||
if err != nil {
|
||||
klog.Error(err.Error())
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
for name, family := range families {
|
||||
klog.V(4).Info("parsing family: " + name)
|
||||
|
||||
if name != "xpum_topology_link" {
|
||||
klog.V(5).Info("... skipping")
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
for _, metric := range family.Metric {
|
||||
value := -1.0
|
||||
|
||||
klog.V(5).Info(metric)
|
||||
|
||||
if metric.Gauge != nil {
|
||||
klog.V(5).Info("metric is of type gauge")
|
||||
|
||||
value = *metric.Gauge.Value
|
||||
} else if metric.Untyped != nil {
|
||||
klog.V(5).Info("metric is of type untyped")
|
||||
value = *metric.Untyped.Value
|
||||
} else {
|
||||
klog.Warningf("Unknown/unsupported metric type: %v", metric)
|
||||
}
|
||||
|
||||
if value != pureXeLinkMetricValue {
|
||||
klog.V(5).Info("... not xelink")
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
cell, err := processMetricsLabels(metric.Label)
|
||||
if err == nil {
|
||||
klog.V(5).Info("topology entry: ", cell)
|
||||
topologyInfos = append(topologyInfos, cell)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
klog.V(5).Info("topology entries: ", len(topologyInfos))
|
||||
|
||||
return topologyInfos
|
||||
}
|
||||
|
||||
func (xms *xpuManagerSidecar) iterate() {
|
||||
metricsData := xms.getMetricsData()
|
||||
topologyInfos := xms.GetTopologyFromXPUMMetrics(metricsData)
|
||||
|
||||
labels := xms.createLabels(topologyInfos)
|
||||
|
||||
if !xms.compareLabels(labels) {
|
||||
xms.writeLabels(labels)
|
||||
} else {
|
||||
klog.V(2).Info("labels have not changed")
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Move this function under internal/pluginutils.
|
||||
func (xms *xpuManagerSidecar) writeLabels(labels []string) {
|
||||
root, err := os.MkdirTemp(xms.tmpDirPrefix, "xpumsidecar")
|
||||
if err != nil {
|
||||
klog.Errorf("can't create temporary directory: %+v", err)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
defer os.RemoveAll(root)
|
||||
|
||||
filePath := path.Join(root, "xpum-sidecar-labels.txt")
|
||||
|
||||
file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
||||
|
||||
if err != nil {
|
||||
klog.Error(err.Error())
|
||||
}
|
||||
|
||||
writer := bufio.NewWriter(file)
|
||||
|
||||
for _, label := range labels {
|
||||
_, _ = writer.WriteString(label + "\n")
|
||||
}
|
||||
|
||||
writer.Flush()
|
||||
file.Close()
|
||||
|
||||
// move tmp file to dst file
|
||||
err = os.Rename(filePath, xms.dstFilePath)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to rename tmp file to dst file: %+v", err)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
for _, label := range labels {
|
||||
klog.V(2).Infof("%v\n", label)
|
||||
}
|
||||
}
|
||||
|
||||
// compareLabels returns true, if the labels at dstFilePath are equal to given labels.
|
||||
func (xms *xpuManagerSidecar) compareLabels(labels []string) bool {
|
||||
file, err := os.OpenFile(xms.dstFilePath, os.O_RDONLY, 0644)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
fileLabels := []string{}
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
for scanner.Scan() {
|
||||
fileLabels = append(fileLabels, scanner.Text())
|
||||
}
|
||||
|
||||
return reflect.DeepEqual(fileLabels, labels)
|
||||
}
|
||||
|
||||
func createInvalidTopologyCell() xpuManagerTopologyMatrixCell {
|
||||
cell := xpuManagerTopologyMatrixCell{}
|
||||
|
||||
cell.LaneCount = -1
|
||||
cell.LocalDeviceID = -1
|
||||
cell.LocalSubdeviceID = -1
|
||||
cell.RemoteDeviceID = -1
|
||||
cell.RemoteSubdeviceID = -1
|
||||
|
||||
return cell
|
||||
}
|
||||
|
||||
func isValidTopologyCell(cell *xpuManagerTopologyMatrixCell) bool {
|
||||
return (cell.LaneCount >= 0 && cell.LocalDeviceID >= 0 &&
|
||||
cell.LocalSubdeviceID >= 0 && cell.RemoteDeviceID >= 0 &&
|
||||
cell.RemoteSubdeviceID >= 0)
|
||||
}
|
||||
|
||||
func (xms *xpuManagerSidecar) createLabels(topologyInfos []xpuManagerTopologyMatrixCell) []string {
|
||||
links := ""
|
||||
separator := ""
|
||||
|
||||
submitted := map[string]int{}
|
||||
|
||||
cellToString := func(cell xpuManagerTopologyMatrixCell) string {
|
||||
if cell.LocalDeviceID < cell.RemoteDeviceID {
|
||||
return strconv.Itoa(cell.LocalDeviceID) + "." + strconv.Itoa(cell.LocalSubdeviceID) + "-" +
|
||||
strconv.Itoa(cell.RemoteDeviceID) + "." + strconv.Itoa(cell.RemoteSubdeviceID)
|
||||
}
|
||||
|
||||
return strconv.Itoa(cell.RemoteDeviceID) + "." + strconv.Itoa(cell.RemoteSubdeviceID) + "-" +
|
||||
strconv.Itoa(cell.LocalDeviceID) + "." + strconv.Itoa(cell.LocalSubdeviceID)
|
||||
}
|
||||
|
||||
for _, ti := range topologyInfos {
|
||||
if ti.LaneCount < int(xms.laneCount) {
|
||||
continue
|
||||
}
|
||||
|
||||
linkString := cellToString(ti)
|
||||
|
||||
count, found := submitted[linkString]
|
||||
if !found {
|
||||
links += separator + linkString
|
||||
separator = "_"
|
||||
}
|
||||
|
||||
count++
|
||||
|
||||
if count > 2 {
|
||||
klog.Warningf("Duplicate links found for: %s (lane count: %d)", linkString, ti.LaneCount)
|
||||
}
|
||||
|
||||
submitted[linkString] = count
|
||||
}
|
||||
|
||||
splitLinks := pluginutils.SplitAtLastAlphaNum(links, labelMaxLength, labelControlChar)
|
||||
|
||||
labels := []string{}
|
||||
|
||||
if len(splitLinks) == 0 {
|
||||
return labels
|
||||
}
|
||||
|
||||
labels = append(labels, xms.labelNamespace+"/"+xeLinkLabelName+"="+splitLinks[0])
|
||||
for i := 1; i < len(splitLinks); i++ {
|
||||
labels = append(labels, xms.labelNamespace+"/"+xeLinkLabelName+strconv.FormatInt(int64(i+1), 10)+"="+splitLinks[i])
|
||||
}
|
||||
|
||||
return labels
|
||||
}
|
||||
|
||||
func createXPUManagerSidecar() *xpuManagerSidecar {
|
||||
xms := xpuManagerSidecar{}
|
||||
|
||||
xms.getMetricsData = xms.getMetricsDataFromXPUM
|
||||
|
||||
return &xms
|
||||
}
|
||||
|
||||
func main() {
|
||||
xms := createXPUManagerSidecar()
|
||||
|
||||
flag.Uint64Var(&xms.interval, "interval", 10, "interval for topology fetching and label writing (seconds, >= 1)")
|
||||
flag.Uint64Var(&xms.startDelay, "startup-delay", 10, "startup delay for first topology fetching and label writing (seconds, >= 0)")
|
||||
flag.Uint64Var(&xms.xpumPort, "xpum-port", 29999, "xpumanager port number")
|
||||
flag.StringVar(&xms.tmpDirPrefix, "tmp-dir-prefix", "/etc/kubernetes/node-feature-discovery/features.d/", "location prefix for a temporary directory (used to store in-flight label file)")
|
||||
flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination")
|
||||
flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink")
|
||||
flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels")
|
||||
klog.InitFlags(nil)
|
||||
|
||||
flag.Parse()
|
||||
|
||||
if xms.interval == 0 {
|
||||
klog.Fatal("zero interval won't work, set it to at least 1")
|
||||
}
|
||||
|
||||
xms.url = fmt.Sprintf("http://127.0.0.1:%d/metrics", xms.xpumPort)
|
||||
|
||||
keepIterating := true
|
||||
|
||||
c := make(chan os.Signal, 1)
|
||||
signal.Notify(c, os.Interrupt)
|
||||
signal.Notify(c, syscall.SIGTERM)
|
||||
|
||||
time.Sleep(time.Duration(xms.startDelay) * time.Second)
|
||||
|
||||
for keepIterating {
|
||||
xms.iterate()
|
||||
|
||||
timeout := time.After(time.Duration(xms.interval) * time.Second)
|
||||
|
||||
select {
|
||||
case <-timeout:
|
||||
continue
|
||||
case <-c:
|
||||
klog.V(2).Info("Interrupt received")
|
||||
|
||||
keepIterating = false
|
||||
}
|
||||
}
|
||||
|
||||
klog.V(2).Info("Removing label file")
|
||||
|
||||
err := os.Remove(xms.dstFilePath)
|
||||
if err != nil {
|
||||
klog.Errorf("Failed to cleanup label file: %+v", err)
|
||||
}
|
||||
|
||||
klog.V(2).Info("Stopping sidecar")
|
||||
}
|
243
cmd/xpumanager_sidecar/main_test.go
Normal file
243
cmd/xpumanager_sidecar/main_test.go
Normal file
@ -0,0 +1,243 @@
|
||||
// Copyright 2022 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// testCase is one table-driven scenario: metrics text lines fed to the
// sidecar, the minimum lane count to configure and the labels expected back.
type testCase struct {
	name           string
	metricsData    []string
	expectedLabels []string
	minLaneCount   int
}

// createTestCases returns the scenarios shared by TestLabeling and
// TestIterate. Each metricsData slice ends with an empty string so the joined
// text ends with a newline.
func createTestCases() []testCase {
	const (
		helpLine = `# HELP xpum_topology_link Connection type fo two GPU tiles`
		typeLine = `# TYPE xpum_topology_link gauge`
		// card1 is the common leading part of the fully labeled samples.
		card1 = `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",`
	)

	// xelink renders a minimal on-subdevice sample with value 1 and a
	// "lan_count" label.
	xelink := func(localDev, localSub, remoteDev, remoteSub, lanes string) string {
		return `xpum_topology_link{local_device_id="` + localDev +
			`",local_on_subdevice="true",local_subdevice_id="` + localSub +
			`",remote_device_id="` + remoteDev +
			`",remote_subdevice_id="` + remoteSub +
			`", lan_count="` + lanes + `"} 1`
	}

	garbage := testCase{
		name:         "Garbage metrics",
		minLaneCount: 4,
		metricsData: []string{
			`xpum_some_other_data{with_some_label]]]]}`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links="},
	}

	noXelinks := testCase{
		name:         "No xelinks reported",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			`xpum_some_other_data{with_some_label="foo"} 42`,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 0`,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 4`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links="},
	}

	notOnSubdevices := testCase{
		name:         "Xelinks not on sub devices",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links="},
	}

	withoutLaneCounts := testCase{
		name:         "Xelinks without lan counts",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1.0`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links="},
	}

	oneXelink := testCase{
		name:         "One xelink",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1.0`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0"},
	}

	oneXelinkWithNonXelink := testCase{
		name:         "One xelink with non xelink",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			`xpum_topology_link{local_device_id="99",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 0`,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1.0`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0"},
	}

	crossLinked := testCase{
		name:         "Cross linked subdevs",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			card1 + `local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`,
			card1 + `local_device_id="1",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="4"} 1`,
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.1_0.1-1.0"},
	}

	oneToMany := testCase{
		name:         "One to many",
		minLaneCount: 4,
		metricsData: []string{
			xelink("0", "0", "1", "0", "4"),
			// The only sample that spells the label "lane_count".
			`xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="2", lane_count="4"} 1`,
			xelink("0", "0", "3", "0", "4"),
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-2.2_0.0-3.0"},
	}

	manyToMany := testCase{
		name:         "Many to many",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			xelink("0", "0", "2", "0", "4"),
			xelink("1", "0", "3", "0", "4"),
			xelink("3", "1", "0", "1", "4"),
			xelink("2", "1", "1", "1", "4"),
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-2.0_1.0-3.0_0.1-3.1_1.1-2.1"},
	}

	tooFewLanes := testCase{
		name:         "Too few lanes",
		minLaneCount: 8,
		metricsData: []string{
			helpLine,
			typeLine,
			xelink("0", "0", "2", "0", "4"),
			xelink("1", "0", "3", "0", "8"),
			xelink("3", "1", "0", "1", "8"),
			xelink("2", "1", "1", "1", "4"),
			"",
		},
		expectedLabels: []string{"xpumanager.intel.com/xe-links=1.0-3.0_0.1-3.1"},
	}

	multiLine := testCase{
		name:         "Multi line label",
		minLaneCount: 4,
		metricsData: []string{
			helpLine,
			typeLine,
			xelink("0", "0", "2", "0", "4"),
			xelink("1", "0", "3", "0", "8"),
			xelink("3", "1", "0", "1", "8"),
			xelink("2", "1", "1", "1", "4"),

			xelink("4", "0", "2", "0", "4"),
			xelink("4", "1", "3", "0", "8"),
			xelink("5", "0", "0", "1", "8"),
			xelink("5", "1", "1", "1", "4"),

			xelink("6", "0", "2", "0", "4"),
			xelink("6", "1", "3", "0", "8"),
			xelink("7", "0", "0", "1", "8"),
			xelink("7", "1", "1", "1", "4"),

			xelink("8", "0", "2", "0", "4"),
			xelink("8", "1", "3", "0", "8"),
			xelink("9", "0", "0", "1", "8"),
			xelink("9", "1", "1", "1", "4"),

			"",
		},
		expectedLabels: []string{
			"xpumanager.intel.com/xe-links=0.0-2.0_1.0-3.0_0.1-3.1_1.1-2.1_2.0-4.0_3.0-4.1_0.1-5.0_1.1-5.1",
			"xpumanager.intel.com/xe-links2=Z_2.0-6.0_3.0-6.1_0.1-7.0_1.1-7.1_2.0-8.0_3.0-8.1_0.1-9.0_1.1-9",
			"xpumanager.intel.com/xe-links3=Z.1",
		},
	}

	return []testCase{
		garbage,
		noXelinks,
		notOnSubdevices,
		withoutLaneCounts,
		oneXelink,
		oneXelinkWithNonXelink,
		crossLinked,
		oneToMany,
		manyToMany,
		tooFewLanes,
		multiLine,
	}
}
|
||||
|
||||
func (tc *testCase) createFakeXMS(data []string, minLaneCount int) *xpuManagerSidecar {
|
||||
bytes := []byte(strings.Join(data, "\n"))
|
||||
|
||||
metricsGetter := func() []byte {
|
||||
return bytes
|
||||
}
|
||||
|
||||
xms := createXPUManagerSidecar()
|
||||
xms.getMetricsData = metricsGetter
|
||||
xms.laneCount = uint64(minLaneCount)
|
||||
xms.labelNamespace = "xpumanager.intel.com"
|
||||
|
||||
return xms
|
||||
}
|
||||
|
||||
func TestLabeling(t *testing.T) {
|
||||
tcs := createTestCases()
|
||||
|
||||
for _, tc := range tcs {
|
||||
print("Testcase (labeling): ", tc.name, "\n")
|
||||
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
||||
topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n")))
|
||||
|
||||
labels := xms.createLabels(topologyInfos)
|
||||
if !reflect.DeepEqual(labels, tc.expectedLabels) {
|
||||
t.Errorf("got %v expected %v\n", labels, tc.expectedLabels)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestIterate(t *testing.T) {
|
||||
tcs := createTestCases()
|
||||
|
||||
for _, tc := range tcs {
|
||||
print("Testcase (iterate): ", tc.name, "\n")
|
||||
xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount)
|
||||
|
||||
root, err := os.MkdirTemp("", "test_new_xms")
|
||||
if err != nil {
|
||||
t.Fatalf("can't create temporary directory: %+v", err)
|
||||
}
|
||||
// dirs/files need to be removed for the next test
|
||||
defer os.RemoveAll(root)
|
||||
|
||||
xms.tmpDirPrefix = root
|
||||
xms.dstFilePath = filepath.Join(root, "labels.txt")
|
||||
|
||||
xms.iterate()
|
||||
|
||||
if !xms.compareLabels(tc.expectedLabels) {
|
||||
t.Errorf("output file didn't have expected labels\n")
|
||||
}
|
||||
}
|
||||
}
|
34
deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml
Normal file
34
deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml
Normal file
@ -0,0 +1,34 @@
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
labels:
|
||||
app: intel-xpumanager
|
||||
name: intel-xpumanager
|
||||
spec:
|
||||
template:
|
||||
spec:
|
||||
volumes:
|
||||
- name: features-d
|
||||
hostPath:
|
||||
path: "/etc/kubernetes/node-feature-discovery/features.d/"
|
||||
containers:
|
||||
- name: xelink-sidecar
|
||||
image: intel/intel-xpumanager-sidecar:devel
|
||||
imagePullPolicy: Always
|
||||
args:
|
||||
- -v=2
|
||||
volumeMounts:
|
||||
- name: features-d
|
||||
mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsUser: 0
|
||||
- name: xpumd
|
||||
resources:
|
||||
limits:
|
||||
$patch: replace
|
||||
gpu.intel.com/i915_monitoring: 1
|
6
deployments/xpumanager_sidecar/kustomization.yaml
Normal file
6
deployments/xpumanager_sidecar/kustomization.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
resources:
|
||||
# XeLink topology information is only available from >= 1.x.y release
|
||||
- https://raw.githubusercontent.com/intel/xpumanager/v1.2.0_golden/deployment/kubernetes/daemonset-intel-xpum.yaml
|
||||
namespace: monitoring
|
||||
patchesStrategicMerge:
|
||||
- kustom/kustom_xpumanager.yaml
|
4
go.mod
4
go.mod
@ -12,6 +12,8 @@ require (
|
||||
github.com/onsi/ginkgo/v2 v2.6.0
|
||||
github.com/onsi/gomega v1.24.1
|
||||
github.com/pkg/errors v0.9.1
|
||||
github.com/prometheus/client_model v0.3.0
|
||||
github.com/prometheus/common v0.37.0
|
||||
golang.org/x/sys v0.4.0
|
||||
golang.org/x/text v0.6.0
|
||||
google.golang.org/grpc v1.51.0
|
||||
@ -63,8 +65,6 @@ require (
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/opencontainers/go-digest v1.0.0 // indirect
|
||||
github.com/prometheus/client_golang v1.14.0 // indirect
|
||||
github.com/prometheus/client_model v0.3.0 // indirect
|
||||
github.com/prometheus/common v0.37.0 // indirect
|
||||
github.com/prometheus/procfs v0.8.0 // indirect
|
||||
github.com/spf13/cobra v1.6.0 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
|
Loading…
Reference in New Issue
Block a user