diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2c090d00..66cdd589 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -122,6 +122,7 @@ jobs: - intel-idxd-config-initcontainer - intel-dlb-plugin - intel-dlb-initcontainer + - intel-xpumanager-sidecar # Demo images - crypto-perf diff --git a/build/docker/intel-xpumanager-sidecar.Dockerfile b/build/docker/intel-xpumanager-sidecar.Dockerfile new file mode 100644 index 00000000..800ba632 --- /dev/null +++ b/build/docker/intel-xpumanager-sidecar.Dockerfile @@ -0,0 +1,63 @@ +## This is a generated file, do not edit directly. Edit build/docker/templates/intel-xpumanager-sidecar.Dockerfile.in instead. +## +## Copyright 2022 Intel Corporation. All Rights Reserved. +## +## Licensed under the Apache License, Version 2.0 (the "License"); +## you may not use this file except in compliance with the License. +## You may obtain a copy of the License at +## +## http://www.apache.org/licenses/LICENSE-2.0 +## +## Unless required by applicable law or agreed to in writing, software +## distributed under the License is distributed on an "AS IS" BASIS, +## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +## See the License for the specific language governing permissions and +## limitations under the License. +### +ARG CMD=xpumanager_sidecar +## FINAL_BASE can be used to configure the base image of the final image. +## +## This is used in two ways: +## 1) make BUILDER= +## 2) docker build ... -f .Dockerfile +## +## The project default is 1) which sets FINAL_BASE=gcr.io/distroless/static +## (see build-image.sh). +## 2) and the default FINAL_BASE is primarily used to build Redhat Certified Openshift Operator container images that must be UBI based. +## The RedHat build tool does not allow additional image build parameters. +ARG FINAL_BASE=registry.access.redhat.com/ubi8-micro:latest +### +## +## GOLANG_BASE can be used to make the build reproducible by choosing an +## image by its hash: +## GOLANG_BASE=golang@sha256:9d64369fd3c633df71d7465d67d43f63bb31192193e671742fa1c26ebc3a6210 +## +## This is used on release branches before tagging a stable version. +## The main branch defaults to using the latest Golang base image. +ARG GOLANG_BASE=golang:1.19-bullseye +### +FROM ${GOLANG_BASE} as builder +ARG DIR=/intel-device-plugins-for-kubernetes +ARG GO111MODULE=on +ARG BUILDFLAGS="-ldflags=-w -s" +ARG GOLICENSES_VERSION +ARG EP=/usr/local/bin/intel_xpumanager_sidecar +ARG CMD +WORKDIR ${DIR} +COPY . . +RUN (cd cmd/${CMD}; GO111MODULE=${GO111MODULE} CGO_ENABLED=0 go install "${BUILDFLAGS}") && install -D /go/bin/${CMD} /install_root${EP} +RUN install -D ${DIR}/LICENSE /install_root/licenses/intel-device-plugins-for-kubernetes/LICENSE \ + && if [ ! -d "licenses/$CMD" ] ; then \ + GO111MODULE=on go run github.com/google/go-licenses@${GOLICENSES_VERSION} save "./cmd/$CMD" \ + --save_path /install_root/licenses/$CMD/go-licenses ; \ + else mkdir -p /install_root/licenses/$CMD/go-licenses/ && cd licenses/$CMD && cp -r * /install_root/licenses/$CMD/go-licenses/ ; fi +### +FROM ${FINAL_BASE} +COPY --from=builder /install_root / +ENTRYPOINT ["/usr/local/bin/intel_xpumanager_sidecar"] +LABEL vendor='Intel®' +LABEL version='devel' +LABEL release='1' +LABEL name='intel-xpumanager-sidecar' +LABEL summary='Intel® xpumanager sidecar' +LABEL description='The xpumanager sidecar creates NFD labels from xpumanager topology information' diff --git a/build/docker/templates/intel-xpumanager-sidecar.Dockerfile.in b/build/docker/templates/intel-xpumanager-sidecar.Dockerfile.in new file mode 100644 index 00000000..0a3f8b30 --- /dev/null +++ b/build/docker/templates/intel-xpumanager-sidecar.Dockerfile.in @@ -0,0 +1,8 @@ +#define _ENTRYPOINT_ /usr/local/bin/intel_xpumanager_sidecar +ARG CMD=xpumanager_sidecar + +#include "default_plugin.docker" + +LABEL name='intel-xpumanager-sidecar' +LABEL summary='Intel® xpumanager sidecar' +LABEL description='The xpumanager sidecar creates NFD labels from xpumanager topology information' diff --git a/cmd/xpumanager_sidecar/main.go b/cmd/xpumanager_sidecar/main.go new file mode 100644 index 00000000..bf608f6f --- /dev/null +++ b/cmd/xpumanager_sidecar/main.go @@ -0,0 +1,411 @@ +// Copyright 2022 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "bufio" + "bytes" + "context" + "flag" + "fmt" + "io" + "net/http" + "os" + "os/signal" + "path" + "reflect" + "strconv" + "syscall" + "time" + + "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils" + "k8s.io/klog/v2" + + io_prometheus_client "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" +) + +type invalidEntryErr struct{} + +const ( + labelMaxLength = 63 + xeLinkLabelName = "xe-links" + pureXeLinkMetricValue = 1 + labelControlChar = "Z" +) + +type xpuManagerTopologyMatrixCell struct { + LocalDeviceID int + LocalSubdeviceID int + RemoteDeviceID int + RemoteSubdeviceID int + LaneCount int +} + +type xpuManagerSidecar struct { + getMetricsData func() []byte + tmpDirPrefix string + dstFilePath string + labelNamespace string + url string + interval uint64 + startDelay uint64 + xpumPort uint64 + laneCount uint64 +} + +func (e *invalidEntryErr) Error() string { + return "metrics entry was invalid for our use" +} + +func (xms *xpuManagerSidecar) getMetricsDataFromXPUM() []byte { + client := &http.Client{ + Timeout: 5 * time.Second, + } + + ctx := context.Background() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, xms.url, http.NoBody) + if err != nil { + klog.Error(err.Error()) + + return nil + } + + res, err := client.Do(req) + if err != nil { + klog.Error(err.Error()) + + return nil + } + + resBody, err := io.ReadAll(res.Body) + + defer res.Body.Close() + + if err != nil { + klog.Error(err.Error()) + + return nil + } + + // Seems /metrics doesn't add new-line at the end of the last entry + // and without this there is an error from TextParser + resBody = append(resBody, "\n"...) + + return resBody +} + +func processMetricsLabels(labels []*io_prometheus_client.LabelPair) (xpuManagerTopologyMatrixCell, error) { + cell := createInvalidTopologyCell() + + for _, label := range labels { + name := label.GetName() + strVal := label.GetValue() + + klog.V(5).Info(name, " ", strVal) + + // xelinks should always be on subdevices + if name == "local_on_subdevice" && strVal != "true" { + return cell, &invalidEntryErr{} + } + + valInt, err := strconv.Atoi(strVal) + if err != nil { + continue + } + + switch name { + case "local_device_id": + cell.LocalDeviceID = valInt + case "local_subdevice_id": + cell.LocalSubdeviceID = valInt + case "remote_device_id": + cell.RemoteDeviceID = valInt + case "remote_subdevice_id": + cell.RemoteSubdeviceID = valInt + case "lane_count": + fallthrough + case "lan_count": + cell.LaneCount = valInt + } + } + + if !isValidTopologyCell(&cell) { + return cell, &invalidEntryErr{} + } + + return cell, nil +} + +func (xms *xpuManagerSidecar) GetTopologyFromXPUMMetrics(data []byte) (topologyInfos []xpuManagerTopologyMatrixCell) { + reader := bytes.NewReader(data) + + var parser expfmt.TextParser + families, err := parser.TextToMetricFamilies(reader) + + if err != nil { + klog.Error(err.Error()) + + return nil + } + + for name, family := range families { + klog.V(4).Info("parsing family: " + name) + + if name != "xpum_topology_link" { + klog.V(5).Info("... skipping") + + continue + } + + for _, metric := range family.Metric { + value := -1.0 + + klog.V(5).Info(metric) + + if metric.Gauge != nil { + klog.V(5).Info("metric is of type gauge") + + value = *metric.Gauge.Value + } else if metric.Untyped != nil { + klog.V(5).Info("metric is of type untyped") + value = *metric.Untyped.Value + } else { + klog.Warningf("Unknown/unsupported metric type: %v", metric) + } + + if value != pureXeLinkMetricValue { + klog.V(5).Info("... not xelink") + + continue + } + + cell, err := processMetricsLabels(metric.Label) + if err == nil { + klog.V(5).Info("topology entry: ", cell) + topologyInfos = append(topologyInfos, cell) + } + } + } + + klog.V(5).Info("topology entries: ", len(topologyInfos)) + + return topologyInfos +} + +func (xms *xpuManagerSidecar) iterate() { + metricsData := xms.getMetricsData() + topologyInfos := xms.GetTopologyFromXPUMMetrics(metricsData) + + labels := xms.createLabels(topologyInfos) + + if !xms.compareLabels(labels) { + xms.writeLabels(labels) + } else { + klog.V(2).Info("labels have not changed") + } +} + +// TODO: Move this function under internal/pluginutils. +func (xms *xpuManagerSidecar) writeLabels(labels []string) { + root, err := os.MkdirTemp(xms.tmpDirPrefix, "xpumsidecar") + if err != nil { + klog.Errorf("can't create temporary directory: %+v", err) + + return + } + + defer os.RemoveAll(root) + + filePath := path.Join(root, "xpum-sidecar-labels.txt") + + file, err := os.OpenFile(filePath, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + + if err != nil { + klog.Error(err.Error()) + } + + writer := bufio.NewWriter(file) + + for _, label := range labels { + _, _ = writer.WriteString(label + "\n") + } + + writer.Flush() + file.Close() + + // move tmp file to dst file + err = os.Rename(filePath, xms.dstFilePath) + if err != nil { + klog.Errorf("Failed to rename tmp file to dst file: %+v", err) + + return + } + + for _, label := range labels { + klog.V(2).Infof("%v\n", label) + } +} + +// compareLabels returns true, if the labels at dstFilePath are equal to given labels. +func (xms *xpuManagerSidecar) compareLabels(labels []string) bool { + file, err := os.OpenFile(xms.dstFilePath, os.O_RDONLY, 0644) + if err != nil { + return false + } + + fileLabels := []string{} + scanner := bufio.NewScanner(file) + + for scanner.Scan() { + fileLabels = append(fileLabels, scanner.Text()) + } + + return reflect.DeepEqual(fileLabels, labels) +} + +func createInvalidTopologyCell() xpuManagerTopologyMatrixCell { + cell := xpuManagerTopologyMatrixCell{} + + cell.LaneCount = -1 + cell.LocalDeviceID = -1 + cell.LocalSubdeviceID = -1 + cell.RemoteDeviceID = -1 + cell.RemoteSubdeviceID = -1 + + return cell +} + +func isValidTopologyCell(cell *xpuManagerTopologyMatrixCell) bool { + return (cell.LaneCount >= 0 && cell.LocalDeviceID >= 0 && + cell.LocalSubdeviceID >= 0 && cell.RemoteDeviceID >= 0 && + cell.RemoteSubdeviceID >= 0) +} + +func (xms *xpuManagerSidecar) createLabels(topologyInfos []xpuManagerTopologyMatrixCell) []string { + links := "" + separator := "" + + submitted := map[string]int{} + + cellToString := func(cell xpuManagerTopologyMatrixCell) string { + if cell.LocalDeviceID < cell.RemoteDeviceID { + return strconv.Itoa(cell.LocalDeviceID) + "." + strconv.Itoa(cell.LocalSubdeviceID) + "-" + + strconv.Itoa(cell.RemoteDeviceID) + "." + strconv.Itoa(cell.RemoteSubdeviceID) + } + + return strconv.Itoa(cell.RemoteDeviceID) + "." + strconv.Itoa(cell.RemoteSubdeviceID) + "-" + + strconv.Itoa(cell.LocalDeviceID) + "." + strconv.Itoa(cell.LocalSubdeviceID) + } + + for _, ti := range topologyInfos { + if ti.LaneCount < int(xms.laneCount) { + continue + } + + linkString := cellToString(ti) + + count, found := submitted[linkString] + if !found { + links += separator + linkString + separator = "_" + } + + count++ + + if count > 2 { + klog.Warningf("Duplicate links found for: %s (lane count: %d)", linkString, ti.LaneCount) + } + + submitted[linkString] = count + } + + splitLinks := pluginutils.SplitAtLastAlphaNum(links, labelMaxLength, labelControlChar) + + labels := []string{} + + if len(splitLinks) == 0 { + return labels + } + + labels = append(labels, xms.labelNamespace+"/"+xeLinkLabelName+"="+splitLinks[0]) + for i := 1; i < len(splitLinks); i++ { + labels = append(labels, xms.labelNamespace+"/"+xeLinkLabelName+strconv.FormatInt(int64(i+1), 10)+"="+splitLinks[i]) + } + + return labels +} + +func createXPUManagerSidecar() *xpuManagerSidecar { + xms := xpuManagerSidecar{} + + xms.getMetricsData = xms.getMetricsDataFromXPUM + + return &xms +} + +func main() { + xms := createXPUManagerSidecar() + + flag.Uint64Var(&xms.interval, "interval", 10, "interval for topology fetching and label writing (seconds, >= 1)") + flag.Uint64Var(&xms.startDelay, "startup-delay", 10, "startup delay for first topology fetching and label writing (seconds, >= 0)") + flag.Uint64Var(&xms.xpumPort, "xpum-port", 29999, "xpumanager port number") + flag.StringVar(&xms.tmpDirPrefix, "tmp-dir-prefix", "/etc/kubernetes/node-feature-discovery/features.d/", "location prefix for a temporary directory (used to store in-flight label file)") + flag.StringVar(&xms.dstFilePath, "dst-file-path", "/etc/kubernetes/node-feature-discovery/features.d/xpum-sidecar-labels.txt", "label file destination") + flag.Uint64Var(&xms.laneCount, "lane-count", 4, "minimum lane count for xelink") + flag.StringVar(&xms.labelNamespace, "label-namespace", "gpu.intel.com", "namespace for the labels") + klog.InitFlags(nil) + + flag.Parse() + + if xms.interval == 0 { + klog.Fatal("zero interval won't work, set it to at least 1") + } + + xms.url = fmt.Sprintf("http://127.0.0.1:%d/metrics", xms.xpumPort) + + keepIterating := true + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + signal.Notify(c, syscall.SIGTERM) + + time.Sleep(time.Duration(xms.startDelay) * time.Second) + + for keepIterating { + xms.iterate() + + timeout := time.After(time.Duration(xms.interval) * time.Second) + + select { + case <-timeout: + continue + case <-c: + klog.V(2).Info("Interrupt received") + + keepIterating = false + } + } + + klog.V(2).Info("Removing label file") + + err := os.Remove(xms.dstFilePath) + if err != nil { + klog.Errorf("Failed to cleanup label file: %+v", err) + } + + klog.V(2).Info("Stopping sidecar") +} diff --git a/cmd/xpumanager_sidecar/main_test.go b/cmd/xpumanager_sidecar/main_test.go new file mode 100644 index 00000000..6c8785d1 --- /dev/null +++ b/cmd/xpumanager_sidecar/main_test.go @@ -0,0 +1,243 @@ +// Copyright 2022 Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "os" + "path/filepath" + "reflect" + "strings" + "testing" +) + +type testCase struct { + name string + metricsData []string + expectedLabels []string + minLaneCount int +} + +func createTestCases() []testCase { + return []testCase{ + { + name: "Garbage metrics", + minLaneCount: 4, + metricsData: []string{ + `xpum_some_other_data{with_some_label]]]]}`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links="}, + }, + { + name: "No xelinks reported", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_some_other_data{with_some_label="foo"} 42`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 0`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 4`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links="}, + }, + { + name: "Xelinks not on sub devices", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0"} 1`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links="}, + }, + { + name: "Xelinks without lan counts", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 1.0`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links="}, + }, + { + name: "One xelink", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1.0`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0"}, + }, + { + name: "One xelink with non xelink", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{local_device_id="99",local_on_subdevice="false",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="0"} 0`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1.0`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0"}, + }, + { + name: "Cross linked subdevs", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="0",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + `xpum_topology_link{dev_file="card1",dev_name="Intel(R) Graphics [0x0bdb]",pci_bdf="0000:51:00.0",pci_dev="0xbdb",src="direct",uuid="01000000-0000-0000-0000-000000510000",vendor="Intel(R) Corporation",local_cpu_affinity="0-23,48-71",local_device_id="1",local_numa_index="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="4"} 1`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.1_0.1-1.0"}, + }, + { + name: "One to many", + minLaneCount: 4, + metricsData: []string{ + `xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="1",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="2", lane_count="4"} 1`, + `xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="4"} 1`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-1.0_0.0-2.2_0.0-3.0"}, + }, + { + name: "Many to many", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="1",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="3",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="0",remote_subdevice_id="1", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="2",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=0.0-2.0_1.0-3.0_0.1-3.1_1.1-2.1"}, + }, + { + name: "Too few lanes", + minLaneCount: 8, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="1",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="3",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="2",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + "", + }, + expectedLabels: []string{"xpumanager.intel.com/xe-links=1.0-3.0_0.1-3.1"}, + }, + { + name: "Multi line label", + minLaneCount: 4, + metricsData: []string{ + `# HELP xpum_topology_link Connection type fo two GPU tiles`, + `# TYPE xpum_topology_link gauge`, + `xpum_topology_link{local_device_id="0",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="1",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="3",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="2",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + + `xpum_topology_link{local_device_id="4",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="4",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="5",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="5",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + + `xpum_topology_link{local_device_id="6",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="6",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="7",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="7",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + + `xpum_topology_link{local_device_id="8",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="2",remote_subdevice_id="0", lan_count="4"} 1`, + `xpum_topology_link{local_device_id="8",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="3",remote_subdevice_id="0", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="9",local_on_subdevice="true",local_subdevice_id="0",remote_device_id="0",remote_subdevice_id="1", lan_count="8"} 1`, + `xpum_topology_link{local_device_id="9",local_on_subdevice="true",local_subdevice_id="1",remote_device_id="1",remote_subdevice_id="1", lan_count="4"} 1`, + + "", + }, + expectedLabels: []string{ + "xpumanager.intel.com/xe-links=0.0-2.0_1.0-3.0_0.1-3.1_1.1-2.1_2.0-4.0_3.0-4.1_0.1-5.0_1.1-5.1", + "xpumanager.intel.com/xe-links2=Z_2.0-6.0_3.0-6.1_0.1-7.0_1.1-7.1_2.0-8.0_3.0-8.1_0.1-9.0_1.1-9", + "xpumanager.intel.com/xe-links3=Z.1", + }, + }, + } +} + +func (tc *testCase) createFakeXMS(data []string, minLaneCount int) *xpuManagerSidecar { + bytes := []byte(strings.Join(data, "\n")) + + metricsGetter := func() []byte { + return bytes + } + + xms := createXPUManagerSidecar() + xms.getMetricsData = metricsGetter + xms.laneCount = uint64(minLaneCount) + xms.labelNamespace = "xpumanager.intel.com" + + return xms +} + +func TestLabeling(t *testing.T) { + tcs := createTestCases() + + for _, tc := range tcs { + print("Testcase (labeling): ", tc.name, "\n") + xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount) + topologyInfos := xms.GetTopologyFromXPUMMetrics([]byte(strings.Join(tc.metricsData, "\n"))) + + labels := xms.createLabels(topologyInfos) + if !reflect.DeepEqual(labels, tc.expectedLabels) { + t.Errorf("got %v expected %v\n", labels, tc.expectedLabels) + } + } +} + +func TestIterate(t *testing.T) { + tcs := createTestCases() + + for _, tc := range tcs { + print("Testcase (iterate): ", tc.name, "\n") + xms := tc.createFakeXMS(tc.metricsData, tc.minLaneCount) + + root, err := os.MkdirTemp("", "test_new_xms") + if err != nil { + t.Fatalf("can't create temporary directory: %+v", err) + } + // dirs/files need to be removed for the next test + defer os.RemoveAll(root) + + xms.tmpDirPrefix = root + xms.dstFilePath = filepath.Join(root, "labels.txt") + + xms.iterate() + + if !xms.compareLabels(tc.expectedLabels) { + t.Errorf("output file didn't have expected labels\n") + } + } +} diff --git a/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml b/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml new file mode 100644 index 00000000..69acf589 --- /dev/null +++ b/deployments/xpumanager_sidecar/kustom/kustom_xpumanager.yaml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: intel-xpumanager + name: intel-xpumanager +spec: + template: + spec: + volumes: + - name: features-d + hostPath: + path: "/etc/kubernetes/node-feature-discovery/features.d/" + containers: + - name: xelink-sidecar + image: intel/intel-xpumanager-sidecar:devel + imagePullPolicy: Always + args: + - -v=2 + volumeMounts: + - name: features-d + mountPath: "/etc/kubernetes/node-feature-discovery/features.d/" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 0 + - name: xpumd + resources: + limits: + $patch: replace + gpu.intel.com/i915_monitoring: 1 diff --git a/deployments/xpumanager_sidecar/kustomization.yaml b/deployments/xpumanager_sidecar/kustomization.yaml new file mode 100644 index 00000000..7e152fa9 --- /dev/null +++ b/deployments/xpumanager_sidecar/kustomization.yaml @@ -0,0 +1,6 @@ +resources: +# XeLink topology information is only available from >= 1.x.y release +- https://raw.githubusercontent.com/intel/xpumanager/v1.2.0_golden/deployment/kubernetes/daemonset-intel-xpum.yaml +namespace: monitoring +patchesStrategicMerge: +- kustom/kustom_xpumanager.yaml diff --git a/go.mod b/go.mod index b0d4334f..79b2c840 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,8 @@ require ( github.com/onsi/ginkgo/v2 v2.6.0 github.com/onsi/gomega v1.24.1 github.com/pkg/errors v0.9.1 + github.com/prometheus/client_model v0.3.0 + github.com/prometheus/common v0.37.0 golang.org/x/sys v0.4.0 golang.org/x/text v0.6.0 google.golang.org/grpc v1.51.0 @@ -63,8 +65,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/prometheus/client_golang v1.14.0 // indirect - github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.37.0 // indirect github.com/prometheus/procfs v0.8.0 // indirect github.com/spf13/cobra v1.6.0 // indirect github.com/spf13/pflag v1.0.5 // indirect