mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
502 lines
13 KiB
Go
502 lines
13 KiB
Go
// Copyright 2020-2023 Intel Corporation. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package labeler
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"os/signal"
|
|
"path"
|
|
"path/filepath"
|
|
"reflect"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
|
|
"github.com/pkg/errors"
|
|
"k8s.io/klog/v2"
|
|
)
|
|
|
|
const (
	// labelNamespace is prepended to every label name written by the labeler.
	labelNamespace = "gpu.intel.com/"

	// Label names, without the namespace prefix.
	gpuListLabelName    = "cards"
	gpuNumListLabelName = "gpu-numbers"
	millicoreLabelName  = "millicores"
	pciGroupLabelName   = "pci-groups"
	tilesLabelName      = "tiles"
	numaMappingName     = "numa-gpu-map"

	// millicoresPerGPU is the millicore amount accounted for each detected GPU.
	millicoresPerGPU = 1000

	// Environment variables consulted by the labeler. Each one is parsed as
	// an unsigned base-10 number; unset or unparsable values count as 0.
	memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
	memoryReservedEnv = "GPU_MEMORY_RESERVED"
	pciGroupingEnv    = "GPU_PCI_GROUPING_LEVEL"

	// Patterns for entry names in the sysfs DRM directory.
	gpuDeviceRE     = `^card[0-9]+$`
	controlDeviceRE = `^controlD[0-9]+$`

	// vendorString is the "device/vendor" content a card must have to be labeled.
	vendorString = "0x8086"

	// labelMaxLength and labelControlChar are handed to
	// pluginutils.SplitAtLastAlphaNum when a label value is split into chunks.
	labelMaxLength   = 63
	labelControlChar = "Z"
)
|
|
|
|
// labelMap maps a label name to its value.
type labelMap map[string]string
|
|
|
|
type labeler struct {
|
|
gpuDeviceReg *regexp.Regexp
|
|
controlDeviceReg *regexp.Regexp
|
|
labels labelMap
|
|
|
|
sysfsDRMDir string
|
|
labelsChanged bool
|
|
}
|
|
|
|
func newLabeler(sysfsDRMDir string) *labeler {
|
|
return &labeler{
|
|
sysfsDRMDir: sysfsDRMDir,
|
|
gpuDeviceReg: regexp.MustCompile(gpuDeviceRE),
|
|
controlDeviceReg: regexp.MustCompile(controlDeviceRE),
|
|
labels: labelMap{},
|
|
labelsChanged: true,
|
|
}
|
|
}
|
|
|
|
// getPCIPathParts extracts numFolders consecutive path elements from fullPath,
// beginning at the first element whose name starts with "pci", and returns
// them joined with "/".
//
// An empty string is returned when fullPath contains no "/", when numFolders
// is 0, when no element starts with "pci", or when fewer than numFolders
// elements are available from the "pci" element onwards.
func getPCIPathParts(numFolders uint64, fullPath string) string {
	elements := strings.Split(fullPath, "/")
	if len(elements) == 1 || numFolders == 0 {
		return ""
	}

	// locate the first element with the "pci" prefix
	start := -1

	for i, element := range elements {
		if strings.HasPrefix(element, "pci") {
			start = i

			break
		}
	}

	if start < 0 {
		return ""
	}

	// compare in uint64 so that a huge numFolders can't overflow an index
	if numFolders > uint64(len(elements)-start) {
		return ""
	}

	return strings.Join(elements[start:start+int(numFolders)], "/")
}
|
|
|
|
func (l *labeler) scan() ([]string, error) {
|
|
files, err := os.ReadDir(l.sysfsDRMDir)
|
|
gpuNameList := []string{}
|
|
|
|
if err != nil {
|
|
return gpuNameList, errors.Wrap(err, "Can't read sysfs folder")
|
|
}
|
|
|
|
for _, f := range files {
|
|
if !l.gpuDeviceReg.MatchString(f.Name()) {
|
|
klog.V(4).Info("Not compatible device", f.Name())
|
|
continue
|
|
}
|
|
|
|
dat, err := os.ReadFile(path.Join(l.sysfsDRMDir, f.Name(), "device/vendor"))
|
|
if err != nil {
|
|
klog.Warning("Skipping. Can't read vendor file: ", err)
|
|
continue
|
|
}
|
|
|
|
if strings.TrimSpace(string(dat)) != vendorString {
|
|
klog.V(4).Info("Non-Intel GPU", f.Name())
|
|
continue
|
|
}
|
|
|
|
if pluginutils.IsSriovPFwithVFs(path.Join(l.sysfsDRMDir, f.Name())) {
|
|
klog.V(4).Infof("Skipping PF with VF")
|
|
continue
|
|
}
|
|
|
|
_, err = os.ReadDir(path.Join(l.sysfsDRMDir, f.Name(), "device/drm"))
|
|
if err != nil {
|
|
return gpuNameList, errors.Wrap(err, "Can't read device folder")
|
|
}
|
|
|
|
gpuNameList = append(gpuNameList, f.Name())
|
|
}
|
|
|
|
return gpuNameList, nil
|
|
}
|
|
|
|
// getEnvVarNumber returns the value of the environment variable envVarName
// parsed as an unsigned base-10 integer. It returns 0 when the variable is
// unset, empty, or can't be parsed.
func getEnvVarNumber(envVarName string) uint64 {
	raw := os.Getenv(envVarName)
	if raw == "" {
		return 0
	}

	number, err := strconv.ParseUint(raw, 10, 64)
	if err != nil {
		return 0
	}

	return number
}
|
|
|
|
func fallback() uint64 {
|
|
return getEnvVarNumber(memoryOverrideEnv)
|
|
}
|
|
|
|
func GetMemoryAmount(sysfsDrmDir, gpuName string, numTiles uint64) uint64 {
|
|
reserved := getEnvVarNumber(memoryReservedEnv)
|
|
|
|
filePath := filepath.Join(sysfsDrmDir, gpuName, "lmem_total_bytes")
|
|
|
|
dat, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
klog.Warning("Can't read file: ", err)
|
|
return fallback()
|
|
}
|
|
|
|
totalPerTile, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
|
|
if err != nil {
|
|
klog.Warning("Can't convert lmem_total_bytes: ", err)
|
|
return fallback()
|
|
}
|
|
|
|
return totalPerTile*numTiles - reserved
|
|
}
|
|
|
|
// GetTileCount returns the number of paths matching
// <sysfsDrmDir>/<gpuName>/gt/gt*, or 1 when nothing matches.
func GetTileCount(sysfsDrmDir, gpuName string) (numTiles uint64) {
	// a glob error is treated the same way as having no matches
	matches, _ := filepath.Glob(filepath.Join(sysfsDrmDir, gpuName, "gt/gt*"))

	numTiles = uint64(len(matches))
	if numTiles == 0 {
		numTiles = 1
	}

	return numTiles
}
|
|
|
|
// GetNumaNode reads the cards numa node.
|
|
func GetNumaNode(sysfsDrmDir, gpuName string) int {
|
|
filePath := filepath.Join(sysfsDrmDir, gpuName, "device/numa_node")
|
|
|
|
data, err := os.ReadFile(filePath)
|
|
if err != nil {
|
|
klog.Warning("Can't read file: ", err)
|
|
return -1
|
|
}
|
|
|
|
numa, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 32)
|
|
if err != nil {
|
|
klog.Warning("Can't convert numa_node: ", err)
|
|
return -1
|
|
}
|
|
|
|
if numa > math.MaxInt16 {
|
|
klog.Warning("Too large numa: ", numa)
|
|
|
|
return -1
|
|
}
|
|
|
|
return int(numa)
|
|
}
|
|
|
|
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
|
func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
|
|
value := int64(0)
|
|
if numstr, ok := lm[labelName]; ok {
|
|
_, _ = fmt.Sscanf(numstr, "%d", &value)
|
|
}
|
|
|
|
value += valueToAdd
|
|
lm[labelName] = strconv.FormatInt(value, 10)
|
|
}
|
|
|
|
// this returns pci groups label value, groups separated by "_", gpus separated by ".".
|
|
// Example for two groups with 4 gpus: "0.1.2.3_4.5.6.7".
|
|
func (l *labeler) createPCIGroupLabel(gpuNumList []string) string {
|
|
pciGroups := map[string][]string{}
|
|
|
|
pciGroupLevel := getEnvVarNumber(pciGroupingEnv)
|
|
if pciGroupLevel == 0 {
|
|
return ""
|
|
}
|
|
|
|
for _, gpuNum := range gpuNumList {
|
|
symLinkTarget, err := filepath.EvalSymlinks(path.Join(l.sysfsDRMDir, "card"+gpuNum))
|
|
|
|
if err == nil {
|
|
if pathPart := getPCIPathParts(pciGroupLevel, symLinkTarget); pathPart != "" {
|
|
pciGroups[pathPart] = append(pciGroups[pathPart], gpuNum)
|
|
}
|
|
}
|
|
}
|
|
|
|
labelValue := ""
|
|
separator := ""
|
|
|
|
// process in stable order by sorting
|
|
keys := []string{}
|
|
for key := range pciGroups {
|
|
keys = append(keys, key)
|
|
}
|
|
|
|
sort.Strings(keys)
|
|
|
|
for _, key := range keys {
|
|
labelValue = labelValue + separator + strings.Join(pciGroups[key], ".")
|
|
separator = "_"
|
|
}
|
|
|
|
return labelValue
|
|
}
|
|
|
|
// createLabels is the main function of plugin labeler, it creates label-value pairs for the gpus.
|
|
func (l *labeler) createLabels() error {
|
|
prevLabels := l.labels
|
|
|
|
l.labels = labelMap{}
|
|
|
|
gpuNameList, err := l.scan()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
gpuNumList := []string{}
|
|
tileCount := 0
|
|
|
|
numaMapping := make(map[int][]string)
|
|
|
|
for _, gpuName := range gpuNameList {
|
|
gpuNum := ""
|
|
// extract gpu number as a string. scan() has already checked name syntax
|
|
_, err = fmt.Sscanf(gpuName, "card%s", &gpuNum)
|
|
if err != nil {
|
|
return errors.Wrap(err, "gpu name parsing error")
|
|
}
|
|
|
|
numTiles := GetTileCount(l.sysfsDRMDir, gpuName)
|
|
tileCount += int(numTiles)
|
|
|
|
memoryAmount := GetMemoryAmount(l.sysfsDRMDir, gpuName, numTiles)
|
|
gpuNumList = append(gpuNumList, gpuName[4:])
|
|
|
|
// get numa node of the GPU
|
|
numaNode := GetNumaNode(l.sysfsDRMDir, gpuName)
|
|
|
|
if numaNode >= 0 {
|
|
// and store the gpu under that node id
|
|
numaList := numaMapping[numaNode]
|
|
numaList = append(numaList, gpuNum)
|
|
|
|
numaMapping[numaNode] = numaList
|
|
}
|
|
|
|
if memoryAmount < math.MaxInt64 {
|
|
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(memoryAmount))
|
|
}
|
|
}
|
|
|
|
gpuCount := len(gpuNumList)
|
|
|
|
l.labels.addNumericLabel(labelNamespace+tilesLabelName, int64(tileCount))
|
|
|
|
if gpuCount > 0 {
|
|
// add gpu list label (example: "card0.card1.card2") - deprecated
|
|
l.labels[labelNamespace+gpuListLabelName] = pluginutils.SplitAtLastAlphaNum(
|
|
strings.Join(gpuNameList, "."), labelMaxLength, labelControlChar)[0]
|
|
|
|
// add gpu num list label(s) (example: "0.1.2", which is short form of "card0.card1.card2")
|
|
allGPUs := strings.Join(gpuNumList, ".")
|
|
gpuNumLists := pluginutils.SplitAtLastAlphaNum(allGPUs, labelMaxLength, labelControlChar)
|
|
|
|
l.labels[labelNamespace+gpuNumListLabelName] = gpuNumLists[0]
|
|
for i := 1; i < len(gpuNumLists); i++ {
|
|
l.labels[labelNamespace+gpuNumListLabelName+strconv.FormatInt(int64(i+1), 10)] = gpuNumLists[i]
|
|
}
|
|
|
|
if len(numaMapping) > 0 {
|
|
// add numa node mapping to labels: gpu.intel.com/numa-gpu-map="0-0.1.2.3_1-4.5.6.7"
|
|
numaMappingLabel := createNumaNodeMappingLabel(numaMapping)
|
|
|
|
numaMappingLabelList := pluginutils.SplitAtLastAlphaNum(numaMappingLabel, labelMaxLength, labelControlChar)
|
|
|
|
l.labels[labelNamespace+numaMappingName] = numaMappingLabelList[0]
|
|
for i := 1; i < len(numaMappingLabelList); i++ {
|
|
l.labels[labelNamespace+numaMappingName+strconv.FormatInt(int64(i+1), 10)] = numaMappingLabelList[i]
|
|
}
|
|
}
|
|
|
|
// all GPUs get default number of millicores (1000)
|
|
l.labels.addNumericLabel(labelNamespace+millicoreLabelName, int64(millicoresPerGPU*gpuCount))
|
|
|
|
// aa pci-group label(s), (two group example: "1.2.3.4_5.6.7.8")
|
|
allPCIGroups := l.createPCIGroupLabel(gpuNumList)
|
|
if allPCIGroups != "" {
|
|
pciGroups := pluginutils.SplitAtLastAlphaNum(allPCIGroups, labelMaxLength, labelControlChar)
|
|
|
|
l.labels[labelNamespace+pciGroupLabelName] = pciGroups[0]
|
|
for i := 1; i < len(gpuNumLists); i++ {
|
|
l.labels[labelNamespace+pciGroupLabelName+strconv.FormatInt(int64(i+1), 10)] = pciGroups[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
l.labelsChanged = !reflect.DeepEqual(prevLabels, l.labels)
|
|
|
|
return nil
|
|
}
|
|
|
|
// createNumaNodeMappingLabel formats a numa node -> GPU list mapping as a
// label value. Every node becomes "<node>-<gpu>.<gpu>...", and the nodes are
// joined with "_" in ascending node order, e.g. "0-0.1.2.3_1-4.5.6.7".
func createNumaNodeMappingLabel(mapping map[int][]string) string {
	nodeIDs := make([]int, 0, len(mapping))
	for nodeID := range mapping {
		nodeIDs = append(nodeIDs, nodeID)
	}

	sort.Ints(nodeIDs)

	entries := make([]string, len(nodeIDs))
	for i, nodeID := range nodeIDs {
		entries[i] = strconv.Itoa(nodeID) + "-" + strings.Join(mapping[nodeID], ".")
	}

	return strings.Join(entries, "_")
}
|
|
|
|
func (l *labeler) atomicPrintLabelsToFile(labelFile string) error {
|
|
baseDir := filepath.Dir(labelFile)
|
|
|
|
// TODO: Use NFD's "hidden file" feature when it becomes available.
|
|
d, err := os.MkdirTemp(baseDir, "labels")
|
|
if err != nil {
|
|
klog.Warning("could not create temporary directory, writing directly to destination")
|
|
|
|
return l.printLabelsToFile(labelFile)
|
|
}
|
|
|
|
defer os.RemoveAll(d)
|
|
|
|
tmpFile := filepath.Join(d, "labels.txt")
|
|
|
|
if err := l.printLabelsToFile(tmpFile); err != nil {
|
|
return err
|
|
}
|
|
|
|
return os.Rename(tmpFile, labelFile)
|
|
}
|
|
|
|
func (l *labeler) printLabelsToFile(labelFile string) error {
|
|
f, err := os.OpenFile(labelFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to open file (%s): %w", labelFile, err)
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
for key, val := range l.labels {
|
|
if _, err := f.WriteString(key + "=" + val + "\n"); err != nil {
|
|
return fmt.Errorf("failed to write label (%s=%s) to file: %w", key, val, err)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func CreateAndPrintLabels(sysfsDRMDir string) {
|
|
l := newLabeler(sysfsDRMDir)
|
|
|
|
if err := l.createLabels(); err != nil {
|
|
klog.Warningf("failed to create labels: %+v", err)
|
|
|
|
return
|
|
}
|
|
|
|
for key, val := range l.labels {
|
|
fmt.Println(key + "=" + val)
|
|
}
|
|
}
|
|
|
|
// Gathers node's GPU labels on channel trigger or timeout, and write them to a file.
|
|
// The created label file is deleted on exit (process dying).
|
|
func Run(sysfsDrmDir, nfdFeatureFile string, updateInterval time.Duration, scanResources chan bool) {
|
|
l := newLabeler(sysfsDrmDir)
|
|
|
|
interruptChan := make(chan os.Signal, 1)
|
|
signal.Notify(interruptChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP, syscall.SIGQUIT)
|
|
|
|
klog.V(1).Info("Starting GPU labeler")
|
|
|
|
Loop:
|
|
for {
|
|
timeout := time.After(updateInterval)
|
|
|
|
select {
|
|
case <-timeout:
|
|
case <-scanResources:
|
|
case interrupt := <-interruptChan:
|
|
klog.V(2).Infof("Interrupt %d received", interrupt)
|
|
|
|
break Loop
|
|
}
|
|
|
|
klog.V(1).Info("Ext resources scanning")
|
|
|
|
err := l.createLabels()
|
|
if err != nil {
|
|
klog.Warningf("label creation failed: %+v", err)
|
|
|
|
continue
|
|
}
|
|
|
|
if l.labelsChanged {
|
|
klog.V(1).Info("Writing labels")
|
|
|
|
if err := l.atomicPrintLabelsToFile(nfdFeatureFile); err != nil {
|
|
klog.Warningf("failed to write labels to file: %+v", err)
|
|
|
|
// Reset labels so that next time the labeler runs the writing is retried.
|
|
l.labels = labelMap{}
|
|
}
|
|
}
|
|
}
|
|
|
|
signal.Stop(interruptChan)
|
|
|
|
klog.V(2).Info("Removing label file")
|
|
|
|
err := os.Remove(nfdFeatureFile)
|
|
if err != nil {
|
|
klog.Errorf("Failed to cleanup label file: %+v", err)
|
|
}
|
|
|
|
klog.V(1).Info("Stopping GPU labeler")
|
|
|
|
// Close the whole application
|
|
os.Exit(0)
|
|
}
|