mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00

This corresponds to the previous gpu-plugin skip code. Signed-off-by: Ukri Niemimuukko <ukri.niemimuukko@intel.com>
246 lines
6.8 KiB
Go
246 lines
6.8 KiB
Go
// Copyright 2020 Intel Corporation. All Rights Reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/pluginutils"
|
|
"github.com/pkg/errors"
|
|
"k8s.io/klog/v2"
|
|
)
|
|
|
|
const (
|
|
labelNamespace = "gpu.intel.com/"
|
|
gpuListLabelName = "cards"
|
|
millicoreLabelName = "millicores"
|
|
millicoresPerGPU = 1000
|
|
memoryOverrideEnv = "GPU_MEMORY_OVERRIDE"
|
|
memoryReservedEnv = "GPU_MEMORY_RESERVED"
|
|
gpuDeviceRE = `^card[0-9]+$`
|
|
controlDeviceRE = `^controlD[0-9]+$`
|
|
vendorString = "0x8086"
|
|
)
|
|
|
|
type labelMap map[string]string
|
|
|
|
type labeler struct {
|
|
sysfsDRMDir string
|
|
debugfsDRIDir string
|
|
|
|
gpuDeviceReg *regexp.Regexp
|
|
controlDeviceReg *regexp.Regexp
|
|
labels labelMap
|
|
}
|
|
|
|
func newLabeler(sysfsDRMDir, debugfsDRIDir string) *labeler {
|
|
return &labeler{
|
|
sysfsDRMDir: sysfsDRMDir,
|
|
debugfsDRIDir: debugfsDRIDir,
|
|
gpuDeviceReg: regexp.MustCompile(gpuDeviceRE),
|
|
controlDeviceReg: regexp.MustCompile(controlDeviceRE),
|
|
labels: labelMap{},
|
|
}
|
|
}
|
|
|
|
func (l *labeler) scan() ([]string, error) {
|
|
files, err := os.ReadDir(l.sysfsDRMDir)
|
|
gpuNameList := []string{}
|
|
|
|
if err != nil {
|
|
return gpuNameList, errors.Wrap(err, "Can't read sysfs folder")
|
|
}
|
|
|
|
for _, f := range files {
|
|
if !l.gpuDeviceReg.MatchString(f.Name()) {
|
|
klog.V(4).Info("Not compatible device", f.Name())
|
|
continue
|
|
}
|
|
|
|
dat, err := os.ReadFile(path.Join(l.sysfsDRMDir, f.Name(), "device/vendor"))
|
|
if err != nil {
|
|
klog.Warning("Skipping. Can't read vendor file: ", err)
|
|
continue
|
|
}
|
|
|
|
if strings.TrimSpace(string(dat)) != vendorString {
|
|
klog.V(4).Info("Non-Intel GPU", f.Name())
|
|
continue
|
|
}
|
|
|
|
if pluginutils.IsSriovPFwithVFs(path.Join(l.sysfsDRMDir, f.Name())) {
|
|
klog.V(4).Infof("Skipping PF with VF")
|
|
continue
|
|
}
|
|
|
|
_, err = os.ReadDir(path.Join(l.sysfsDRMDir, f.Name(), "device/drm"))
|
|
if err != nil {
|
|
return gpuNameList, errors.Wrap(err, "Can't read device folder")
|
|
}
|
|
|
|
gpuNameList = append(gpuNameList, f.Name())
|
|
}
|
|
|
|
return gpuNameList, nil
|
|
}
|
|
|
|
func getEnvVarNumber(envVarName string) uint64 {
|
|
envValue := os.Getenv(envVarName)
|
|
if envValue != "" {
|
|
val, err := strconv.ParseUint(envValue, 10, 64)
|
|
if err == nil {
|
|
return val
|
|
}
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func fallback() uint64 {
|
|
return getEnvVarNumber(memoryOverrideEnv)
|
|
}
|
|
|
|
// getTileMemoryAmount reads the total GPU memory amount from the GPU tiles and returns it and the tile count.
|
|
func (l *labeler) getTileMemoryAmount(gpuName string) (mem, numTiles uint64) {
|
|
reserved := getEnvVarNumber(memoryReservedEnv)
|
|
filePath := filepath.Join(l.sysfsDRMDir, gpuName, "gt/gt*/addr_range")
|
|
|
|
files, err := filepath.Glob(filePath)
|
|
if err != nil {
|
|
klog.V(4).Info("Can't read sysfs folder", err)
|
|
return fallback(), 1
|
|
}
|
|
|
|
for _, fileName := range files {
|
|
dat, err := os.ReadFile(fileName)
|
|
if err != nil {
|
|
klog.Warning("Skipping. Can't read file: ", err)
|
|
continue
|
|
}
|
|
|
|
n, err := strconv.ParseUint(strings.TrimSpace(string(dat)), 0, 64)
|
|
if err != nil {
|
|
klog.Warning("Skipping. Can't convert addr_range: ", err)
|
|
continue
|
|
}
|
|
|
|
numTiles++
|
|
mem += n
|
|
}
|
|
|
|
if mem == 0 {
|
|
return fallback(), 1
|
|
}
|
|
|
|
return mem - reserved, numTiles
|
|
}
|
|
|
|
// addNumericLabel creates a new label if one doesn't exist. Else the new value is added to the previous value.
|
|
func (lm labelMap) addNumericLabel(labelName string, valueToAdd int64) {
|
|
value := int64(0)
|
|
if numstr, ok := lm[labelName]; ok {
|
|
_, _ = fmt.Sscanf(numstr, "%d", &value)
|
|
}
|
|
value += valueToAdd
|
|
lm[labelName] = strconv.FormatInt(value, 10)
|
|
}
|
|
|
|
// createCapabilityLabels creates labels from the gpu capability file under debugfs.
|
|
func (l *labeler) createCapabilityLabels(cardNum string, numTiles uint64) {
|
|
// try to read the capabilities from the i915_capabilities file
|
|
file, err := os.Open(filepath.Join(l.debugfsDRIDir, cardNum, "i915_capabilities"))
|
|
if err != nil {
|
|
klog.V(3).Infof("Couldn't open file:%s", err.Error()) // debugfs is not stable, there is no need to spam with error level prints
|
|
return
|
|
}
|
|
defer file.Close()
|
|
|
|
// define strings to search from the file, and the actions to take in order to create labels from those strings (as funcs)
|
|
searchStringActionMap := map[string]func(string){
|
|
"platform: ": func(platformName string) {
|
|
l.labels.addNumericLabel(labelNamespace+"platform_"+platformName+".count", 1)
|
|
l.labels[labelNamespace+"platform_"+platformName+".tiles"] = strconv.FormatInt(int64(numTiles), 10)
|
|
l.labels[labelNamespace+"platform_"+platformName+".present"] = "true"
|
|
},
|
|
"gen: ": func(genName string) {
|
|
l.labels[labelNamespace+"platform_gen"] = genName
|
|
},
|
|
}
|
|
|
|
// Finally, read the file, and try to find the matches. Perform actions and reduce the search map size as we proceed. Return at 0 size.
|
|
scanner := bufio.NewScanner(file)
|
|
for scanner.Scan() {
|
|
for searchString, action := range searchStringActionMap {
|
|
var stringValue string
|
|
n, _ := fmt.Sscanf(scanner.Text(), searchString+"%s", &stringValue)
|
|
if n > 0 {
|
|
action(stringValue)
|
|
delete(searchStringActionMap, searchString)
|
|
if len(searchStringActionMap) == 0 {
|
|
return
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// createLabels is the main function of plugin labeler, it creates label-value pairs for the gpus.
|
|
func (l *labeler) createLabels() error {
|
|
gpuNameList, err := l.scan()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, gpuName := range gpuNameList {
|
|
gpuNum := ""
|
|
// extract card number as a string. scan() has already checked name syntax
|
|
_, err = fmt.Sscanf(gpuName, "card%s", &gpuNum)
|
|
if err != nil {
|
|
return errors.Wrap(err, "gpu name parsing error")
|
|
}
|
|
|
|
// read the memory amount to find a proper max allocation value
|
|
memoryAmount, numTiles := l.getTileMemoryAmount(gpuName)
|
|
|
|
// try to add capability labels
|
|
l.createCapabilityLabels(gpuNum, numTiles)
|
|
|
|
l.labels.addNumericLabel(labelNamespace+"memory.max", int64(memoryAmount))
|
|
}
|
|
gpuCount := len(gpuNameList)
|
|
if gpuCount > 0 {
|
|
// add gpu list label (example: "card0.card1.card2")
|
|
l.labels[labelNamespace+gpuListLabelName] = strings.Join(gpuNameList, ".")
|
|
|
|
// all GPUs get default number of millicores (1000)
|
|
l.labels.addNumericLabel(labelNamespace+millicoreLabelName, int64(millicoresPerGPU*gpuCount))
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (l *labeler) printLabels() {
|
|
for key, val := range l.labels {
|
|
fmt.Println(key + "=" + val)
|
|
}
|
|
}
|