mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00
Merge pull request #1116 from eero-t/gpu_fakedev
Add fake GPU device generator for scalability testing
This commit is contained in:
commit
b4c2bd3afe
1
.github/workflows/ci.yaml
vendored
1
.github/workflows/ci.yaml
vendored
@ -105,6 +105,7 @@ jobs:
|
||||
image:
|
||||
- intel-fpga-admissionwebhook
|
||||
- intel-fpga-initcontainer
|
||||
- intel-gpu-fakedev
|
||||
- intel-gpu-initcontainer
|
||||
- intel-gpu-plugin
|
||||
- intel-fpga-plugin
|
||||
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -11,6 +11,7 @@ cmd/fpga_crihook/fpga_crihook
|
||||
cmd/dlb_plugin/dlb_plugin
|
||||
cmd/fpga_plugin/fpga_plugin
|
||||
cmd/fpga_tool/fpga_tool
|
||||
cmd/gpu_fakedev/gpu_fakedev
|
||||
cmd/gpu_nfdhook/gpu_nfdhook
|
||||
cmd/gpu_plugin/gpu_plugin
|
||||
cmd/iaa_plugin/iaa_plugin
|
||||
@ -32,4 +33,4 @@ _build
|
||||
_work
|
||||
|
||||
*.tgz
|
||||
charts/operator/crds
|
||||
charts/operator/crds
|
||||
|
8
build/docker/templates/intel-gpu-fakedev.Dockerfile.in
Normal file
8
build/docker/templates/intel-gpu-fakedev.Dockerfile.in
Normal file
@ -0,0 +1,8 @@
|
||||
#define _ENTRYPOINT_ /usr/local/bin/intel_gpu_fakedev
|
||||
ARG CMD=gpu_fakedev
|
||||
|
||||
#include "default_plugin.docker"
|
||||
|
||||
LABEL name='intel-gpu-fakedev'
|
||||
LABEL summary='Fake device file generator for Intel® GPU plugin'
|
||||
LABEL description='Fake device file generator provides fake sysfs+devfs content for Intel GPU plugin from its initcontainer, for scalability testing'
|
47
cmd/gpu_fakedev/README.md
Normal file
47
cmd/gpu_fakedev/README.md
Normal file
@ -0,0 +1,47 @@
|
||||
# Fake (GPU) device file generator
|
||||
|
||||
Table of Contents
|
||||
* [Introduction](#introduction)
|
||||
* [Configuration](#configuration)
|
||||
* [Potential improvements](#potential-improvements)
|
||||
* [Related tools](#related-tools)
|
||||
|
||||
## Introduction
|
||||
|
||||
This is a tool for generating (large number of) fake device files for
|
||||
k8s device scheduling scalability testing. But it can also be used
|
||||
just to test (GPU) device plugin functionality without having
|
||||
corresponding device HW.
|
||||
|
||||
Its "intel-gpu-fakedev" container is intended to be run as first init
|
||||
container in a device plugin pod, so that device plugin (and its NFD
|
||||
labeler) see the fake (sysfs + devfs) files generated by the tool,
|
||||
instead of real host sysfs and devfs content.
|
||||
|
||||
## Configuration
|
||||
|
||||
[Configs](configs/) subdirectory contains example JSON configuration
|
||||
file(s) for the generator. Currently there's only one example JSON
|
||||
file, but each new device variant adding feature(s) that have specific
|
||||
support in device plugin, could have their own fake device config.
|
||||
|
||||
## Potential improvements
|
||||
|
||||
If support for mixed device environment is needed, tool can be updated
|
||||
to use node / configuration file mapping. Such mappings could be e.g.
|
||||
in configuration files themselves as node name include / exclude lists,
|
||||
and tool would use first configuration file matching the node it's
|
||||
running on. For now, one would need to use different pod / config
|
||||
specs for different nodes to achieve that...
|
||||
|
||||
Currently JSON config file options and the generated files are tied to
|
||||
what GPU plugin uses, but if needed, they could be changed to fake
|
||||
also sysfs + devfs device files used by other plugins.
|
||||
|
||||
## Related tools
|
||||
|
||||
[fakedev-exporter](https://github.com/intel/fakedev-exporter) project
|
||||
can be used to schedule suitably configured fake workloads on the fake
|
||||
devices, and to provide fake activity metrics for them to
|
||||
Prometheus, that look like they were reported by real Prometheus
|
||||
metric exporters for real workloads running on real devices.
|
8
cmd/gpu_fakedev/configs/8x-DG1.json
Normal file
8
cmd/gpu_fakedev/configs/8x-DG1.json
Normal file
@ -0,0 +1,8 @@
|
||||
{
|
||||
"Info": "8x 4 GiB DG1 [Iris Xe MAX Graphics] GPUs",
|
||||
"DevCount": 8,
|
||||
"DevMemSize": 4294967296,
|
||||
"Capabilities": {
|
||||
"platform": "fake_DG1",
|
||||
}
|
||||
}
|
313
cmd/gpu_fakedev/gpu_fakedev.go
Normal file
313
cmd/gpu_fakedev/gpu_fakedev.go
Normal file
@ -0,0 +1,313 @@
|
||||
// Copyright 2021-2022 Intel Corporation. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//---------------------------------------------------------------
|
||||
// sysfs SPECIFICATION
|
||||
//
|
||||
// sys/class/drm/cardX/
|
||||
// sys/class/drm/cardX/lmem_total_bytes (gpu memory size, number)
|
||||
// sys/class/drm/cardX/device/
|
||||
// sys/class/drm/cardX/device/vendor (0x8086)
|
||||
// sys/class/drm/cardX/device/sriov_numvfs (PF only, number of VF GPUs, number)
|
||||
// sys/class/drm/cardX/device/drm/
|
||||
// sys/class/drm/cardX/device/drm/cardX/
|
||||
// sys/class/drm/cardX/device/drm/renderD1XX/
|
||||
// sys/class/drm/cardX/device/numa_node (Numa node index[1], number)
|
||||
// [1] indexing these: /sys/devices/system/node/nodeX/
|
||||
//---------------------------------------------------------------
|
||||
// devfs SPECIFICATION
|
||||
//
|
||||
// dev/dri/cardX
|
||||
// dev/dri/renderD1XX
|
||||
//---------------------------------------------------------------
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const (
	// permissions for generated directories and files.
	dirMode  = 0775
	fileMode = 0644

	// device node numbering: cardX starts at cardBase,
	// renderD1XX starts at renderBase (matches real i915 numbering).
	cardBase   = 0
	renderBase = 128

	// upper bound for DevCount validation in getOptions().
	maxDevs = 128

	// relative roots under which fake sysfs / devfs content is generated.
	sysfsPath = "sys"
	devfsPath = "dev"

	// bytes per MiB; used to validate DevMemSize granularity.
	mib = 1024.0 * 1024.0

	// null device major, minor on linux.
	devNullMajor = 1
	devNullMinor = 3
	devNullType  = unix.S_IFCHR
)
|
||||
|
||||
var verbose bool
|
||||
|
||||
// genOptions holds the JSON-configurable fake device generation options,
// plus unexported counters used for reporting what was generated.
type genOptions struct {
	Capabilities map[string]string // device capabilities mapping for NFD hook
	Info         string            // verbal config description
	DevCount     int               // how many devices to fake
	TilesPerDev  int               // per-device tile count
	DevMemSize   int               // available per-device device-local memory, in bytes
	DevsPerNode  int               // How many devices per Numa node
	VfsPerPf     int               // How many SR-IOV VFs per PF

	// fields for counting what was generated
	files int
	dirs  int
	devs  int
}
|
||||
|
||||
func addSysfsDriTree(root string, opts *genOptions, i int) error {
|
||||
card := cardBase + i
|
||||
base := fmt.Sprintf("%s/class/drm/card%d", root, card)
|
||||
|
||||
if err := os.MkdirAll(base, dirMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.dirs++
|
||||
|
||||
data := []byte(fmt.Sprintf("%d", opts.DevMemSize))
|
||||
file := fmt.Sprintf("%s/lmem_total_bytes", base)
|
||||
|
||||
if err := os.WriteFile(file, data, fileMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.files++
|
||||
|
||||
path := fmt.Sprintf("%s/device/drm/card%d", base, card)
|
||||
if err := os.MkdirAll(path, dirMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.dirs++
|
||||
|
||||
path = fmt.Sprintf("%s/device/drm/renderD%d", base, renderBase+i)
|
||||
if err := os.Mkdir(path, dirMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.dirs++
|
||||
|
||||
data = []byte("0x8086")
|
||||
file = fmt.Sprintf("%s/device/vendor", base)
|
||||
|
||||
if err := os.WriteFile(file, data, fileMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.files++
|
||||
|
||||
node := 0
|
||||
if opts.DevsPerNode > 0 {
|
||||
node = i / opts.DevsPerNode
|
||||
}
|
||||
|
||||
data = []byte(fmt.Sprintf("%d", node))
|
||||
file = fmt.Sprintf("%s/device/numa_node", base)
|
||||
|
||||
if err := os.WriteFile(file, data, fileMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.files++
|
||||
|
||||
if opts.VfsPerPf > 0 && i%(opts.VfsPerPf+1) == 0 {
|
||||
data = []byte(fmt.Sprintf("%d", opts.VfsPerPf))
|
||||
file = fmt.Sprintf("%s/device/sriov_numvfs", base)
|
||||
|
||||
if err := os.WriteFile(file, data, fileMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.files++
|
||||
}
|
||||
|
||||
for tile := 0; tile < opts.TilesPerDev; tile++ {
|
||||
path := fmt.Sprintf("%s/gt/gt%d", base, tile)
|
||||
if err := os.MkdirAll(path, dirMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.dirs++
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func addDevfsDriTree(root string, opts *genOptions, i int) error {
|
||||
base := fmt.Sprintf("%s/dri", root)
|
||||
if err := os.MkdirAll(base, dirMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.dirs++
|
||||
|
||||
mode := uint32(fileMode | devNullType)
|
||||
devid := int(unix.Mkdev(uint32(devNullMajor), uint32(devNullMinor)))
|
||||
|
||||
file := fmt.Sprintf("%s/card%d", base, cardBase+i)
|
||||
if err := unix.Mknod(file, mode, devid); err != nil {
|
||||
return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w",
|
||||
devNullMajor, devNullMinor, file, err)
|
||||
}
|
||||
opts.devs++
|
||||
|
||||
file = fmt.Sprintf("%s/renderD%d", base, renderBase+i)
|
||||
if err := unix.Mknod(file, mode, devid); err != nil {
|
||||
return fmt.Errorf("NULL device (%d:%d) node creation failed for '%s': %w",
|
||||
devNullMajor, devNullMinor, file, err)
|
||||
}
|
||||
opts.devs++
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func addDebugfsDriTree(root string, opts *genOptions, i int) error {
|
||||
base := fmt.Sprintf("%s/kernel/debug/dri/%d", root, i)
|
||||
if err := os.MkdirAll(base, dirMode); err != nil {
|
||||
return err
|
||||
}
|
||||
opts.dirs++
|
||||
|
||||
path := fmt.Sprintf("%s/i915_capabilities", base)
|
||||
f, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_EXCL, fileMode)
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
opts.files++
|
||||
|
||||
// keys are in random order which provides extra testing for NFD label parsing code
|
||||
for key, value := range opts.Capabilities {
|
||||
line := fmt.Sprintf("%s: %s\n", key, value)
|
||||
if _, err = f.WriteString(line); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// removeExistingDir removes a pre-existing fake tree at path so generation
// starts from scratch. As a safety net it refuses (fatally) to remove
// anything that looks like a real sysfs or devfs mount.
func removeExistingDir(path, name string) {
	entries, err := os.ReadDir(path)
	if err != nil && !errors.Is(err, fs.ErrNotExist) {
		log.Fatalf("ERROR: ReadDir() failed on fake %s path '%s': %v", name, path, err)
	}

	// nothing to remove (missing dir reads as zero entries)
	if len(entries) == 0 {
		return
	}

	switch name {
	case "sysfs":
		// fake sysfs has at most "class" + "kernel" top-level entries
		if len(entries) > 2 {
			log.Fatalf("ERROR: >2 entries in '%s' - real sysfs?", path)
		}
	case "devfs":
		// fake devfs has exactly one top-level entry: "dri"
		if entries[0].Name() != "dri" || len(entries) > 1 {
			log.Fatalf("ERROR: >1 entries in '%s', or '%s' != 'dri' - real devfs?", path, entries[0].Name())
		}
	}

	log.Printf("WARN: removing already existing fake %s path '%s'", name, path)

	if err = os.RemoveAll(path); err != nil {
		log.Fatalf("ERROR: removing existing %s in '%s' failed: %v", name, path, err)
	}
}
|
||||
|
||||
// generateDriFiles generates the fake sysfs + debugfs + devfs dirs & files according to given options.
|
||||
func generateDriFiles(opts genOptions) {
|
||||
if opts.Info != "" {
|
||||
log.Printf("Config: '%s'", opts.Info)
|
||||
}
|
||||
|
||||
removeExistingDir(devfsPath, "devfs")
|
||||
removeExistingDir(sysfsPath, "sysfs")
|
||||
log.Printf("Generating fake DRI device(s) sysfs, debugfs and devfs content under '%s' & '%s'",
|
||||
sysfsPath, devfsPath)
|
||||
|
||||
opts.dirs, opts.files = 0, 0
|
||||
for i := 0; i < opts.DevCount; i++ {
|
||||
if err := addSysfsDriTree(sysfsPath, &opts, i); err != nil {
|
||||
log.Fatalf("ERROR: dev-%d sysfs tree generation failed: %v", i, err)
|
||||
}
|
||||
|
||||
if err := addDebugfsDriTree(sysfsPath, &opts, i); err != nil {
|
||||
log.Fatalf("ERROR: dev-%d debugfs tree generation failed: %v", i, err)
|
||||
}
|
||||
|
||||
if err := addDevfsDriTree(devfsPath, &opts, i); err != nil {
|
||||
log.Fatalf("ERROR: dev-%d devfs tree generation failed: %v", i, err)
|
||||
}
|
||||
}
|
||||
log.Printf("Done, created %d dirs, %d devices and %d files.", opts.dirs, opts.devs, opts.files)
|
||||
}
|
||||
|
||||
// getOptions parses options from given JSON file, validates and returns them.
|
||||
func getOptions(name string) genOptions {
|
||||
if name == "" {
|
||||
log.Fatal("ERROR: no fake device spec provided")
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(name)
|
||||
if err != nil {
|
||||
log.Fatalf("ERROR: reading JSON spec file '%s' failed: %v", name, err)
|
||||
}
|
||||
|
||||
if verbose {
|
||||
log.Printf("Using fake device spec: %v\n", string(data))
|
||||
}
|
||||
|
||||
var opts genOptions
|
||||
if err = json.Unmarshal(data, &opts); err != nil {
|
||||
log.Fatalf("ERROR: Unmarshaling JSON spec file '%s' failed: %v", name, err)
|
||||
}
|
||||
|
||||
if opts.DevCount < 1 || opts.DevCount > maxDevs {
|
||||
log.Fatalf("ERROR: invalid device count: 1 <= %d <= %d", opts.DevCount, maxDevs)
|
||||
}
|
||||
|
||||
if opts.VfsPerPf > 0 {
|
||||
if opts.TilesPerDev > 0 || opts.DevsPerNode > 0 {
|
||||
log.Fatalf("ERROR: SR-IOV VFs (%d) with device tiles (%d) or Numa nodes (%d) is unsupported for faking",
|
||||
opts.VfsPerPf, opts.TilesPerDev, opts.DevsPerNode)
|
||||
}
|
||||
|
||||
if opts.DevCount%(opts.VfsPerPf+1) != 0 {
|
||||
log.Fatalf("ERROR: %d devices cannot be evenly split to between set of 1 SR-IOV PF + %d VFs",
|
||||
opts.DevCount, opts.VfsPerPf)
|
||||
}
|
||||
}
|
||||
|
||||
if opts.DevsPerNode > opts.DevCount {
|
||||
log.Fatalf("ERROR: DevsPerNode (%d) > DevCount (%d)", opts.DevsPerNode, opts.DevCount)
|
||||
}
|
||||
|
||||
if opts.DevMemSize%mib != 0 {
|
||||
log.Fatalf("ERROR: Invalid memory size (%f MiB), not even MiB", float64(opts.DevMemSize)/mib)
|
||||
}
|
||||
|
||||
return opts
|
||||
}
|
||||
|
||||
func main() {
|
||||
var name string
|
||||
|
||||
flag.StringVar(&name, "json", "", "JSON spec for fake device sysfs, debugfs and devfs content")
|
||||
flag.BoolVar(&verbose, "verbose", false, "More verbose output")
|
||||
flag.Parse()
|
||||
|
||||
generateDriFiles(getOptions(name))
|
||||
}
|
Loading…
Reference in New Issue
Block a user