mirror of
https://github.com/intel/intel-device-plugins-for-kubernetes.git
synced 2025-06-03 03:59:37 +00:00

along with it, fix some wsl findings. Signed-off-by: Mikko Ylinen <mikko.ylinen@intel.com>
235 lines
6.3 KiB
Go
235 lines
6.3 KiB
Go
// Copyright 2024 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

// #cgo CFLAGS: "-I/usr/include/level_zero" "-Wall" "-Wextra" "-O2"
// #cgo LDFLAGS: "-lze_loader"
// #include <stdlib.h>
// #include "ze.h"
import "C"

import (
	"context"
	"flag"
	"net"
	"os"
	"strconv"
	"unsafe"

	levelzero "github.com/intel/intel-device-plugins-for-kubernetes/cmd/internal/levelzero"
	"google.golang.org/grpc"
	"k8s.io/klog/v2"
)

type server struct {
|
|
levelzero.UnimplementedLevelzeroServer
|
|
}
|
|
|
|
func retrieveStatusDescription(code uint32) string {
|
|
bSize := 64
|
|
b := make([]byte, bSize)
|
|
|
|
cwritten := C.ze_status_to_string(C.uint32_t(code), (*C.char)(unsafe.Pointer(&b[0])), C.uint32_t(bSize))
|
|
|
|
written := int(cwritten)
|
|
if written <= 0 {
|
|
return "failed to retrieve description"
|
|
}
|
|
|
|
return string(b[0:written])
|
|
}
|
|
|
|
func (s *server) GetDeviceHealth(c context.Context, deviceid *levelzero.DeviceId) (*levelzero.DeviceHealth, error) {
|
|
klog.V(3).Infof("Retrieve device health for %s", deviceid.BdfAddress)
|
|
|
|
var errorVal uint32 = 0
|
|
|
|
cBdfAddress := C.CString(deviceid.BdfAddress)
|
|
|
|
memHealth := bool(C.zes_device_memory_is_healthy(cBdfAddress, (*C.uint32_t)(unsafe.Pointer(&errorVal))))
|
|
|
|
if errorVal != 0 {
|
|
klog.Warningf("device memory health read returned an error: 0x%X", errorVal)
|
|
}
|
|
|
|
busHealth := bool(C.zes_device_bus_is_healthy(cBdfAddress, (*C.uint32_t)(unsafe.Pointer(&errorVal))))
|
|
|
|
if errorVal != 0 {
|
|
klog.Warningf("device bus health read returned an error: 0x%X", errorVal)
|
|
}
|
|
|
|
var err levelzero.Error
|
|
if errorVal != 0 {
|
|
err.Errorcode = errorVal
|
|
err.Description = retrieveStatusDescription(errorVal)
|
|
} else {
|
|
klog.V(3).Infof("Health for %s: Memory=%t, Bus=%t", deviceid.BdfAddress, memHealth, busHealth)
|
|
}
|
|
|
|
health := &levelzero.DeviceHealth{
|
|
BusOk: busHealth,
|
|
MemoryOk: memHealth,
|
|
SocOk: true, // Placeholder, not available.
|
|
Error: &err,
|
|
}
|
|
|
|
return health, nil
|
|
}
|
|
|
|
func (s *server) GetDeviceTemperature(c context.Context, deviceid *levelzero.DeviceId) (*levelzero.DeviceTemperature, error) {
|
|
klog.V(3).Infof("Retrieve device temperature for %s", deviceid.BdfAddress)
|
|
|
|
var errorVal uint32 = 0
|
|
|
|
cBdfAddress := C.CString(deviceid.BdfAddress)
|
|
|
|
globalTemp := float64(C.zes_device_temp_max(cBdfAddress, C.CString("global"), (*C.uint32_t)(unsafe.Pointer(&errorVal))))
|
|
|
|
if errorVal != 0 {
|
|
klog.Warningf("global temperature read returned an error: 0x%X", errorVal)
|
|
}
|
|
|
|
gpuTemp := float64(C.zes_device_temp_max(cBdfAddress, C.CString("gpu"), (*C.uint32_t)(unsafe.Pointer(&errorVal))))
|
|
|
|
if errorVal != 0 {
|
|
klog.Warningf("gpu temperature read returned an error: 0x%X", errorVal)
|
|
}
|
|
|
|
memTemp := float64(C.zes_device_temp_max(cBdfAddress, C.CString("memory"), (*C.uint32_t)(unsafe.Pointer(&errorVal))))
|
|
|
|
if errorVal != 0 {
|
|
klog.Warningf("memory temperature read returned an error: 0x%X", errorVal)
|
|
}
|
|
|
|
var err levelzero.Error
|
|
if errorVal != 0 {
|
|
err.Errorcode = errorVal
|
|
err.Description = retrieveStatusDescription(errorVal)
|
|
} else {
|
|
klog.V(3).Infof("Temperatures for %s: Memory=%.1fC, GPU=%.1fC, Global=%.1fC", deviceid.BdfAddress, memTemp, gpuTemp, globalTemp)
|
|
}
|
|
|
|
temps := &levelzero.DeviceTemperature{
|
|
Global: globalTemp,
|
|
Gpu: gpuTemp,
|
|
Memory: memTemp,
|
|
Error: &err,
|
|
}
|
|
|
|
return temps, nil
|
|
}
|
|
|
|
func (s *server) GetIntelIndices(c context.Context, m *levelzero.GetIntelIndicesMessage) (*levelzero.DeviceIndices, error) {
|
|
klog.V(3).Infof("Retrieve Intel indices")
|
|
|
|
errorVal := uint32(0)
|
|
|
|
indices := make([]uint32, 8)
|
|
|
|
// TODO: Move to zes_ version when crash in WSL env is fixed:
|
|
// https://github.com/intel/compute-runtime/issues/721
|
|
count := C.ze_intel_device_indices((*C.uint32_t)(&indices[0]), C.uint32_t(len(indices)), (*C.uint32_t)(unsafe.Pointer(&errorVal)))
|
|
|
|
var err levelzero.Error
|
|
if errorVal != 0 {
|
|
err.Errorcode = errorVal
|
|
err.Description = retrieveStatusDescription(errorVal)
|
|
}
|
|
|
|
ret := levelzero.DeviceIndices{
|
|
Indices: indices[0:count],
|
|
Error: &err,
|
|
}
|
|
|
|
return &ret, nil
|
|
}
|
|
|
|
func (s *server) GetDeviceMemoryAmount(c context.Context, deviceid *levelzero.DeviceId) (*levelzero.DeviceMemoryAmount, error) {
|
|
klog.V(3).Infof("Retrieve device memory amount for %s", deviceid.BdfAddress)
|
|
|
|
errorVal := uint32(0)
|
|
|
|
memSize := C.zes_device_memory_amount(C.CString(deviceid.BdfAddress), (*C.uint32_t)(unsafe.Pointer(&errorVal)))
|
|
|
|
if errorVal != 0 {
|
|
klog.Warningf("device memory amount read returned an error: 0x%X", errorVal)
|
|
}
|
|
|
|
description := retrieveStatusDescription(errorVal)
|
|
|
|
var err levelzero.Error
|
|
if errorVal != 0 {
|
|
err.Errorcode = errorVal
|
|
err.Description = description
|
|
}
|
|
|
|
ret := levelzero.DeviceMemoryAmount{
|
|
MemorySize: uint64(memSize),
|
|
Error: &err,
|
|
}
|
|
|
|
return &ret, nil
|
|
}
|
|
|
|
func main() {
|
|
klog.InitFlags(nil)
|
|
|
|
socketPath := flag.String("socket", levelzero.DefaultUnixSocketPath, "Unix socket path to listen on")
|
|
wslEnv := flag.Bool("wsl", false, "Running in WSL environment")
|
|
|
|
flag.Parse()
|
|
|
|
// Delete possible previous socket file
|
|
_ = os.Remove(*socketPath)
|
|
|
|
verbosity := int64(0)
|
|
|
|
flag.VisitAll(func(f *flag.Flag) {
|
|
if f.Name == "v" {
|
|
if v, err := strconv.ParseInt(f.Value.String(), 10, 16); err == nil {
|
|
verbosity = v
|
|
}
|
|
}
|
|
})
|
|
|
|
lis, err := net.Listen("unix", *socketPath)
|
|
if err != nil {
|
|
klog.Fatalf("failed to listen: %v", err)
|
|
}
|
|
|
|
// TODO: Drop "ze_try_initialize" when crash in WSL env is fixed:
|
|
// https://github.com/intel/compute-runtime/issues/721
|
|
if *wslEnv {
|
|
if !bool(C.ze_try_initialize()) {
|
|
klog.Fatal("Ze Init try failed, cannot continue")
|
|
}
|
|
} else {
|
|
if !bool(C.zes_try_initialize()) {
|
|
klog.Fatal("Zes Init try failed, cannot continue")
|
|
}
|
|
}
|
|
|
|
C.zes_set_verbosity(C.int(verbosity))
|
|
|
|
s := grpc.NewServer()
|
|
|
|
levelzero.RegisterLevelzeroServer(s, &server{})
|
|
|
|
klog.Infof("server listening at %v", lis.Addr())
|
|
|
|
if err := s.Serve(lis); err != nil {
|
|
klog.Fatalf("failed to serve: %v", err)
|
|
}
|
|
}
|