Refactor recording-rules and alerts code (#3068)

* Refactor recording-rules and alerts code

Signed-off-by: avlitman <alitman@redhat.com>

* Remove promv1 from schema

Signed-off-by: avlitman <alitman@redhat.com>

---------

Signed-off-by: avlitman <alitman@redhat.com>
This commit is contained in:
Aviv Litman 2024-02-18 17:05:42 +02:00 committed by GitHub
parent 24c9eb5706
commit 42ec627e35
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
38 changed files with 635 additions and 401 deletions

View File

@ -156,7 +156,7 @@ openshift-ci-image-push:
./hack/build/osci-image-builder.sh ./hack/build/osci-image-builder.sh
generate-doc: build-docgen generate-doc: build-docgen
_out/pkg/monitoring/tools/metricsdocs/metricsdocs > doc/metrics.md _out/tools/metricsdocs/metricsdocs > doc/metrics.md
bootstrap-ginkgo: bootstrap-ginkgo:
${DO_BAZ} ./hack/build/bootstrap-ginkgo.sh ${DO_BAZ} ./hack/build/bootstrap-ginkgo.sh

View File

@ -1,5 +1,8 @@
# Containerized Data Importer metrics # Containerized Data Importer metrics
### kubevirt_cdi_clone_pods_high_restart
The number of CDI clone pods with high restart count. Type: Gauge.
### kubevirt_cdi_cr_ready ### kubevirt_cdi_cr_ready
CDI install ready. Type: Gauge. CDI install ready. Type: Gauge.
@ -9,18 +12,15 @@ DataImportCron has an outdated import. Type: Gauge.
### kubevirt_cdi_datavolume_pending ### kubevirt_cdi_datavolume_pending
Number of DataVolumes pending for default storage class to be configured. Type: Gauge. Number of DataVolumes pending for default storage class to be configured. Type: Gauge.
### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge.
### kubevirt_cdi_clone_pods_high_restart
The number of CDI clone pods with high restart count. Type: Gauge.
### kubevirt_cdi_import_pods_high_restart ### kubevirt_cdi_import_pods_high_restart
The number of CDI import pods with high restart count. Type: Gauge. The number of CDI import pods with high restart count. Type: Gauge.
### kubevirt_cdi_operator_up ### kubevirt_cdi_operator_up
CDI operator status. Type: Gauge. CDI operator status. Type: Gauge.
### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge.
### kubevirt_cdi_upload_pods_high_restart ### kubevirt_cdi_upload_pods_high_restart
The number of CDI upload server pods with high restart count. Type: Gauge. The number of CDI upload server pods with high restart count. Type: Gauge.

2
go.mod
View File

@ -23,7 +23,7 @@ require (
github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1 github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1
github.com/kubernetes-csi/lib-volume-populator v1.2.1-0.20230316163120-b62a0eee2c56 github.com/kubernetes-csi/lib-volume-populator v1.2.1-0.20230316163120-b62a0eee2c56
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a
github.com/machadovilaca/operator-observability v0.0.9 github.com/machadovilaca/operator-observability v0.0.13
github.com/onsi/ginkgo/v2 v2.12.0 github.com/onsi/ginkgo/v2 v2.12.0
github.com/onsi/gomega v1.27.10 github.com/onsi/gomega v1.27.10
github.com/openshift/api v0.0.0-20240116035456-11ed2fbcb805 github.com/openshift/api v0.0.0-20240116035456-11ed2fbcb805

4
go.sum
View File

@ -1099,8 +1099,8 @@ github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo= github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo=
github.com/lyft/protoc-gen-star v0.6.0/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA= github.com/lyft/protoc-gen-star v0.6.0/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/lyft/protoc-gen-star v0.6.1/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA= github.com/lyft/protoc-gen-star v0.6.1/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/machadovilaca/operator-observability v0.0.9 h1:jL2jVh0YJNA3nSX216X74RDZiEPPvsgqXollYmMOQkg= github.com/machadovilaca/operator-observability v0.0.13 h1:9mhxEjkdE6pcl3ke8chbbAWxx25+K1m4Gq31yo+r2JU=
github.com/machadovilaca/operator-observability v0.0.9/go.mod h1:NGkaR3HEYLScVQf6kQAyxWOSN1ltHcsEvHU/8iIJ8cE= github.com/machadovilaca/operator-observability v0.0.13/go.mod h1:e4Z3VhOXb9InkmSh00JjqBBijE+iD+YMzynBpKB3+gE=
github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=

View File

@ -29,10 +29,10 @@ mkdir -p ${CMD_OUT_DIR}/dump
bazel build \ bazel build \
--verbose_failures \ --verbose_failures \
--config=${ARCHITECTURE} \ --config=${ARCHITECTURE} \
//pkg/monitoring/tools/metricsdocs/... //tools/metricsdocs/...
rm -rf _out/pkg/monitoring/tools/metricsdocs rm -rf _out/tools/metricsdocs
mkdir -p _out/pkg/monitoring/tools/metricsdocs mkdir -p _out/tools/metricsdocs
cp ./bazel-bin/pkg/monitoring/tools/metricsdocs/metricsdocs_/metricsdocs _out/pkg/monitoring/tools/metricsdocs/ cp ./bazel-bin/tools/metricsdocs/metricsdocs_/metricsdocs _out/tools/metricsdocs/
bazel clean bazel clean

View File

@ -23,7 +23,7 @@ set -e
linter_image_tag="v0.0.1" linter_image_tag="v0.0.1"
PROJECT_ROOT="$(readlink -e "$(dirname "${BASH_SOURCE[0]}")"/../../)" PROJECT_ROOT="$(readlink -e "$(dirname "${BASH_SOURCE[0]}")"/../../)"
export METRICS_COLLECTOR_PATH="${METRICS_COLLECTOR_PATH:-${PROJECT_ROOT}/pkg/monitoring/tools/prom-metrics-collector}" export METRICS_COLLECTOR_PATH="${METRICS_COLLECTOR_PATH:-${PROJECT_ROOT}/tools/prom-metrics-collector}"
if [[ ! -d "$METRICS_COLLECTOR_PATH" ]]; then if [[ ! -d "$METRICS_COLLECTOR_PATH" ]]; then
echo "Invalid METRICS_COLLECTOR_PATH: $METRICS_COLLECTOR_PATH is not a valid directory path" echo "Invalid METRICS_COLLECTOR_PATH: $METRICS_COLLECTOR_PATH is not a valid directory path"

View File

@ -24,7 +24,7 @@ import (
"k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/types"
cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1" cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
"kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller" metrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
) )
const ( const (

View File

@ -1,4 +1,4 @@
package metrics package cdicontroller
import ( import (
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"

View File

@ -1,4 +1,4 @@
package metrics package cdicontroller
import ( import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics" "github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics package cdicontroller
import ( import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics" "github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics package cdicontroller
import ( import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics" "github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics package operatorcontroller
import ( import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics" "github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics package operatorcontroller
import ( import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics" "github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -0,0 +1,15 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["rules.go"],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules",
visibility = ["//visibility:public"],
deps = [
"//pkg/common:go_default_library",
"//pkg/monitoring/rules/alerts:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
],
)

View File

@ -0,0 +1,18 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = [
"operator.go",
"prometheus.go",
],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts",
visibility = ["//visibility:public"],
deps = [
"//pkg/common:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
"//vendor/k8s.io/utils/ptr:go_default_library",
],
)

View File

@ -0,0 +1,109 @@
package alerts
import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)
// operatorAlerts lists the alerting rules defined for CDI. Each entry sets the
// alert name, its PromQL expression, how long the expression must hold (For),
// a "summary" annotation, and the severity and operator_health_impact labels.
// Register later adds the shared part-of/component labels and the runbook_url
// annotation to every entry.
var operatorAlerts = []promv1.Rule{
// kubevirt_cdi_operator_up equals 0.
{
Alert: "CDIOperatorDown",
Expr: intstr.FromString("kubevirt_cdi_operator_up == 0"),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "CDI operator is down",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "critical",
},
},
// kubevirt_cdi_cr_ready equals 0.
{
Alert: "CDINotReady",
Expr: intstr.FromString("kubevirt_cdi_cr_ready == 0"),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "CDI is not available to use",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "critical",
},
},
// Any of the import, upload or clone high-restart gauges is above 0.
{
Alert: "CDIDataVolumeUnusualRestartCount",
Expr: intstr.FromString("kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0"),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "warning",
},
},
// Storage profile series labelled complete="false" with a value above 0,
// summed per storageclass and provisioner.
{
Alert: "CDIStorageProfilesIncomplete",
Expr: intstr.FromString(`sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
},
Labels: map[string]string{
severityAlertLabelKey: "info",
operatorHealthImpactLabelKey: "warning",
},
},
// kubevirt_cdi_dataimportcron_outdated summed per ns and cron_name is above 0.
// This is the only alert here with a 15m hold instead of 5m.
{
Alert: "CDIDataImportCronOutdated",
Expr: intstr.FromString("sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0"),
For: (*promv1.Duration)(ptr.To("15m")),
Annotations: map[string]string{
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
},
Labels: map[string]string{
severityAlertLabelKey: "info",
operatorHealthImpactLabelKey: "warning",
},
},
// The default and virtdefault storage profile sums are both 0 and no
// kubevirt_cdi_datavolume_pending series equals 0. The "or on() vector(0)"
// terms supply a 0 operand when no series match.
{
Alert: "CDINoDefaultStorageClass",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) +
(count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "none",
},
},
// The summed value of virtdefault="true" storage profile series exceeds 1.
{
Alert: "CDIMultipleDefaultVirtStorageClasses",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "More than one default virtualization StorageClass detected",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "none",
},
},
// The summed value of default or virtdefault storage profile series that
// carry both rwx="true" and smartclone="true" is 0.
{
Alert: "CDIDefaultStorageClassDegraded",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Default storage class has no smart clone or ReadWriteMany",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "none",
},
},
}

View File

@ -0,0 +1,58 @@
package alerts
import (
"errors"
"fmt"
"os"
"strings"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"kubevirt.io/containerized-data-importer/pkg/common"
)
// Annotation keys, label keys/values and runbook settings shared by the alert
// definitions and by Register.
const (
// prometheusRunbookAnnotationKey is the annotation under which Register
// stores each alert's runbook URL.
prometheusRunbookAnnotationKey = "runbook_url"
// defaultRunbookURLTemplate is used when runbookURLTemplateEnv is not set;
// the single %s is filled with the alert name.
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
// runbookURLTemplateEnv names the environment variable that overrides the
// default runbook URL template.
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
// Label keys set individually on each alert definition.
severityAlertLabelKey = "severity"
operatorHealthImpactLabelKey = "operator_health_impact"
// Label keys and values that Register adds to every alert.
partOfAlertLabelKey = "kubernetes_operator_part_of"
componentAlertLabelKey = "kubernetes_operator_component"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelValue = common.CDILabelValue
)
// Register stamps every alert in operatorAlerts with the shared part-of and
// component labels and with a runbook_url annotation built from the runbook
// URL template and the alert name, then hands the alerts to
// operatorrules.RegisterAlerts.
//
// The namespace argument is not used by the alert definitions; it is kept so
// existing callers that pass it continue to compile.
//
// It returns whatever error operatorrules.RegisterAlerts reports, and panics
// if the runbook URL template is rejected by GetRunbookURLTemplate.
func Register(namespace string) error {
	alerts := [][]promv1.Rule{
		operatorAlerts,
	}

	runbookURLTemplate := GetRunbookURLTemplate()

	for _, alertGroup := range alerts {
		for i := range alertGroup {
			// Work on the slice element itself so that a map created below is
			// stored on the rule that gets registered, not on a loop copy.
			alert := &alertGroup[i]

			// Assigning into a nil map panics; tolerate alert definitions that
			// omit Labels or Annotations.
			if alert.Labels == nil {
				alert.Labels = map[string]string{}
			}
			if alert.Annotations == nil {
				alert.Annotations = map[string]string{}
			}

			alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
			alert.Labels[componentAlertLabelKey] = componentAlertLabelValue
			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
		}
	}

	return operatorrules.RegisterAlerts(alerts...)
}
// GetRunbookURLTemplate fetches or defaults the runbook URL template.
func GetRunbookURLTemplate() string {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}
if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbook URL template must have exactly 1 %s substring"))
}
return runbookURLTemplate
}

View File

@ -2,8 +2,17 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library( go_library(
name = "go_default_library", name = "go_default_library",
srcs = ["recordingrules.go"], srcs = [
"operator.go",
"pods.go",
"recordingrules.go",
],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules", importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules",
visibility = ["//visibility:public"], visibility = ["//visibility:public"],
deps = ["//pkg/common:go_default_library"], deps = [
"//pkg/common:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatormetrics:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
],
) )

View File

@ -0,0 +1,24 @@
package recordingrules
import (
"fmt"
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
"k8s.io/apimachinery/pkg/util/intstr"
)
// operatorRecordingRules returns the recording rules that depend on the
// operator's namespace. It currently yields a single gauge,
// kubevirt_cdi_operator_up, whose expression sums the "up" series of pods
// matching cdi-operator-.* in the given namespace and falls back to vector(0).
func operatorRecordingRules(namespace string) []operatorrules.RecordingRule {
	upExpr := fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace)

	operatorUp := operatorrules.RecordingRule{
		MetricsOpts: operatormetrics.MetricOpts{
			Name: "kubevirt_cdi_operator_up",
			Help: "CDI operator status",
		},
		MetricType: operatormetrics.GaugeType,
		Expr:       intstr.FromString(upExpr),
	}

	return []operatorrules.RecordingRule{operatorUp}
}

View File

@ -0,0 +1,45 @@
package recordingrules
import (
"fmt"
"strconv"
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
"k8s.io/apimachinery/pkg/util/intstr"
"kubevirt.io/containerized-data-importer/pkg/common"
)
// podsRecordingRules defines one gauge per CDI workload kind (import, upload,
// clone). Each expression counts kube_pod_container_status_restarts_total
// series for the matching pod/container whose value exceeds
// common.UnusualRestartCountThreshold, and falls back to vector(0) when the
// count yields no result.
var podsRecordingRules = []operatorrules.RecordingRule{
// Import pods: pod name starts with common.ImporterPodName followed by "-".
{
MetricsOpts: operatormetrics.MetricOpts{
Name: "kubevirt_cdi_import_pods_high_restart",
Help: "The number of CDI import pods with high restart count",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
},
// Upload pods: the pod prefix (common.UploadPodName) and the container name
// (common.UploadServerPodname) come from different constants.
{
MetricsOpts: operatormetrics.MetricOpts{
Name: "kubevirt_cdi_upload_pods_high_restart",
Help: "The number of CDI upload server pods with high restart count",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
},
// Clone pods: matched by name suffix (common.ClonerSourcePodNameSuffix)
// rather than by prefix.
{
MetricsOpts: operatormetrics.MetricOpts{
Name: "kubevirt_cdi_clone_pods_high_restart",
Help: "The number of CDI clone pods with high restart count",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s) or on() vector(0)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
},
}

View File

@ -1,59 +1,11 @@
package recordingrules package recordingrules
import ( import "github.com/machadovilaca/operator-observability/pkg/operatorrules"
"fmt"
"strconv"
"kubevirt.io/containerized-data-importer/pkg/common" // Register sets up recording rules in the given namespace.
) func Register(namespace string) error {
return operatorrules.RegisterRecordingRules(
// MetricOpts represent CDI Prometheus Metrics operatorRecordingRules(namespace),
type MetricOpts struct { podsRecordingRules,
Name string )
Help string
Type string
}
// RecordRulesDesc represent CDI Prometheus Record Rules
type RecordRulesDesc struct {
Opts MetricOpts
Expr string
}
// GetRecordRulesDesc returns CDI Prometheus Record Rules
func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
return []RecordRulesDesc{
{
MetricOpts{
"kubevirt_cdi_operator_up",
"CDI operator status",
"Gauge",
},
fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace),
},
{
MetricOpts{
"kubevirt_cdi_import_pods_high_restart",
"The number of CDI import pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_upload_pods_high_restart",
"The number of CDI upload server pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_clone_pods_high_restart",
"The number of CDI clone pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s) or on() vector(0)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
}
} }

View File

@ -0,0 +1,49 @@
package rules
import (
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"kubevirt.io/containerized-data-importer/pkg/common"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules"
)
const (
// ruleName is the name BuildPrometheusRule passes to
// operatorrules.BuildPrometheusRule for the generated PrometheusRule.
ruleName = "prometheus-cdi-rules"
)
// SetupRules registers CDI's recording rules and then its alert rules, passing
// namespace to both. Registration stops at the first failure and that error is
// returned; nil means both registrations succeeded.
func SetupRules(namespace string) error {
	err := recordingrules.Register(namespace)
	if err != nil {
		return err
	}

	return alerts.Register(namespace)
}
// BuildPrometheusRule asks operatorrules to build the PrometheusRule named
// ruleName in the given namespace, labelled with the CDI component label
// (empty value) and the CDI Prometheus label.
func BuildPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
	ruleLabels := map[string]string{
		common.CDIComponentLabel:  "",
		common.PrometheusLabelKey: common.PrometheusLabelValue,
	}

	return operatorrules.BuildPrometheusRule(ruleName, namespace, ruleLabels)
}
// ListRecordingRules returns the recording rules reported by
// operatorrules.ListRecordingRules. It performs no registration itself.
// NOTE(review): presumably the list is empty until SetupRules has run — confirm
// against the operatorrules package.
func ListRecordingRules() []operatorrules.RecordingRule {
return operatorrules.ListRecordingRules()
}
// ListAlerts returns the alert rules reported by operatorrules.ListAlerts. It
// performs no registration itself.
// NOTE(review): presumably the list is empty until SetupRules has run — confirm
// against the operatorrules package.
func ListAlerts() []promv1.Rule {
return operatorrules.ListAlerts()
}

View File

@ -1,81 +0,0 @@
package main
import (
"fmt"
"github.com/machadovilaca/operator-observability/pkg/docs"
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules"
)
// tpl is the Go template handed to docs.BuildMetricsDocsWithCustomTemplate.
// For every entry it emits a "### <name>" heading followed by the help text and
// type. When ExtraFields.StabilityLevel is set and is not "STABLE", the help
// text is prefixed with "[<level>]", including " in <version>" when
// ExtraFields has a DeprecatedVersion entry.
const tpl = `# Containerized Data Importer metrics
{{- range . }}
{{ $deprecatedVersion := "" -}}
{{- with index .ExtraFields "DeprecatedVersion" -}}
{{- $deprecatedVersion = printf " in %s" . -}}
{{- end -}}
{{- $stabilityLevel := "" -}}
{{- if and (.ExtraFields.StabilityLevel) (ne .ExtraFields.StabilityLevel "STABLE") -}}
{{- $stabilityLevel = printf "[%s%s] " .ExtraFields.StabilityLevel $deprecatedVersion -}}
{{- end -}}
### {{ .Name }}
{{ print $stabilityLevel }}{{ .Help }}. Type: {{ .Type -}}.
{{- end }}
## Developing new metrics
All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones please regenerate
this document.
`
// main sets up the operator and CDI controller metrics, collects the metric
// list and the recording rules (built with an empty namespace), renders them
// with tpl and prints the resulting document to standard output. Any setup
// error aborts the program with a panic.
func main() {
	if err := operatorMetrics.SetupMetrics(); err != nil {
		panic(err)
	}
	if err := cdiMetrics.SetupMetrics(); err != nil {
		panic(err)
	}

	metrics := operatorMetrics.ListMetrics()
	rulesForDocs := convertToRecordingRules(recordingrules.GetRecordRulesDesc(""))

	fmt.Print(docs.BuildMetricsDocsWithCustomTemplate(metrics, rulesForDocs, tpl))
}
// convertToRecordingRules maps CDI's record rule descriptors onto
// operatorrules.RecordingRule values. Only the name, help text and metric type
// are carried over; the rule expression is left unset. A nil slice is returned
// when the input is empty.
func convertToRecordingRules(recordRulesDesc []recordingrules.RecordRulesDesc) []operatorrules.RecordingRule {
	var converted []operatorrules.RecordingRule

	for i := range recordRulesDesc {
		opts := recordRulesDesc[i].Opts
		converted = append(converted, operatorrules.RecordingRule{
			MetricsOpts: operatormetrics.MetricOpts{
				Name: opts.Name,
				Help: opts.Help,
			},
			MetricType: convertRulesType(opts.Type),
		})
	}

	return converted
}
// convertRulesType translates the textual metric type of a record rule
// descriptor into an operatormetrics.MetricType. Only "Gauge" is mapped; every
// other value yields the empty MetricType, so a recording rule of a new type
// needs a matching case added here.
func convertRulesType(metricType string) operatormetrics.MetricType {
	switch metricType {
	case "Gauge":
		return operatormetrics.GaugeType
	default:
		return ""
	}
}

View File

@ -24,7 +24,7 @@ go_library(
"//pkg/controller/common:go_default_library", "//pkg/controller/common:go_default_library",
"//pkg/feature-gates:go_default_library", "//pkg/feature-gates:go_default_library",
"//pkg/monitoring/metrics/operator-controller:go_default_library", "//pkg/monitoring/metrics/operator-controller:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library", "//pkg/monitoring/rules:go_default_library",
"//pkg/operator:go_default_library", "//pkg/operator:go_default_library",
"//pkg/operator/resources/cert:go_default_library", "//pkg/operator/resources/cert:go_default_library",
"//pkg/operator/resources/cluster:go_default_library", "//pkg/operator/resources/cluster:go_default_library",
@ -59,7 +59,6 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library", "//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime/schema:go_default_library", "//vendor/k8s.io/apimachinery/pkg/runtime/schema:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library", "//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/strategicpatch:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/strategicpatch:go_default_library",
"//vendor/k8s.io/apiserver/pkg/authentication/user:go_default_library", "//vendor/k8s.io/apiserver/pkg/authentication/user:go_default_library",
"//vendor/k8s.io/client-go/kubernetes:go_default_library", "//vendor/k8s.io/client-go/kubernetes:go_default_library",
@ -94,6 +93,8 @@ go_test(
embed = [":go_default_library"], embed = [":go_default_library"],
deps = [ deps = [
"//pkg/common:go_default_library", "//pkg/common:go_default_library",
"//pkg/monitoring/rules:go_default_library",
"//pkg/monitoring/rules/alerts:go_default_library",
"//pkg/operator/resources/cert:go_default_library", "//pkg/operator/resources/cert:go_default_library",
"//pkg/operator/resources/cluster:go_default_library", "//pkg/operator/resources/cluster:go_default_library",
"//pkg/operator/resources/namespaced:go_default_library", "//pkg/operator/resources/namespaced:go_default_library",

View File

@ -40,6 +40,7 @@ import (
cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1" cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
metrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller" metrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
"kubevirt.io/containerized-data-importer/pkg/operator" "kubevirt.io/containerized-data-importer/pkg/operator"
cdicerts "kubevirt.io/containerized-data-importer/pkg/operator/resources/cert" cdicerts "kubevirt.io/containerized-data-importer/pkg/operator/resources/cert"
cdicluster "kubevirt.io/containerized-data-importer/pkg/operator/resources/cluster" cdicluster "kubevirt.io/containerized-data-importer/pkg/operator/resources/cluster"
@ -129,6 +130,11 @@ func newReconciler(mgr manager.Manager) (*ReconcileCDI, error) {
return nil, err return nil, err
} }
err = rules.SetupRules(namespace)
if err != nil {
return nil, err
}
recorder := mgr.GetEventRecorderFor("operator-controller") recorder := mgr.GetEventRecorderFor("operator-controller")
r := &ReconcileCDI{ r := &ReconcileCDI{

View File

@ -25,9 +25,8 @@ import (
"strconv" "strconv"
"strings" "strings"
"kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/callbacks"
sdkapi "kubevirt.io/controller-lifecycle-operator-sdk/api" sdkapi "kubevirt.io/controller-lifecycle-operator-sdk/api"
"kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/callbacks"
sdkr "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/reconciler" sdkr "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/reconciler"
. "github.com/onsi/ginkgo/v2" . "github.com/onsi/ginkgo/v2"
@ -57,6 +56,8 @@ import (
cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1" cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
"kubevirt.io/containerized-data-importer/pkg/common" "kubevirt.io/containerized-data-importer/pkg/common"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts"
clusterResources "kubevirt.io/containerized-data-importer/pkg/operator/resources/cluster" clusterResources "kubevirt.io/containerized-data-importer/pkg/operator/resources/cluster"
namespaceResources "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced" namespaceResources "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced"
utils "kubevirt.io/containerized-data-importer/pkg/operator/resources/utils" utils "kubevirt.io/containerized-data-importer/pkg/operator/resources/utils"
@ -285,7 +286,7 @@ var _ = Describe("Controller", func() {
doReconcile(args) doReconcile(args)
Expect(setDeploymentsReady(args)).To(BeTrue()) Expect(setDeploymentsReady(args)).To(BeTrue())
runbookURLTemplate := getRunbookURLTemplate() runbookURLTemplate := alerts.GetRunbookURLTemplate()
rule := &promv1.PrometheusRule{ rule := &promv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
@ -313,7 +314,7 @@ var _ = Describe("Controller", func() {
}, },
} }
Expect(rule.Spec.Groups[0].Rules).To(ContainElement(cdiDownAlert)) Expect(rule.Spec.Groups[1].Rules).To(ContainElement(cdiDownAlert))
Expect(rule.Labels[common.AppKubernetesPartOfLabel]).To(Equal("testing")) Expect(rule.Labels[common.AppKubernetesPartOfLabel]).To(Equal("testing"))
validateEvents(args.reconciler, createReadyEventValidationMap()) validateEvents(args.reconciler, createReadyEventValidationMap())
}) })
@ -1737,6 +1738,11 @@ func createReconciler(client client.Client) *ReconcileCDI {
Namespace: namespace, Namespace: namespace,
} }
err := rules.SetupRules(namespace)
if err != nil {
panic(err)
}
recorder := record.NewFakeRecorder(250) recorder := record.NewFakeRecorder(250)
r := &ReconcileCDI{ r := &ReconcileCDI{
client: client, client: client,

View File

@ -18,11 +18,9 @@ package controller
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"os" "os"
"reflect" "reflect"
"strings"
"github.com/go-logr/logr" "github.com/go-logr/logr"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@ -31,14 +29,13 @@ import (
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/source" "sigs.k8s.io/controller-runtime/pkg/source"
"kubevirt.io/containerized-data-importer/pkg/common" "kubevirt.io/containerized-data-importer/pkg/common"
cc "kubevirt.io/containerized-data-importer/pkg/controller/common" cc "kubevirt.io/containerized-data-importer/pkg/controller/common"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules" "kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
cdinamespaced "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced" cdinamespaced "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced"
"kubevirt.io/containerized-data-importer/pkg/util" "kubevirt.io/containerized-data-importer/pkg/util"
@ -52,12 +49,6 @@ const (
defaultMonitoringNs = "monitoring" defaultMonitoringNs = "monitoring"
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s" defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE" runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
severityAlertLabelKey = "severity"
healthImpactAlertLabelKey = "operator_health_impact"
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = common.CDILabelValue
) )
func ensurePrometheusResourcesExist(ctx context.Context, c client.Client, scheme *runtime.Scheme, owner metav1.Object) error { func ensurePrometheusResourcesExist(ctx context.Context, c client.Client, scheme *runtime.Scheme, owner metav1.Object) error {
@ -134,166 +125,15 @@ func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string)
return true, nil return true, nil
} }
func getRecordRules(namespace string) []promv1.Rule {
var recordRules []promv1.Rule
for _, rrd := range recordingrules.GetRecordRulesDesc(namespace) {
recordRules = append(recordRules, generateRecordRule(rrd.Opts.Name, rrd.Expr))
}
return recordRules
}
func getAlertRules(runbookURLTemplate string) []promv1.Rule {
return []promv1.Rule{
generateAlertRule(
"CDIOperatorDown",
"kubevirt_cdi_operator_up == 0",
promv1.Duration("5m"),
map[string]string{
"summary": "CDI operator is down",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIOperatorDown"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "critical",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDINotReady",
"kubevirt_cdi_cr_ready == 0",
promv1.Duration("5m"),
map[string]string{
"summary": "CDI is not available to use",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINotReady"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "critical",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIDataVolumeUnusualRestartCount",
`kubevirt_cdi_import_pods_high_restart > 0 or
kubevirt_cdi_upload_pods_high_restart > 0 or
kubevirt_cdi_clone_pods_high_restart > 0`,
promv1.Duration("5m"),
map[string]string{
"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataVolumeUnusualRestartCount"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIStorageProfilesIncomplete",
`sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`,
promv1.Duration("5m"),
map[string]string{
"summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIStorageProfilesIncomplete"),
},
map[string]string{
severityAlertLabelKey: "info",
healthImpactAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIDataImportCronOutdated",
`sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0`,
promv1.Duration("15m"),
map[string]string{
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataImportCronOutdated"),
},
map[string]string{
severityAlertLabelKey: "info",
healthImpactAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDINoDefaultStorageClass",
`sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) +
(count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`,
promv1.Duration("5m"),
map[string]string{
"summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINoDefaultStorageClass"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIMultipleDefaultVirtStorageClasses",
`sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`,
promv1.Duration("5m"),
map[string]string{
"summary": "More than one default virtualization StorageClass detected",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIMultipleDefaultVirtStorageClasses"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIDefaultStorageClassDegraded",
`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`,
promv1.Duration("5m"),
map[string]string{
"summary": "Default storage class has no smart clone or ReadWriteMany",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDefaultStorageClassDegraded"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
}
}
func newPrometheusRule(namespace string) *promv1.PrometheusRule { func newPrometheusRule(namespace string) *promv1.PrometheusRule {
runbookURLTemplate := getRunbookURLTemplate() promRule, err := rules.BuildPrometheusRule(namespace)
if err != nil {
panic(err)
}
return &promv1.PrometheusRule{ return &promv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: promRule.ObjectMeta,
Name: ruleName, Spec: promRule.Spec,
Namespace: namespace,
Labels: map[string]string{
common.CDIComponentLabel: "",
common.PrometheusLabelKey: common.PrometheusLabelValue,
},
},
Spec: promv1.PrometheusRuleSpec{
Groups: []promv1.RuleGroup{
{
Name: "cdi.rules",
Rules: append(getRecordRules(namespace), getAlertRules(runbookURLTemplate)...),
},
},
},
} }
} }
@ -381,23 +221,6 @@ func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor {
} }
} }
func generateAlertRule(alert, expr string, duration promv1.Duration, annotations, labels map[string]string) promv1.Rule {
return promv1.Rule{
Alert: alert,
Expr: intstr.FromString(expr),
For: &duration,
Annotations: annotations,
Labels: labels,
}
}
func generateRecordRule(record, expr string) promv1.Rule {
return promv1.Rule{
Record: record,
Expr: intstr.FromString(expr),
}
}
func (r *ReconcileCDI) watchPrometheusResources() error { func (r *ReconcileCDI) watchPrometheusResources() error {
listObjs := []client.ObjectList{ listObjs := []client.ObjectList{
&promv1.PrometheusRuleList{}, &promv1.PrometheusRuleList{},
@ -439,16 +262,3 @@ func (r *ReconcileCDI) watchPrometheusResources() error {
return nil return nil
} }
func getRunbookURLTemplate() string {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}
if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbook URL template must have exactly 1 %s substring"))
}
return runbookURLTemplate
}

View File

@ -3,15 +3,13 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
go_library( go_library(
name = "go_default_library", name = "go_default_library",
srcs = ["metricsdocs.go"], srcs = ["metricsdocs.go"],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/tools/metricsdocs", importpath = "kubevirt.io/containerized-data-importer/tools/metricsdocs",
visibility = ["//visibility:private"], visibility = ["//visibility:private"],
deps = [ deps = [
"//pkg/monitoring/metrics/cdi-controller:go_default_library", "//pkg/monitoring/metrics/cdi-controller:go_default_library",
"//pkg/monitoring/metrics/operator-controller:go_default_library", "//pkg/monitoring/metrics/operator-controller:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library", "//pkg/monitoring/rules:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/docs:go_default_library", "//vendor/github.com/machadovilaca/operator-observability/pkg/docs:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatormetrics:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
], ],
) )

View File

@ -0,0 +1,56 @@
package main
import (
"fmt"
"github.com/machadovilaca/operator-observability/pkg/docs"
cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
)
const tpl = `# Containerized Data Importer metrics
{{- range . }}
{{ $deprecatedVersion := "" -}}
{{- with index .ExtraFields "DeprecatedVersion" -}}
{{- $deprecatedVersion = printf " in %s" . -}}
{{- end -}}
{{- $stabilityLevel := "" -}}
{{- if and (.ExtraFields.StabilityLevel) (ne .ExtraFields.StabilityLevel "STABLE") -}}
{{- $stabilityLevel = printf "[%s%s] " .ExtraFields.StabilityLevel $deprecatedVersion -}}
{{- end -}}
### {{ .Name }}
{{ print $stabilityLevel }}{{ .Help }}. Type: {{ .Type -}}.
{{- end }}
## Developing new metrics
All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones please regenerate
this document.
`
// main registers the operator metrics, the CDI controller metrics and the
// monitoring rules, then prints the generated metrics documentation to
// standard output. Any setup failure aborts the program with a panic.
func main() {
	if err := operatorMetrics.SetupMetrics(); err != nil {
		panic(err)
	}

	if err := cdiMetrics.SetupMetrics(); err != nil {
		panic(err)
	}

	// The namespace only parameterizes the rules being set up; "test" is a
	// placeholder value for documentation generation.
	if err := rules.SetupRules("test"); err != nil {
		panic(err)
	}

	registeredMetrics := operatorMetrics.ListMetrics()
	recordingRules := rules.ListRecordingRules()
	fmt.Print(docs.BuildMetricsDocsWithCustomTemplate(registeredMetrics, recordingRules, tpl))
}

View File

@ -3,12 +3,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
go_library( go_library(
name = "go_default_library", name = "go_default_library",
srcs = ["metrics_json_generator.go"], srcs = ["metrics_json_generator.go"],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/tools/prom-metrics-collector", importpath = "kubevirt.io/containerized-data-importer/tools/prom-metrics-collector",
visibility = ["//visibility:private"], visibility = ["//visibility:private"],
deps = [ deps = [
"//pkg/monitoring/metrics/cdi-controller:go_default_library", "//pkg/monitoring/metrics/cdi-controller:go_default_library",
"//pkg/monitoring/metrics/operator-controller:go_default_library", "//pkg/monitoring/metrics/operator-controller:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library", "//pkg/monitoring/rules:go_default_library",
"//vendor/github.com/kubevirt/monitoring/pkg/metrics/parser:go_default_library", "//vendor/github.com/kubevirt/monitoring/pkg/metrics/parser:go_default_library",
], ],
) )

View File

@ -8,7 +8,7 @@ import (
"github.com/kubevirt/monitoring/pkg/metrics/parser" "github.com/kubevirt/monitoring/pkg/metrics/parser"
cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller" cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller" operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules" "kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
) )
// This should be used only for very rare cases where the naming conventions that are explained in the best practices: // This should be used only for very rare cases where the naming conventions that are explained in the best practices:
@ -27,6 +27,10 @@ func main() {
panic(err) panic(err)
} }
if err := rules.SetupRules("test"); err != nil {
panic(err)
}
var metricFamilies []parser.Metric var metricFamilies []parser.Metric
metricsList := operatorMetrics.ListMetrics() metricsList := operatorMetrics.ListMetrics()
@ -40,13 +44,15 @@ func main() {
} }
} }
recordingRules := recordingrules.GetRecordRulesDesc("") rulesList := rules.ListRecordingRules()
for _, r := range recordingRules { for _, r := range rulesList {
metricFamilies = append(metricFamilies, parser.Metric{ if _, isExcludedMetric := excludedMetrics[r.GetOpts().Name]; !isExcludedMetric {
Name: r.Opts.Name, metricFamilies = append(metricFamilies, parser.Metric{
Help: r.Opts.Help, Name: r.GetOpts().Name,
Type: strings.ToUpper(r.Opts.Type), Help: r.GetOpts().Help,
}) Type: strings.ToUpper(string(r.GetType())),
})
}
} }
jsonBytes, err := json.Marshal(metricFamilies) jsonBytes, err := json.Marshal(metricFamilies)

View File

@ -63,6 +63,8 @@ func BuildMetricsDocsWithCustomTemplate(
allDocs = append(allDocs, buildMetricsDocs(recordingRules)...) allDocs = append(allDocs, buildMetricsDocs(recordingRules)...)
} }
sortMetricsDocs(allDocs)
buf := bytes.NewBufferString("") buf := bytes.NewBufferString("")
err = tpl.Execute(buf, allDocs) err = tpl.Execute(buf, allDocs)
if err != nil { if err != nil {
@ -89,7 +91,6 @@ func buildMetricsDocs[T docOptions](items []T) []metricDocs {
ExtraFields: metricOpts.ExtraFields, ExtraFields: metricOpts.ExtraFields,
} }
} }
sortMetricsDocs(metricsDocs)
return metricsDocs return metricsDocs
} }

View File

@ -2,6 +2,7 @@ package operatormetrics
import ( import (
"fmt" "fmt"
"strings"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
@ -24,6 +25,16 @@ type CollectorResult struct {
Value float64 Value float64
} }
// hash returns the key under which the collector is tracked in the operator
// registry. It is built from the names of the collector's metrics, in order.
//
// Each name is followed by a "," delimiter so that different name lists can
// never produce the same key: without it, the metrics ("ab", "c") and
// ("a", "bc") would both hash to "abc" and be mistaken for one collector.
// NOTE(review): this relies on "," not being a legal character in a metric
// name, which holds for Prometheus metric names.
func (c Collector) hash() string {
	var sb strings.Builder

	for _, cm := range c.Metrics {
		sb.WriteString(cm.GetOpts().Name)
		sb.WriteString(",")
	}

	return sb.String()
}
func (c Collector) Describe(ch chan<- *prometheus.Desc) { func (c Collector) Describe(ch chan<- *prometheus.Desc) {
for _, cm := range c.Metrics { for _, cm := range c.Metrics {
cm.getCollector().Describe(ch) cm.getCollector().Describe(ch)

View File

@ -5,6 +5,10 @@ import (
) )
type RegistryFunc func(c prometheus.Collector) error type RegistryFunc func(c prometheus.Collector) error
type UnregisterFunc func(c prometheus.Collector) bool
// Register is the function used to register metrics and collectors by this package. // Register is the function used to register metrics and collectors by this package.
var Register RegistryFunc = prometheus.Register var Register RegistryFunc = prometheus.Register
// Unregister is the function used to unregister metrics and collectors by this package.
var Unregister UnregisterFunc = prometheus.Unregister

View File

@ -1,15 +1,24 @@
package operatormetrics package operatormetrics
import (
"cmp"
"fmt"
"slices"
)
var operatorRegistry = newRegistry() var operatorRegistry = newRegistry()
type operatorRegisterer struct { type operatorRegisterer struct {
registeredMetrics map[string]Metric registeredMetrics map[string]Metric
registeredCollectors map[string]Collector
registeredCollectorMetrics map[string]Metric registeredCollectorMetrics map[string]Metric
} }
func newRegistry() operatorRegisterer { func newRegistry() operatorRegisterer {
return operatorRegisterer{ return operatorRegisterer{
registeredMetrics: map[string]Metric{}, registeredMetrics: map[string]Metric{},
registeredCollectors: map[string]Collector{},
registeredCollectorMetrics: map[string]Metric{}, registeredCollectorMetrics: map[string]Metric{},
} }
} }
@ -18,11 +27,17 @@ func newRegistry() operatorRegisterer {
func RegisterMetrics(allMetrics ...[]Metric) error { func RegisterMetrics(allMetrics ...[]Metric) error {
for _, metricList := range allMetrics { for _, metricList := range allMetrics {
for _, metric := range metricList { for _, metric := range metricList {
err := Register(metric.getCollector()) if metricExists(metric) {
err := unregisterMetric(metric)
if err != nil {
return err
}
}
err := registerMetric(metric)
if err != nil { if err != nil {
return err return err
} }
operatorRegistry.registeredMetrics[metric.GetOpts().Name] = metric
} }
} }
@ -32,13 +47,16 @@ func RegisterMetrics(allMetrics ...[]Metric) error {
// RegisterCollector registers the collector with the Prometheus registry. // RegisterCollector registers the collector with the Prometheus registry.
func RegisterCollector(collectors ...Collector) error { func RegisterCollector(collectors ...Collector) error {
for _, collector := range collectors { for _, collector := range collectors {
err := Register(collector) if collectorExists(collector) {
if err != nil { err := unregisterCollector(collector)
return err if err != nil {
return err
}
} }
for _, cm := range collector.Metrics { err := registerCollector(collector)
operatorRegistry.registeredCollectorMetrics[cm.GetOpts().Name] = cm if err != nil {
return err
} }
} }
@ -57,5 +75,92 @@ func ListMetrics() []Metric {
result = append(result, rc) result = append(result, rc)
} }
slices.SortFunc(result, func(a, b Metric) int {
return cmp.Compare(a.GetOpts().Name, b.GetOpts().Name)
})
return result return result
} }
// CleanRegistry unregisters every metric and every collector currently
// tracked by the operator registry. It stops at, and returns, the first
// unregistration error; entries processed before that point stay removed.
func CleanRegistry() error {
	for _, m := range operatorRegistry.registeredMetrics {
		if err := unregisterMetric(m); err != nil {
			return err
		}
	}

	for _, c := range operatorRegistry.registeredCollectors {
		if err := unregisterCollector(c); err != nil {
			return err
		}
	}

	return nil
}
// metricExists reports whether a metric with the same name is already
// tracked in the operator registry.
func metricExists(metric Metric) bool {
	if _, registered := operatorRegistry.registeredMetrics[metric.GetOpts().Name]; registered {
		return true
	}

	return false
}
// unregisterMetric removes the metric through the package-level Unregister
// function and, on success, drops it from the operator registry. An error is
// returned when Unregister reports failure; the registry is left untouched
// in that case.
func unregisterMetric(metric Metric) error {
	if !Unregister(metric.getCollector()) {
		return fmt.Errorf("failed to unregister from Prometheus client metric %s", metric.GetOpts().Name)
	}

	delete(operatorRegistry.registeredMetrics, metric.GetOpts().Name)

	return nil
}
// registerMetric registers the metric through the package-level Register
// function and records it in the operator registry under its name. The
// registry is only updated when registration succeeds.
func registerMetric(metric Metric) error {
	if err := Register(metric.getCollector()); err != nil {
		return err
	}

	operatorRegistry.registeredMetrics[metric.GetOpts().Name] = metric

	return nil
}
// collectorExists reports whether a collector with the same hash is already
// tracked in the operator registry.
func collectorExists(collector Collector) bool {
	if _, registered := operatorRegistry.registeredCollectors[collector.hash()]; registered {
		return true
	}

	return false
}
// unregisterCollector removes the collector through the package-level
// Unregister function. On success the collector and each of its metrics are
// dropped from the operator registry; on failure an error listing the
// collector's metric names is returned and the registry is left untouched.
func unregisterCollector(collector Collector) error {
	if !Unregister(collector) {
		return fmt.Errorf("failed to unregister from Prometheus client collector with metrics: %s", buildCollectorMetricListString(collector))
	}

	delete(operatorRegistry.registeredCollectors, collector.hash())
	for _, cm := range collector.Metrics {
		delete(operatorRegistry.registeredCollectorMetrics, cm.GetOpts().Name)
	}

	return nil
}
// registerCollector registers the collector through the package-level
// Register function. On success it records the collector under its hash and
// indexes each of its metrics by name in the operator registry.
func registerCollector(collector Collector) error {
	if err := Register(collector); err != nil {
		return err
	}

	operatorRegistry.registeredCollectors[collector.hash()] = collector
	for i := range collector.Metrics {
		m := collector.Metrics[i]
		operatorRegistry.registeredCollectorMetrics[m.GetOpts().Name] = m
	}

	return nil
}
// buildCollectorMetricListString returns the names of the collector's
// metrics joined by ", ", for use in error messages. A collector without
// metrics yields the empty string.
//
// The separator is written before every name except the first, so there is
// no trailing ", " to trim. Trimming the last two bytes of an empty string
// would panic with a slice-bounds error when the collector has no metrics.
func buildCollectorMetricListString(collector Collector) string {
	var list []byte

	for i, metric := range collector.Metrics {
		if i > 0 {
			list = append(list, ", "...)
		}
		list = append(list, metric.GetOpts().Name...)
	}

	return string(list)
}

View File

@ -1,7 +1,9 @@
package operatorrules package operatorrules
import ( import (
"cmp"
"fmt" "fmt"
"slices"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -42,7 +44,7 @@ func buildPrometheusRuleSpec() (*promv1.PrometheusRuleSpec, error) {
if len(operatorRegistry.registeredAlerts) != 0 { if len(operatorRegistry.registeredAlerts) != 0 {
groups = append(groups, promv1.RuleGroup{ groups = append(groups, promv1.RuleGroup{
Name: "alerts.rules", Name: "alerts.rules",
Rules: buildAlertsRules(), Rules: ListAlerts(),
}) })
} }
@ -63,11 +65,9 @@ func buildRecordingRulesRules() []promv1.Rule {
}) })
} }
return rules slices.SortFunc(rules, func(a, b promv1.Rule) int {
} return cmp.Compare(a.Record, b.Record)
})
func buildAlertsRules() []promv1.Rule {
var rules []promv1.Rule
rules = append(rules, operatorRegistry.registeredAlerts...)
return rules return rules
} }

View File

@ -1,26 +1,32 @@
package operatorrules package operatorrules
import ( import (
"cmp"
"slices"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
) )
var operatorRegistry = newRegistry() var operatorRegistry = newRegistry()
type operatorRegisterer struct { type operatorRegisterer struct {
registeredRecordingRules []RecordingRule registeredRecordingRules map[string]RecordingRule
registeredAlerts []promv1.Rule registeredAlerts map[string]promv1.Rule
} }
func newRegistry() operatorRegisterer { func newRegistry() operatorRegisterer {
return operatorRegisterer{ return operatorRegisterer{
registeredRecordingRules: []RecordingRule{}, registeredRecordingRules: map[string]RecordingRule{},
registeredAlerts: map[string]promv1.Rule{},
} }
} }
// RegisterRecordingRules registers the given recording rules. // RegisterRecordingRules registers the given recording rules.
func RegisterRecordingRules(recordingRules ...[]RecordingRule) error { func RegisterRecordingRules(recordingRules ...[]RecordingRule) error {
for _, recordingRuleList := range recordingRules { for _, recordingRuleList := range recordingRules {
operatorRegistry.registeredRecordingRules = append(operatorRegistry.registeredRecordingRules, recordingRuleList...) for _, recordingRule := range recordingRuleList {
operatorRegistry.registeredRecordingRules[recordingRule.MetricsOpts.Name] = recordingRule
}
} }
return nil return nil
@ -29,7 +35,9 @@ func RegisterRecordingRules(recordingRules ...[]RecordingRule) error {
// RegisterAlerts registers the given alerts. // RegisterAlerts registers the given alerts.
func RegisterAlerts(alerts ...[]promv1.Rule) error { func RegisterAlerts(alerts ...[]promv1.Rule) error {
for _, alertList := range alerts { for _, alertList := range alerts {
operatorRegistry.registeredAlerts = append(operatorRegistry.registeredAlerts, alertList...) for _, alert := range alertList {
operatorRegistry.registeredAlerts[alert.Alert] = alert
}
} }
return nil return nil
@ -37,10 +45,34 @@ func RegisterAlerts(alerts ...[]promv1.Rule) error {
// ListRecordingRules returns the registered recording rules. // ListRecordingRules returns the registered recording rules.
func ListRecordingRules() []RecordingRule { func ListRecordingRules() []RecordingRule {
return operatorRegistry.registeredRecordingRules var rules []RecordingRule
for _, rule := range operatorRegistry.registeredRecordingRules {
rules = append(rules, rule)
}
slices.SortFunc(rules, func(a, b RecordingRule) int {
return cmp.Compare(a.GetOpts().Name, b.GetOpts().Name)
})
return rules
} }
// ListAlerts returns the registered alerts. // ListAlerts returns the registered alerts.
func ListAlerts() []promv1.Rule { func ListAlerts() []promv1.Rule {
return operatorRegistry.registeredAlerts var alerts []promv1.Rule
for _, alert := range operatorRegistry.registeredAlerts {
alerts = append(alerts, alert)
}
slices.SortFunc(alerts, func(a, b promv1.Rule) int {
return cmp.Compare(a.Alert, b.Alert)
})
return alerts
}
// CleanRegistry removes all registered rules and alerts.
func CleanRegistry() error {
operatorRegistry = newRegistry()
return nil
} }

4
vendor/modules.txt vendored
View File

@ -357,8 +357,8 @@ github.com/kubernetes-csi/lib-volume-populator/populator-machinery
# github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a # github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a
## explicit; go 1.20 ## explicit; go 1.20
github.com/kubevirt/monitoring/pkg/metrics/parser github.com/kubevirt/monitoring/pkg/metrics/parser
# github.com/machadovilaca/operator-observability v0.0.9 # github.com/machadovilaca/operator-observability v0.0.13
## explicit; go 1.20 ## explicit; go 1.21
github.com/machadovilaca/operator-observability/pkg/docs github.com/machadovilaca/operator-observability/pkg/docs
github.com/machadovilaca/operator-observability/pkg/operatormetrics github.com/machadovilaca/operator-observability/pkg/operatormetrics
github.com/machadovilaca/operator-observability/pkg/operatorrules github.com/machadovilaca/operator-observability/pkg/operatorrules