Refactor recording-rules and alerts code (#3068)

* Refactor recording-rules and alerts code

Signed-off-by: avlitman <alitman@redhat.com>

* Remove promv1 from schema

Signed-off-by: avlitman <alitman@redhat.com>

---------

Signed-off-by: avlitman <alitman@redhat.com>
This commit is contained in:
Aviv Litman 2024-02-18 17:05:42 +02:00 committed by GitHub
parent 24c9eb5706
commit 42ec627e35
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
38 changed files with 635 additions and 401 deletions

View File

@ -156,7 +156,7 @@ openshift-ci-image-push:
./hack/build/osci-image-builder.sh
generate-doc: build-docgen
_out/pkg/monitoring/tools/metricsdocs/metricsdocs > doc/metrics.md
_out/tools/metricsdocs/metricsdocs > doc/metrics.md
bootstrap-ginkgo:
${DO_BAZ} ./hack/build/bootstrap-ginkgo.sh

View File

@ -1,5 +1,8 @@
# Containerized Data Importer metrics
### kubevirt_cdi_clone_pods_high_restart
The number of CDI clone pods with high restart count. Type: Gauge.
### kubevirt_cdi_cr_ready
CDI install ready. Type: Gauge.
@ -9,18 +12,15 @@ DataImportCron has an outdated import. Type: Gauge.
### kubevirt_cdi_datavolume_pending
Number of DataVolumes pending for default storage class to be configured. Type: Gauge.
### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge.
### kubevirt_cdi_clone_pods_high_restart
The number of CDI clone pods with high restart count. Type: Gauge.
### kubevirt_cdi_import_pods_high_restart
The number of CDI import pods with high restart count. Type: Gauge.
### kubevirt_cdi_operator_up
CDI operator status. Type: Gauge.
### kubevirt_cdi_storageprofile_info
`StorageProfiles` info labels: `storageclass`, `provisioner`, `complete` indicates if all storage profiles recommended PVC settings are complete, `default` indicates if it's the Kubernetes default storage class, `virtdefault` indicates if it's the default virtualization storage class, `rwx` indicates if the storage class supports `ReadWriteMany`, `smartclone` indicates if it supports snapshot or CSI based clone. Type: Gauge.
### kubevirt_cdi_upload_pods_high_restart
The number of CDI upload server pods with high restart count. Type: Gauge.

2
go.mod
View File

@ -23,7 +23,7 @@ require (
github.com/kubernetes-csi/external-snapshotter/client/v6 v6.0.1
github.com/kubernetes-csi/lib-volume-populator v1.2.1-0.20230316163120-b62a0eee2c56
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a
github.com/machadovilaca/operator-observability v0.0.9
github.com/machadovilaca/operator-observability v0.0.13
github.com/onsi/ginkgo/v2 v2.12.0
github.com/onsi/gomega v1.27.10
github.com/openshift/api v0.0.0-20240116035456-11ed2fbcb805

4
go.sum
View File

@ -1099,8 +1099,8 @@ github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4
github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a/go.mod h1:qGj2agzgwQ27nYhP3xhLs+IBzE5+ALNUg8bDfMcwPqo=
github.com/lyft/protoc-gen-star v0.6.0/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/lyft/protoc-gen-star v0.6.1/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/machadovilaca/operator-observability v0.0.9 h1:jL2jVh0YJNA3nSX216X74RDZiEPPvsgqXollYmMOQkg=
github.com/machadovilaca/operator-observability v0.0.9/go.mod h1:NGkaR3HEYLScVQf6kQAyxWOSN1ltHcsEvHU/8iIJ8cE=
github.com/machadovilaca/operator-observability v0.0.13 h1:9mhxEjkdE6pcl3ke8chbbAWxx25+K1m4Gq31yo+r2JU=
github.com/machadovilaca/operator-observability v0.0.13/go.mod h1:e4Z3VhOXb9InkmSh00JjqBBijE+iD+YMzynBpKB3+gE=
github.com/magiconair/properties v1.8.1/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mailru/easyjson v0.0.0-20180823135443-60711f1a8329/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=
github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc=

View File

@ -29,10 +29,10 @@ mkdir -p ${CMD_OUT_DIR}/dump
bazel build \
--verbose_failures \
--config=${ARCHITECTURE} \
//pkg/monitoring/tools/metricsdocs/...
//tools/metricsdocs/...
rm -rf _out/pkg/monitoring/tools/metricsdocs
mkdir -p _out/pkg/monitoring/tools/metricsdocs
cp ./bazel-bin/pkg/monitoring/tools/metricsdocs/metricsdocs_/metricsdocs _out/pkg/monitoring/tools/metricsdocs/
rm -rf _out/tools/metricsdocs
mkdir -p _out/tools/metricsdocs
cp ./bazel-bin/tools/metricsdocs/metricsdocs_/metricsdocs _out/tools/metricsdocs/
bazel clean

View File

@ -23,7 +23,7 @@ set -e
linter_image_tag="v0.0.1"
PROJECT_ROOT="$(readlink -e "$(dirname "${BASH_SOURCE[0]}")"/../../)"
export METRICS_COLLECTOR_PATH="${METRICS_COLLECTOR_PATH:-${PROJECT_ROOT}/pkg/monitoring/tools/prom-metrics-collector}"
export METRICS_COLLECTOR_PATH="${METRICS_COLLECTOR_PATH:-${PROJECT_ROOT}/tools/prom-metrics-collector}"
if [[ ! -d "$METRICS_COLLECTOR_PATH" ]]; then
echo "Invalid METRICS_COLLECTOR_PATH: $METRICS_COLLECTOR_PATH is not a valid directory path"

View File

@ -24,7 +24,7 @@ import (
"k8s.io/apimachinery/pkg/types"
cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
"kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
metrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
)
const (

View File

@ -1,4 +1,4 @@
package metrics
package cdicontroller
import (
"github.com/prometheus/client_golang/prometheus"

View File

@ -1,4 +1,4 @@
package metrics
package cdicontroller
import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics
package cdicontroller
import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics
package cdicontroller
import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics
package operatorcontroller
import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -1,4 +1,4 @@
package metrics
package operatorcontroller
import (
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"

View File

@ -0,0 +1,15 @@
# Bazel build definition for the Go package
# kubevirt.io/containerized-data-importer/pkg/monitoring/rules.
load("@io_bazel_rules_go//go:def.bzl", "go_library")
# Library target built from rules.go; it depends on the alerts and
# recordingrules sub-packages plus the vendored operatorrules and
# prometheus-operator monitoring/v1 packages.
go_library(
name = "go_default_library",
srcs = ["rules.go"],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules",
visibility = ["//visibility:public"],
deps = [
"//pkg/common:go_default_library",
"//pkg/monitoring/rules/alerts:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
],
)

View File

@ -0,0 +1,18 @@
# Bazel build definition for the Go package
# kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts.
load("@io_bazel_rules_go//go:def.bzl", "go_library")
# Library target built from operator.go (alert definitions) and
# prometheus.go (alert registration and runbook URL handling).
go_library(
name = "go_default_library",
srcs = [
"operator.go",
"prometheus.go",
],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts",
visibility = ["//visibility:public"],
deps = [
"//pkg/common:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
"//vendor/k8s.io/utils/ptr:go_default_library",
],
)

View File

@ -0,0 +1,109 @@
package alerts
import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)
// operatorAlerts is the list of CDI alerting rules.
//
// Each entry carries only its alert-specific data: name, PromQL expression,
// pending duration, a "summary" annotation, and the severity and
// operator_health_impact labels. Register (in this package) adds the
// part-of/component labels and the runbook_url annotation to every entry
// before handing the list to operatorrules.RegisterAlerts, so the Labels and
// Annotations maps are mutated in place.
var operatorAlerts = []promv1.Rule{
{
Alert: "CDIOperatorDown",
Expr: intstr.FromString("kubevirt_cdi_operator_up == 0"),
// ptr.To yields a pointer to the string value, which is converted to the
// *promv1.Duration expected by the For field.
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "CDI operator is down",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "critical",
},
},
{
Alert: "CDINotReady",
Expr: intstr.FromString("kubevirt_cdi_cr_ready == 0"),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "CDI is not available to use",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "critical",
},
},
{
// Combines the three *_pods_high_restart recording rules (import, upload, clone).
Alert: "CDIDataVolumeUnusualRestartCount",
Expr: intstr.FromString("kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0"),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "warning",
},
},
{
// Aggregated by storageclass and provisioner, so the summary can reference
// {{ $labels.storageclass }}.
Alert: "CDIStorageProfilesIncomplete",
Expr: intstr.FromString(`sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
},
Labels: map[string]string{
severityAlertLabelKey: "info",
operatorHealthImpactLabelKey: "warning",
},
},
{
// The only alert in this list with a 15m pending duration; the rest use 5m.
Alert: "CDIDataImportCronOutdated",
Expr: intstr.FromString("sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0"),
For: (*promv1.Duration)(ptr.To("15m")),
Annotations: map[string]string{
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
},
Labels: map[string]string{
severityAlertLabelKey: "info",
operatorHealthImpactLabelKey: "warning",
},
},
{
// The expression adds three terms, each falling back to vector(0) when it
// has no series, and compares the total with 0.
Alert: "CDINoDefaultStorageClass",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) +
(count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "none",
},
},
{
Alert: "CDIMultipleDefaultVirtStorageClasses",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "More than one default virtualization StorageClass detected",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "none",
},
},
{
// Counts default and virt-default storage profiles that have both
// rwx="true" and smartclone="true"; the total is compared with 0.
Alert: "CDIDefaultStorageClassDegraded",
Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`),
For: (*promv1.Duration)(ptr.To("5m")),
Annotations: map[string]string{
"summary": "Default storage class has no smart clone or ReadWriteMany",
},
Labels: map[string]string{
severityAlertLabelKey: "warning",
operatorHealthImpactLabelKey: "none",
},
},
}

View File

@ -0,0 +1,58 @@
package alerts
import (
"errors"
"fmt"
"os"
"strings"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"kubevirt.io/containerized-data-importer/pkg/common"
)
// Annotation/label keys and values shared by all CDI alerts, plus the
// configuration of the runbook URL template.
const (
// prometheusRunbookAnnotationKey is the annotation Register sets to the alert's runbook URL.
prometheusRunbookAnnotationKey = "runbook_url"
// defaultRunbookURLTemplate is used by GetRunbookURLTemplate when the
// runbookURLTemplateEnv environment variable is not set. The single %s is
// replaced with the alert name.
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
// runbookURLTemplateEnv names the environment variable that overrides the default template.
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
// Label keys set per alert in the alert definitions.
severityAlertLabelKey = "severity"
operatorHealthImpactLabelKey = "operator_health_impact"
// Label keys and values that Register applies to every alert.
partOfAlertLabelKey = "kubernetes_operator_part_of"
componentAlertLabelKey = "kubernetes_operator_component"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelValue = common.CDILabelValue
)
// Register decorates every CDI alert with the common part-of and component
// labels and a runbook_url annotation, then registers all alert groups with
// operatorrules.RegisterAlerts and returns its error.
//
// The runbook URL is built by formatting the template returned by
// GetRunbookURLTemplate with the alert name; GetRunbookURLTemplate panics if
// the template is malformed.
//
// The namespace parameter is not used by this function.
func Register(namespace string) error {
	alerts := [][]promv1.Rule{
		operatorAlerts,
	}

	runbookURLTemplate := GetRunbookURLTemplate()
	for _, alertGroup := range alerts {
		// Iterate by index: a map allocated below must be stored on the rule
		// in the slice, not on a loop-local copy of the struct.
		for i := range alertGroup {
			alert := &alertGroup[i]

			// A rule declared without Labels or Annotations has nil maps, and
			// writing to a nil map panics, so allocate them on demand.
			if alert.Labels == nil {
				alert.Labels = make(map[string]string, 2)
			}
			if alert.Annotations == nil {
				alert.Annotations = make(map[string]string, 1)
			}

			alert.Labels[partOfAlertLabelKey] = partOfAlertLabelValue
			alert.Labels[componentAlertLabelKey] = componentAlertLabelValue
			alert.Annotations[prometheusRunbookAnnotationKey] = fmt.Sprintf(runbookURLTemplate, alert.Alert)
		}
	}

	return operatorrules.RegisterAlerts(alerts...)
}
// GetRunbookURLTemplate fetches or defaults the runbook URL template.
func GetRunbookURLTemplate() string {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}
if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbook URL template must have exactly 1 %s substring"))
}
return runbookURLTemplate
}

View File

@ -2,8 +2,17 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
go_library(
name = "go_default_library",
srcs = ["recordingrules.go"],
srcs = [
"operator.go",
"pods.go",
"recordingrules.go",
],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules",
visibility = ["//visibility:public"],
deps = ["//pkg/common:go_default_library"],
deps = [
"//pkg/common:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatormetrics:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
],
)

View File

@ -0,0 +1,24 @@
package recordingrules
import (
"fmt"
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
"k8s.io/apimachinery/pkg/util/intstr"
)
// operatorRecordingRules builds the recording rules that describe the CDI
// operator itself. The namespace is interpolated into the PromQL selector of
// the kubevirt_cdi_operator_up rule.
func operatorRecordingRules(namespace string) []operatorrules.RecordingRule {
	upExpr := fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace)

	operatorUp := operatorrules.RecordingRule{
		MetricsOpts: operatormetrics.MetricOpts{
			Name: "kubevirt_cdi_operator_up",
			Help: "CDI operator status",
		},
		MetricType: operatormetrics.GaugeType,
		Expr:       intstr.FromString(upExpr),
	}

	return []operatorrules.RecordingRule{operatorUp}
}

View File

@ -0,0 +1,45 @@
package recordingrules
import (
"fmt"
"strconv"
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
"k8s.io/apimachinery/pkg/util/intstr"
"kubevirt.io/containerized-data-importer/pkg/common"
)
var podsRecordingRules = []operatorrules.RecordingRule{
{
MetricsOpts: operatormetrics.MetricOpts{
Name: "kubevirt_cdi_import_pods_high_restart",
Help: "The number of CDI import pods with high restart count",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
},
{
MetricsOpts: operatormetrics.MetricOpts{
Name: "kubevirt_cdi_upload_pods_high_restart",
Help: "The number of CDI upload server pods with high restart count",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
},
{
MetricsOpts: operatormetrics.MetricOpts{
Name: "kubevirt_cdi_clone_pods_high_restart",
Help: "The number of CDI clone pods with high restart count",
},
MetricType: operatormetrics.GaugeType,
Expr: intstr.FromString(
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s) or on() vector(0)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
),
},
}

View File

@ -1,59 +1,11 @@
package recordingrules
import (
"fmt"
"strconv"
import "github.com/machadovilaca/operator-observability/pkg/operatorrules"
"kubevirt.io/containerized-data-importer/pkg/common"
)
// MetricOpts represent CDI Prometheus Metrics
type MetricOpts struct {
Name string
Help string
Type string
}
// RecordRulesDesc represent CDI Prometheus Record Rules
type RecordRulesDesc struct {
Opts MetricOpts
Expr string
}
// GetRecordRulesDesc returns CDI Prometheus Record Rules
func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
return []RecordRulesDesc{
{
MetricOpts{
"kubevirt_cdi_operator_up",
"CDI operator status",
"Gauge",
},
fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace),
},
{
MetricOpts{
"kubevirt_cdi_import_pods_high_restart",
"The number of CDI import pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_upload_pods_high_restart",
"The number of CDI upload server pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s) or on() vector(0)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_clone_pods_high_restart",
"The number of CDI clone pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s) or on() vector(0)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
}
// Register sets up recording rules in the given namespace.
func Register(namespace string) error {
return operatorrules.RegisterRecordingRules(
operatorRecordingRules(namespace),
podsRecordingRules,
)
}

View File

@ -0,0 +1,49 @@
package rules
import (
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"kubevirt.io/containerized-data-importer/pkg/common"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules"
)
const (
// ruleName is the name given to the PrometheusRule object created by BuildPrometheusRule.
ruleName = "prometheus-cdi-rules"
)
// SetupRules registers the CDI recording rules and then the CDI alerts for
// the given namespace. The first registration error is returned and stops
// the setup.
func SetupRules(namespace string) error {
	err := recordingrules.Register(namespace)
	if err != nil {
		return err
	}

	return alerts.Register(namespace)
}
// BuildPrometheusRule asks operatorrules to build the PrometheusRule object
// named "prometheus-cdi-rules" in the given namespace, labelled with the CDI
// component label (empty value) and the Prometheus label key/value pair.
func BuildPrometheusRule(namespace string) (*promv1.PrometheusRule, error) {
	labels := map[string]string{
		common.CDIComponentLabel:  "",
		common.PrometheusLabelKey: common.PrometheusLabelValue,
	}

	return operatorrules.BuildPrometheusRule(ruleName, namespace, labels)
}
// ListRecordingRules returns the recording rules reported by
// operatorrules.ListRecordingRules. It is a thin wrapper so callers do not
// need to import operatorrules for listing.
func ListRecordingRules() []operatorrules.RecordingRule {
return operatorrules.ListRecordingRules()
}
// ListAlerts returns the alert rules reported by operatorrules.ListAlerts.
// It is a thin wrapper so callers do not need to import operatorrules for
// listing.
func ListAlerts() []promv1.Rule {
return operatorrules.ListAlerts()
}

View File

@ -1,81 +0,0 @@
package main
import (
"fmt"
"github.com/machadovilaca/operator-observability/pkg/docs"
"github.com/machadovilaca/operator-observability/pkg/operatormetrics"
"github.com/machadovilaca/operator-observability/pkg/operatorrules"
cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules"
)
// tpl is the text template passed to docs.BuildMetricsDocsWithCustomTemplate
// to render the metrics document. For every entry it prints a "### <name>"
// heading followed by the help text and type; when ExtraFields.StabilityLevel
// is set and is not "STABLE", the help text is prefixed with the bracketed
// stability level (plus " in <DeprecatedVersion>" when that field is present).
const tpl = `# Containerized Data Importer metrics
{{- range . }}
{{ $deprecatedVersion := "" -}}
{{- with index .ExtraFields "DeprecatedVersion" -}}
{{- $deprecatedVersion = printf " in %s" . -}}
{{- end -}}
{{- $stabilityLevel := "" -}}
{{- if and (.ExtraFields.StabilityLevel) (ne .ExtraFields.StabilityLevel "STABLE") -}}
{{- $stabilityLevel = printf "[%s%s] " .ExtraFields.StabilityLevel $deprecatedVersion -}}
{{- end -}}
### {{ .Name }}
{{ print $stabilityLevel }}{{ .Help }}. Type: {{ .Type -}}.
{{- end }}
## Developing new metrics
All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones please regenerate
this document.
`
// main sets up the operator and CDI controller metrics, collects the metrics
// and recording rules, renders them with tpl and prints the resulting
// document to standard output. Any setup error aborts the program with a
// panic.
func main() {
	if err := operatorMetrics.SetupMetrics(); err != nil {
		panic(err)
	}
	if err := cdiMetrics.SetupMetrics(); err != nil {
		panic(err)
	}

	allMetrics := operatorMetrics.ListMetrics()
	// The namespace only affects rule expressions, which the document does not
	// show, so an empty one is passed.
	allRules := convertToRecordingRules(recordingrules.GetRecordRulesDesc(""))

	fmt.Print(docs.BuildMetricsDocsWithCustomTemplate(allMetrics, allRules, tpl))
}
// convertToRecordingRules converts CDI record rule descriptions into
// operatorrules.RecordingRule values, copying only the name, help text and
// metric type. The rule expression is not carried over.
func convertToRecordingRules(recordRulesDesc []recordingrules.RecordRulesDesc) []operatorrules.RecordingRule {
	var converted []operatorrules.RecordingRule

	for i := range recordRulesDesc {
		opts := recordRulesDesc[i].Opts
		converted = append(converted, operatorrules.RecordingRule{
			MetricsOpts: operatormetrics.MetricOpts{
				Name: opts.Name,
				Help: opts.Help,
			},
			MetricType: convertRulesType(opts.Type),
		})
	}

	return converted
}
// convertRulesType maps the textual metric type of a record rule description
// to the corresponding operatormetrics.MetricType. Only "Gauge" is recognised;
// every other value yields the empty MetricType, so a new recording rule type
// needs its own case here.
func convertRulesType(metricType string) operatormetrics.MetricType {
	switch metricType {
	case "Gauge":
		return operatormetrics.GaugeType
	default:
		return ""
	}
}

View File

@ -24,7 +24,7 @@ go_library(
"//pkg/controller/common:go_default_library",
"//pkg/feature-gates:go_default_library",
"//pkg/monitoring/metrics/operator-controller:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library",
"//pkg/monitoring/rules:go_default_library",
"//pkg/operator:go_default_library",
"//pkg/operator/resources/cert:go_default_library",
"//pkg/operator/resources/cluster:go_default_library",
@ -59,7 +59,6 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/runtime:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/runtime/schema:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/intstr:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/strategicpatch:go_default_library",
"//vendor/k8s.io/apiserver/pkg/authentication/user:go_default_library",
"//vendor/k8s.io/client-go/kubernetes:go_default_library",
@ -94,6 +93,8 @@ go_test(
embed = [":go_default_library"],
deps = [
"//pkg/common:go_default_library",
"//pkg/monitoring/rules:go_default_library",
"//pkg/monitoring/rules/alerts:go_default_library",
"//pkg/operator/resources/cert:go_default_library",
"//pkg/operator/resources/cluster:go_default_library",
"//pkg/operator/resources/namespaced:go_default_library",

View File

@ -40,6 +40,7 @@ import (
cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
metrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
"kubevirt.io/containerized-data-importer/pkg/operator"
cdicerts "kubevirt.io/containerized-data-importer/pkg/operator/resources/cert"
cdicluster "kubevirt.io/containerized-data-importer/pkg/operator/resources/cluster"
@ -129,6 +130,11 @@ func newReconciler(mgr manager.Manager) (*ReconcileCDI, error) {
return nil, err
}
err = rules.SetupRules(namespace)
if err != nil {
return nil, err
}
recorder := mgr.GetEventRecorderFor("operator-controller")
r := &ReconcileCDI{

View File

@ -25,9 +25,8 @@ import (
"strconv"
"strings"
"kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/callbacks"
sdkapi "kubevirt.io/controller-lifecycle-operator-sdk/api"
"kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/callbacks"
sdkr "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk/reconciler"
. "github.com/onsi/ginkgo/v2"
@ -57,6 +56,8 @@ import (
cdiv1 "kubevirt.io/containerized-data-importer-api/pkg/apis/core/v1beta1"
"kubevirt.io/containerized-data-importer/pkg/common"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/alerts"
clusterResources "kubevirt.io/containerized-data-importer/pkg/operator/resources/cluster"
namespaceResources "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced"
utils "kubevirt.io/containerized-data-importer/pkg/operator/resources/utils"
@ -285,7 +286,7 @@ var _ = Describe("Controller", func() {
doReconcile(args)
Expect(setDeploymentsReady(args)).To(BeTrue())
runbookURLTemplate := getRunbookURLTemplate()
runbookURLTemplate := alerts.GetRunbookURLTemplate()
rule := &promv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
@ -313,7 +314,7 @@ var _ = Describe("Controller", func() {
},
}
Expect(rule.Spec.Groups[0].Rules).To(ContainElement(cdiDownAlert))
Expect(rule.Spec.Groups[1].Rules).To(ContainElement(cdiDownAlert))
Expect(rule.Labels[common.AppKubernetesPartOfLabel]).To(Equal("testing"))
validateEvents(args.reconciler, createReadyEventValidationMap())
})
@ -1737,6 +1738,11 @@ func createReconciler(client client.Client) *ReconcileCDI {
Namespace: namespace,
}
err := rules.SetupRules(namespace)
if err != nil {
panic(err)
}
recorder := record.NewFakeRecorder(250)
r := &ReconcileCDI{
client: client,

View File

@ -18,11 +18,9 @@ package controller
import (
"context"
"errors"
"fmt"
"os"
"reflect"
"strings"
"github.com/go-logr/logr"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@ -31,14 +29,13 @@ import (
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/intstr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/source"
"kubevirt.io/containerized-data-importer/pkg/common"
cc "kubevirt.io/containerized-data-importer/pkg/controller/common"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
cdinamespaced "kubevirt.io/containerized-data-importer/pkg/operator/resources/namespaced"
"kubevirt.io/containerized-data-importer/pkg/util"
@ -52,12 +49,6 @@ const (
defaultMonitoringNs = "monitoring"
defaultRunbookURLTemplate = "https://kubevirt.io/monitoring/runbooks/%s"
runbookURLTemplateEnv = "RUNBOOK_URL_TEMPLATE"
severityAlertLabelKey = "severity"
healthImpactAlertLabelKey = "operator_health_impact"
partOfAlertLabelKey = "kubernetes_operator_part_of"
partOfAlertLabelValue = "kubevirt"
componentAlertLabelKey = "kubernetes_operator_component"
componentAlertLabelValue = common.CDILabelValue
)
func ensurePrometheusResourcesExist(ctx context.Context, c client.Client, scheme *runtime.Scheme, owner metav1.Object) error {
@ -134,166 +125,15 @@ func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string)
return true, nil
}
// getRecordRules turns the CDI record rule descriptions for the given
// namespace into Prometheus recording rules (record name plus expression).
func getRecordRules(namespace string) []promv1.Rule {
	var out []promv1.Rule

	descs := recordingrules.GetRecordRulesDesc(namespace)
	for i := range descs {
		out = append(out, generateRecordRule(descs[i].Opts.Name, descs[i].Expr))
	}

	return out
}
// getAlertRules returns the CDI alerting rules.
//
// runbookURLTemplate is a format string with one %s verb; for each alert it is
// formatted with the alert name to produce the "runbook_url" annotation.
// Every alert carries the severity, operator health impact, part-of and
// component labels. All alerts use a 5m pending duration except
// CDIDataImportCronOutdated, which uses 15m.
func getAlertRules(runbookURLTemplate string) []promv1.Rule {
return []promv1.Rule{
generateAlertRule(
"CDIOperatorDown",
"kubevirt_cdi_operator_up == 0",
promv1.Duration("5m"),
map[string]string{
"summary": "CDI operator is down",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIOperatorDown"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "critical",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDINotReady",
"kubevirt_cdi_cr_ready == 0",
promv1.Duration("5m"),
map[string]string{
"summary": "CDI is not available to use",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINotReady"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "critical",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIDataVolumeUnusualRestartCount",
`kubevirt_cdi_import_pods_high_restart > 0 or
kubevirt_cdi_upload_pods_high_restart > 0 or
kubevirt_cdi_clone_pods_high_restart > 0`,
promv1.Duration("5m"),
map[string]string{
"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataVolumeUnusualRestartCount"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIStorageProfilesIncomplete",
`sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`,
promv1.Duration("5m"),
map[string]string{
"summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIStorageProfilesIncomplete"),
},
map[string]string{
severityAlertLabelKey: "info",
healthImpactAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIDataImportCronOutdated",
`sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0`,
promv1.Duration("15m"),
map[string]string{
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataImportCronOutdated"),
},
map[string]string{
severityAlertLabelKey: "info",
healthImpactAlertLabelKey: "warning",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDINoDefaultStorageClass",
`sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) +
(count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`,
promv1.Duration("5m"),
map[string]string{
"summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDINoDefaultStorageClass"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIMultipleDefaultVirtStorageClasses",
`sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`,
promv1.Duration("5m"),
map[string]string{
"summary": "More than one default virtualization StorageClass detected",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIMultipleDefaultVirtStorageClasses"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
generateAlertRule(
"CDIDefaultStorageClassDegraded",
`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`,
promv1.Duration("5m"),
map[string]string{
"summary": "Default storage class has no smart clone or ReadWriteMany",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDefaultStorageClassDegraded"),
},
map[string]string{
severityAlertLabelKey: "warning",
healthImpactAlertLabelKey: "none",
partOfAlertLabelKey: partOfAlertLabelValue,
componentAlertLabelKey: componentAlertLabelValue,
},
),
}
}
// newPrometheusRule returns the PrometheusRule object for the given namespace.
//
// NOTE(review): this excerpt looks like a rendered commit diff, so the body
// appears to interleave the pre-refactor lines (runbook template lookup and the
// inline ObjectMeta/Spec literals) with the post-refactor lines (the
// rules.BuildPrometheusRule call and the promRule.ObjectMeta/promRule.Spec
// fields) — confirm against the real file before relying on either variant.
func newPrometheusRule(namespace string) *promv1.PrometheusRule {
runbookURLTemplate := getRunbookURLTemplate()
promRule, err := rules.BuildPrometheusRule(namespace)
// A failure to build the rule is treated as fatal.
if err != nil {
panic(err)
}
return &promv1.PrometheusRule{
ObjectMeta: metav1.ObjectMeta{
Name: ruleName,
Namespace: namespace,
Labels: map[string]string{
common.CDIComponentLabel: "",
common.PrometheusLabelKey: common.PrometheusLabelValue,
},
},
Spec: promv1.PrometheusRuleSpec{
Groups: []promv1.RuleGroup{
{
Name: "cdi.rules",
Rules: append(getRecordRules(namespace), getAlertRules(runbookURLTemplate)...),
},
},
},
// Metadata and spec are taken from the rule built by the rules package.
ObjectMeta: promRule.ObjectMeta,
Spec: promRule.Spec,
}
}
@ -381,23 +221,6 @@ func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor {
}
}
// generateAlertRule assembles an alerting rule named alert that evaluates expr,
// must hold for duration before firing, and carries the given annotations and
// labels.
func generateAlertRule(alert, expr string, duration promv1.Duration, annotations, labels map[string]string) promv1.Rule {
	var rule promv1.Rule
	rule.Alert = alert
	rule.Expr = intstr.FromString(expr)
	// The rule keeps a pointer to this call's own copy of the duration.
	rule.For = &duration
	rule.Annotations = annotations
	rule.Labels = labels
	return rule
}
// generateRecordRule assembles a recording rule that stores the result of expr
// under the series name record.
func generateRecordRule(record, expr string) promv1.Rule {
	var rule promv1.Rule
	rule.Record = record
	rule.Expr = intstr.FromString(expr)
	return rule
}
func (r *ReconcileCDI) watchPrometheusResources() error {
listObjs := []client.ObjectList{
&promv1.PrometheusRuleList{},
@ -439,16 +262,3 @@ func (r *ReconcileCDI) watchPrometheusResources() error {
return nil
}
func getRunbookURLTemplate() string {
runbookURLTemplate, exists := os.LookupEnv(runbookURLTemplateEnv)
if !exists {
runbookURLTemplate = defaultRunbookURLTemplate
}
if strings.Count(runbookURLTemplate, "%s") != 1 {
panic(errors.New("runbook URL template must have exactly 1 %s substring"))
}
return runbookURLTemplate
}

View File

@ -3,15 +3,13 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
go_library(
name = "go_default_library",
srcs = ["metricsdocs.go"],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/tools/metricsdocs",
importpath = "kubevirt.io/containerized-data-importer/tools/metricsdocs",
visibility = ["//visibility:private"],
deps = [
"//pkg/monitoring/metrics/cdi-controller:go_default_library",
"//pkg/monitoring/metrics/operator-controller:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library",
"//pkg/monitoring/rules:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/docs:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatormetrics:go_default_library",
"//vendor/github.com/machadovilaca/operator-observability/pkg/operatorrules:go_default_library",
],
)

View File

@ -0,0 +1,56 @@
package main
import (
"fmt"
"github.com/machadovilaca/operator-observability/pkg/docs"
cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
)
// tpl is the template handed to docs.BuildMetricsDocsWithCustomTemplate to
// render the metrics document. For every entry it emits a "### <Name>" heading
// followed by the help text and the type. When an entry's StabilityLevel extra
// field is set and differs from "STABLE", the description is prefixed with
// "[<StabilityLevel>] ", extended with " in <DeprecatedVersion>" when that
// extra field is present.
const tpl = `# Containerized Data Importer metrics
{{- range . }}
{{ $deprecatedVersion := "" -}}
{{- with index .ExtraFields "DeprecatedVersion" -}}
{{- $deprecatedVersion = printf " in %s" . -}}
{{- end -}}
{{- $stabilityLevel := "" -}}
{{- if and (.ExtraFields.StabilityLevel) (ne .ExtraFields.StabilityLevel "STABLE") -}}
{{- $stabilityLevel = printf "[%s%s] " .ExtraFields.StabilityLevel $deprecatedVersion -}}
{{- end -}}
### {{ .Name }}
{{ print $stabilityLevel }}{{ .Help }}. Type: {{ .Type -}}.
{{- end }}
## Developing new metrics
All metrics documented here are auto-generated and reflect exactly what is being
exposed. After developing new metrics or changing old ones please regenerate
this document.
`
// main sets up the operator metrics, the CDI controller metrics and the rules
// (using the placeholder namespace "test"), then prints the generated metrics
// documentation to standard output. Any setup failure aborts with a panic.
func main() {
	if err := operatorMetrics.SetupMetrics(); err != nil {
		panic(err)
	}
	if err := cdiMetrics.SetupMetrics(); err != nil {
		panic(err)
	}
	if err := rules.SetupRules("test"); err != nil {
		panic(err)
	}

	fmt.Print(docs.BuildMetricsDocsWithCustomTemplate(
		operatorMetrics.ListMetrics(),
		rules.ListRecordingRules(),
		tpl,
	))
}

View File

@ -3,12 +3,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_binary", "go_library")
go_library(
name = "go_default_library",
srcs = ["metrics_json_generator.go"],
importpath = "kubevirt.io/containerized-data-importer/pkg/monitoring/tools/prom-metrics-collector",
importpath = "kubevirt.io/containerized-data-importer/tools/prom-metrics-collector",
visibility = ["//visibility:private"],
deps = [
"//pkg/monitoring/metrics/cdi-controller:go_default_library",
"//pkg/monitoring/metrics/operator-controller:go_default_library",
"//pkg/monitoring/rules/recordingrules:go_default_library",
"//pkg/monitoring/rules:go_default_library",
"//vendor/github.com/kubevirt/monitoring/pkg/metrics/parser:go_default_library",
],
)

View File

@ -8,7 +8,7 @@ import (
"github.com/kubevirt/monitoring/pkg/metrics/parser"
cdiMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/cdi-controller"
operatorMetrics "kubevirt.io/containerized-data-importer/pkg/monitoring/metrics/operator-controller"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules/recordingrules"
"kubevirt.io/containerized-data-importer/pkg/monitoring/rules"
)
// This should be used only for very rare cases where the naming conventions that are explained in the best practices:
@ -27,6 +27,10 @@ func main() {
panic(err)
}
if err := rules.SetupRules("test"); err != nil {
panic(err)
}
var metricFamilies []parser.Metric
metricsList := operatorMetrics.ListMetrics()
@ -40,13 +44,15 @@ func main() {
}
}
recordingRules := recordingrules.GetRecordRulesDesc("")
for _, r := range recordingRules {
metricFamilies = append(metricFamilies, parser.Metric{
Name: r.Opts.Name,
Help: r.Opts.Help,
Type: strings.ToUpper(r.Opts.Type),
})
rulesList := rules.ListRecordingRules()
for _, r := range rulesList {
if _, isExcludedMetric := excludedMetrics[r.GetOpts().Name]; !isExcludedMetric {
metricFamilies = append(metricFamilies, parser.Metric{
Name: r.GetOpts().Name,
Help: r.GetOpts().Help,
Type: strings.ToUpper(string(r.GetType())),
})
}
}
jsonBytes, err := json.Marshal(metricFamilies)

View File

@ -63,6 +63,8 @@ func BuildMetricsDocsWithCustomTemplate(
allDocs = append(allDocs, buildMetricsDocs(recordingRules)...)
}
sortMetricsDocs(allDocs)
buf := bytes.NewBufferString("")
err = tpl.Execute(buf, allDocs)
if err != nil {
@ -89,7 +91,6 @@ func buildMetricsDocs[T docOptions](items []T) []metricDocs {
ExtraFields: metricOpts.ExtraFields,
}
}
sortMetricsDocs(metricsDocs)
return metricsDocs
}

View File

@ -2,6 +2,7 @@ package operatormetrics
import (
"fmt"
"strings"
"github.com/prometheus/client_golang/prometheus"
)
@ -24,6 +25,16 @@ type CollectorResult struct {
Value float64
}
// hash returns a string identifying the collector by the names of its metrics,
// in order. It is only meaningful as an opaque key; the exact format is not
// part of any contract.
func (c Collector) hash() string {
	var sb strings.Builder

	for _, cm := range c.Metrics {
		sb.WriteString(cm.GetOpts().Name)
		// Terminate every name with a character that cannot occur in a
		// Prometheus metric name, so that different name lists such as
		// {"a_b", "c"} and {"a_", "bc"} never collapse to the same key.
		sb.WriteByte(',')
	}

	return sb.String()
}
func (c Collector) Describe(ch chan<- *prometheus.Desc) {
for _, cm := range c.Metrics {
cm.getCollector().Describe(ch)

View File

@ -5,6 +5,10 @@ import (
)
// RegistryFunc is the signature of a function that registers a
// prometheus.Collector, returning an error.
type RegistryFunc func(c prometheus.Collector) error
// UnregisterFunc is the signature of a function that unregisters a
// prometheus.Collector, returning a boolean result.
// NOTE(review): presumably true means the collector was unregistered, as with
// prometheus.Unregister — confirm.
type UnregisterFunc func(c prometheus.Collector) bool
// Register is the function used to register metrics and collectors by this package.
// It defaults to prometheus.Register.
var Register RegistryFunc = prometheus.Register
// Unregister is the function used to unregister metrics and collectors by this package.
// It defaults to prometheus.Unregister.
var Unregister UnregisterFunc = prometheus.Unregister

View File

@ -1,15 +1,24 @@
package operatormetrics
import (
"cmp"
"fmt"
"slices"
)
var operatorRegistry = newRegistry()
// operatorRegisterer holds the package's bookkeeping of registered metrics and
// collectors.
type operatorRegisterer struct {
// NOTE(review): the next two lines declare the same field; this looks like the
// pre- and post-change lines of a rendered diff — confirm against the real file.
registeredMetrics map[string]Metric
// registeredMetrics maps a metric name to the registered metric.
registeredMetrics map[string]Metric
// registeredCollectors maps a collector's hash() to the registered collector.
registeredCollectors map[string]Collector
// registeredCollectorMetrics maps a metric name to a metric that belongs to a
// registered collector.
registeredCollectorMetrics map[string]Metric
}
// newRegistry returns an operatorRegisterer whose three lookup maps are
// allocated and empty.
func newRegistry() operatorRegisterer {
	var reg operatorRegisterer
	reg.registeredMetrics = make(map[string]Metric)
	reg.registeredCollectors = make(map[string]Collector)
	reg.registeredCollectorMetrics = make(map[string]Metric)
	return reg
}
@ -18,11 +27,17 @@ func newRegistry() operatorRegisterer {
func RegisterMetrics(allMetrics ...[]Metric) error {
for _, metricList := range allMetrics {
for _, metric := range metricList {
err := Register(metric.getCollector())
if metricExists(metric) {
err := unregisterMetric(metric)
if err != nil {
return err
}
}
err := registerMetric(metric)
if err != nil {
return err
}
operatorRegistry.registeredMetrics[metric.GetOpts().Name] = metric
}
}
@ -32,13 +47,16 @@ func RegisterMetrics(allMetrics ...[]Metric) error {
// RegisterCollector registers the collector with the Prometheus registry.
func RegisterCollector(collectors ...Collector) error {
for _, collector := range collectors {
err := Register(collector)
if err != nil {
return err
if collectorExists(collector) {
err := unregisterCollector(collector)
if err != nil {
return err
}
}
for _, cm := range collector.Metrics {
operatorRegistry.registeredCollectorMetrics[cm.GetOpts().Name] = cm
err := registerCollector(collector)
if err != nil {
return err
}
}
@ -57,5 +75,92 @@ func ListMetrics() []Metric {
result = append(result, rc)
}
slices.SortFunc(result, func(a, b Metric) int {
return cmp.Compare(a.GetOpts().Name, b.GetOpts().Name)
})
return result
}
// CleanRegistry unregisters every tracked metric and then every tracked
// collector. It stops at, and returns, the first error encountered.
func CleanRegistry() error {
	for _, m := range operatorRegistry.registeredMetrics {
		if err := unregisterMetric(m); err != nil {
			return err
		}
	}

	for _, c := range operatorRegistry.registeredCollectors {
		if err := unregisterCollector(c); err != nil {
			return err
		}
	}

	return nil
}
// metricExists reports whether a metric with the same name is already tracked
// in the registry.
func metricExists(metric Metric) bool {
	name := metric.GetOpts().Name
	_, found := operatorRegistry.registeredMetrics[name]
	return found
}
// unregisterMetric unregisters the metric's collector through Unregister and,
// on success, drops the metric from the registry. It returns an error when
// Unregister reports failure.
func unregisterMetric(metric Metric) error {
	if !Unregister(metric.getCollector()) {
		return fmt.Errorf("failed to unregister from Prometheus client metric %s", metric.GetOpts().Name)
	}

	delete(operatorRegistry.registeredMetrics, metric.GetOpts().Name)
	return nil
}
// registerMetric registers the metric's collector through Register and, on
// success, records the metric in the registry under its name.
func registerMetric(metric Metric) error {
	if err := Register(metric.getCollector()); err != nil {
		return err
	}

	name := metric.GetOpts().Name
	operatorRegistry.registeredMetrics[name] = metric
	return nil
}
// collectorExists reports whether a collector with the same hash is already
// tracked in the registry.
func collectorExists(collector Collector) bool {
	key := collector.hash()
	_, found := operatorRegistry.registeredCollectors[key]
	return found
}
// unregisterCollector unregisters the collector through Unregister and, on
// success, drops both the collector and each of its metrics from the registry.
// It returns an error listing the collector's metrics when Unregister reports
// failure.
func unregisterCollector(collector Collector) error {
	if !Unregister(collector) {
		return fmt.Errorf("failed to unregister from Prometheus client collector with metrics: %s", buildCollectorMetricListString(collector))
	}

	delete(operatorRegistry.registeredCollectors, collector.hash())
	for _, m := range collector.Metrics {
		delete(operatorRegistry.registeredCollectorMetrics, m.GetOpts().Name)
	}
	return nil
}
// registerCollector registers the collector through Register and, on success,
// records the collector under its hash and each of its metrics under the
// metric's name.
func registerCollector(collector Collector) error {
	if err := Register(collector); err != nil {
		return err
	}

	key := collector.hash()
	operatorRegistry.registeredCollectors[key] = collector
	for _, m := range collector.Metrics {
		operatorRegistry.registeredCollectorMetrics[m.GetOpts().Name] = m
	}
	return nil
}
// buildCollectorMetricListString returns the names of the collector's metrics
// joined by ", ". A collector without metrics yields the empty string.
func buildCollectorMetricListString(collector Collector) string {
	metricsList := ""
	for i, metric := range collector.Metrics {
		// Put the separator between names instead of trimming a trailing one
		// afterwards: slicing off the last two bytes panics when the collector
		// has no metrics and the string is empty.
		if i > 0 {
			metricsList += ", "
		}
		metricsList += metric.GetOpts().Name
	}
	return metricsList
}

View File

@ -1,7 +1,9 @@
package operatorrules
import (
"cmp"
"fmt"
"slices"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@ -42,7 +44,7 @@ func buildPrometheusRuleSpec() (*promv1.PrometheusRuleSpec, error) {
if len(operatorRegistry.registeredAlerts) != 0 {
groups = append(groups, promv1.RuleGroup{
Name: "alerts.rules",
Rules: buildAlertsRules(),
Rules: ListAlerts(),
})
}
@ -63,11 +65,9 @@ func buildRecordingRulesRules() []promv1.Rule {
})
}
return rules
}
slices.SortFunc(rules, func(a, b promv1.Rule) int {
return cmp.Compare(a.Record, b.Record)
})
func buildAlertsRules() []promv1.Rule {
var rules []promv1.Rule
rules = append(rules, operatorRegistry.registeredAlerts...)
return rules
}

View File

@ -1,26 +1,32 @@
package operatorrules
import (
"cmp"
"slices"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)
var operatorRegistry = newRegistry()
// operatorRegisterer holds the recording rules and alerts registered with this
// package.
type operatorRegisterer struct {
// NOTE(review): the two slice-typed fields below share their names with the
// map-typed fields that follow; this looks like the pre- and post-change lines
// of a rendered diff — confirm against the real file.
registeredRecordingRules []RecordingRule
registeredAlerts []promv1.Rule
// registeredRecordingRules is keyed by the recording rule's MetricsOpts.Name.
registeredRecordingRules map[string]RecordingRule
// registeredAlerts is keyed by the rule's Alert name.
registeredAlerts map[string]promv1.Rule
}
// newRegistry returns an operatorRegisterer with empty recording-rule and alert
// collections.
func newRegistry() operatorRegisterer {
return operatorRegisterer{
// NOTE(review): registeredRecordingRules is initialized twice below (slice,
// then map); this looks like the pre- and post-change lines of a rendered
// diff — confirm against the real file.
registeredRecordingRules: []RecordingRule{},
registeredRecordingRules: map[string]RecordingRule{},
registeredAlerts: map[string]promv1.Rule{},
}
}
// RegisterRecordingRules registers the given recording rules.
func RegisterRecordingRules(recordingRules ...[]RecordingRule) error {
for _, recordingRuleList := range recordingRules {
operatorRegistry.registeredRecordingRules = append(operatorRegistry.registeredRecordingRules, recordingRuleList...)
for _, recordingRule := range recordingRuleList {
operatorRegistry.registeredRecordingRules[recordingRule.MetricsOpts.Name] = recordingRule
}
}
return nil
@ -29,7 +35,9 @@ func RegisterRecordingRules(recordingRules ...[]RecordingRule) error {
// RegisterAlerts registers the given alerts.
func RegisterAlerts(alerts ...[]promv1.Rule) error {
for _, alertList := range alerts {
operatorRegistry.registeredAlerts = append(operatorRegistry.registeredAlerts, alertList...)
for _, alert := range alertList {
operatorRegistry.registeredAlerts[alert.Alert] = alert
}
}
return nil
@ -37,10 +45,34 @@ func RegisterAlerts(alerts ...[]promv1.Rule) error {
// ListRecordingRules returns the registered recording rules.
//
// NOTE(review): the first return statement below makes the rest of the body
// unreachable; it looks like the pre-change line of a rendered diff, with the
// following lines being its replacement — confirm against the real file.
func ListRecordingRules() []RecordingRule {
return operatorRegistry.registeredRecordingRules
// Copy the registry's values into a slice.
var rules []RecordingRule
for _, rule := range operatorRegistry.registeredRecordingRules {
rules = append(rules, rule)
}
// Order the result by rule name.
slices.SortFunc(rules, func(a, b RecordingRule) int {
return cmp.Compare(a.GetOpts().Name, b.GetOpts().Name)
})
return rules
}
// ListAlerts returns the registered alerts.
//
// NOTE(review): the first return statement below makes the rest of the body
// unreachable; it looks like the pre-change line of a rendered diff, with the
// following lines being its replacement — confirm against the real file.
func ListAlerts() []promv1.Rule {
return operatorRegistry.registeredAlerts
// Copy the registry's values into a slice.
var alerts []promv1.Rule
for _, alert := range operatorRegistry.registeredAlerts {
alerts = append(alerts, alert)
}
// Order the result by alert name.
slices.SortFunc(alerts, func(a, b promv1.Rule) int {
return cmp.Compare(a.Alert, b.Alert)
})
return alerts
}
// CleanRegistry discards every registered recording rule and alert by swapping
// in a freshly created registry. It always returns nil.
func CleanRegistry() error {
	fresh := newRegistry()
	operatorRegistry = fresh

	return nil
}

4
vendor/modules.txt vendored
View File

@ -357,8 +357,8 @@ github.com/kubernetes-csi/lib-volume-populator/populator-machinery
# github.com/kubevirt/monitoring/pkg/metrics/parser v0.0.0-20230627123556-81a891d4462a
## explicit; go 1.20
github.com/kubevirt/monitoring/pkg/metrics/parser
# github.com/machadovilaca/operator-observability v0.0.9
## explicit; go 1.20
# github.com/machadovilaca/operator-observability v0.0.13
## explicit; go 1.21
github.com/machadovilaca/operator-observability/pkg/docs
github.com/machadovilaca/operator-observability/pkg/operatormetrics
github.com/machadovilaca/operator-observability/pkg/operatorrules