containerized-data-importer/pkg/monitoring/rules/alerts/operator.go
Aviv Litman 42ec627e35
Refactor recording-rules and alerts code (#3068)
* Refactor recording-rules and alerts code

Signed-off-by: avlitman <alitman@redhat.com>

* Remove promv1 from schema

Signed-off-by: avlitman <alitman@redhat.com>

---------

Signed-off-by: avlitman <alitman@redhat.com>
2024-02-18 16:05:42 +01:00

110 lines
4.0 KiB
Go

package alerts
import (
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/ptr"
)
// operatorAlerts holds the alerting rules declared in this file. Every entry
// pairs an alert name with its PromQL expression, the duration the expression
// must keep matching before the alert fires, the labels carrying the severity
// and operator health impact, and a human-readable summary annotation.
var operatorAlerts = []promv1.Rule{
	// kubevirt_cdi_operator_up has been 0 for five minutes.
	{
		Alert: "CDIOperatorDown",
		Expr:  intstr.FromString("kubevirt_cdi_operator_up == 0"),
		For:   ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "warning",
			operatorHealthImpactLabelKey: "critical",
		},
		Annotations: map[string]string{
			"summary": "CDI operator is down",
		},
	},
	// kubevirt_cdi_cr_ready has been 0 for five minutes.
	{
		Alert: "CDINotReady",
		Expr:  intstr.FromString("kubevirt_cdi_cr_ready == 0"),
		For:   ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "warning",
			operatorHealthImpactLabelKey: "critical",
		},
		Annotations: map[string]string{
			"summary": "CDI is not available to use",
		},
	},
	// Any of the import/upload/clone high-restart metrics is above 0.
	{
		Alert: "CDIDataVolumeUnusualRestartCount",
		Expr:  intstr.FromString("kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0"),
		For:   ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "warning",
			operatorHealthImpactLabelKey: "warning",
		},
		Annotations: map[string]string{
			"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
		},
	},
	// Evaluated per storageclass/provisioner pair over the storage profile info
	// series labelled complete="false"; the summary interpolates the
	// storageclass label of the matching series.
	{
		Alert: "CDIStorageProfilesIncomplete",
		Expr:  intstr.FromString(`sum by(storageclass,provisioner) ((kubevirt_cdi_storageprofile_info{complete="false"}>0))`),
		For:   ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "info",
			operatorHealthImpactLabelKey: "warning",
		},
		Annotations: map[string]string{
			"summary": "Incomplete StorageProfile {{ $labels.storageclass }}, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
		},
	},
	// Evaluated per ns/cron_name pair; this rule uses a 15 minute hold time
	// rather than the 5 minutes used by the other rules in this list.
	{
		Alert: "CDIDataImportCronOutdated",
		Expr:  intstr.FromString("sum by(ns,cron_name) (kubevirt_cdi_dataimportcron_outdated) > 0"),
		For:   ptr.To(promv1.Duration("15m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "info",
			operatorHealthImpactLabelKey: "warning",
		},
		Annotations: map[string]string{
			"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
		},
	},
	// Three terms are added together and the total is compared with 0: storage
	// profiles labelled default="true", storage profiles labelled
	// virtdefault="true", and the count of kubevirt_cdi_datavolume_pending
	// series whose value is 0. Each term falls back to vector(0) so the sum is
	// still defined when a term matches no series.
	{
		Alert: "CDINoDefaultStorageClass",
		Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) +
(count(kubevirt_cdi_datavolume_pending == 0) or on() vector(0)) == 0`),
		For: ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "warning",
			operatorHealthImpactLabelKey: "none",
		},
		Annotations: map[string]string{
			"summary": "No default StorageClass or virtualization StorageClass, and a DataVolume is pending for one",
		},
	},
	// The summed storage profile info labelled virtdefault="true" exceeds 1.
	{
		Alert: "CDIMultipleDefaultVirtStorageClasses",
		Expr:  intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{virtdefault="true"} or on() vector(0)) > 1`),
		For:   ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "warning",
			operatorHealthImpactLabelKey: "none",
		},
		Annotations: map[string]string{
			"summary": "More than one default virtualization StorageClass detected",
		},
	},
	// Neither the default="true" nor the virtdefault="true" storage profiles
	// contribute a series that also carries rwx="true" and smartclone="true".
	{
		Alert: "CDIDefaultStorageClassDegraded",
		Expr: intstr.FromString(`sum(kubevirt_cdi_storageprofile_info{default="true",rwx="true",smartclone="true"} or on() vector(0)) +
sum(kubevirt_cdi_storageprofile_info{virtdefault="true",rwx="true",smartclone="true"} or on() vector(0)) == 0`),
		For: ptr.To(promv1.Duration("5m")),
		Labels: map[string]string{
			severityAlertLabelKey:        "warning",
			operatorHealthImpactLabelKey: "none",
		},
		Annotations: map[string]string{
			"summary": "Default storage class has no smart clone or ReadWriteMany",
		},
	},
}