Update metric names to fit metrics naming conventions. This fix updates the metric names to meet the metrics naming conventions. The old metric names will not be available after this fix. (#2850)

Signed-off-by: Aviv Litman <alitman@redhat.com>
Co-authored-by: Aviv Litman <alitman@redhat.com>
This commit is contained in:
kubevirt-bot 2023-08-17 23:39:34 +02:00 committed by GitHub
parent 1b6f7ced4b
commit efbaa78054
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 47 additions and 49 deletions

View File

@ -5,22 +5,22 @@ All metrics documented here are auto-generated by the utility tool `tools/metric
## Containerized Data Importer Metrics List
### clone_progress
The clone progress in percentage. Type: Counter.
### kubevirt_cdi_clone_dv_unusual_restartcount_total
Total restart count in CDI Data Volume cloner pod. Type: Counter.
### kubevirt_cdi_clone_pods_high_restart
The number of CDI clone pods with high restart count. Type: Gauge.
### kubevirt_cdi_cr_ready
CDI CR Ready. Type: Gauge.
CDI install ready. Type: Gauge.
### kubevirt_cdi_dataimportcron_outdated
DataImportCron has an outdated import. Type: Gauge.
### kubevirt_cdi_dataimportcron_outdated_total
Total count of outdated DataImportCron imports. Type: Counter.
### kubevirt_cdi_import_dv_unusual_restartcount_total
Total restart count in CDI Data Volume importer pod. Type: Counter.
### kubevirt_cdi_incomplete_storageprofiles_total
### kubevirt_cdi_dataimportcron_outdated_aggregated
Total count of outdated DataImportCron imports. Type: Gauge.
### kubevirt_cdi_import_pods_high_restart
The number of CDI import pods with high restart count. Type: Gauge.
### kubevirt_cdi_incomplete_storageprofiles
Total number of incomplete and hence unusable StorageProfile. Type: Gauge.
### kubevirt_cdi_operator_up_total
### kubevirt_cdi_operator_up
CDI operator status. Type: Gauge.
### kubevirt_cdi_upload_dv_unusual_restartcount_total
Total restart count in CDI Data Volume upload server pod. Type: Counter.
### kubevirt_cdi_upload_pods_high_restart
The number of CDI upload server pods with high restart count. Type: Gauge.
## Developing new metrics
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.

View File

@ -44,13 +44,13 @@ var MetricOptsList = map[MetricsKey]MetricOpts{
Type: "Gauge",
},
IncompleteProfile: {
Name: "kubevirt_cdi_incomplete_storageprofiles_total",
Name: "kubevirt_cdi_incomplete_storageprofiles",
Help: "Total number of incomplete and hence unusable StorageProfile",
Type: "Gauge",
},
ReadyGauge: {
Name: "kubevirt_cdi_cr_ready",
Help: "CDI CR Ready",
Help: "CDI install ready",
Type: "Gauge",
},
}
@ -60,7 +60,7 @@ func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
return []RecordRulesDesc{
{
MetricOpts{
"kubevirt_cdi_operator_up_total",
"kubevirt_cdi_operator_up",
"CDI operator status",
"Gauge",
},
@ -68,33 +68,33 @@ func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
},
{
MetricOpts{
"kubevirt_cdi_import_dv_unusual_restartcount_total",
"Total restart count in CDI Data Volume importer pod",
"Counter",
"kubevirt_cdi_import_pods_high_restart",
"The number of CDI import pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_upload_dv_unusual_restartcount_total",
"Total restart count in CDI Data Volume upload server pod",
"Counter",
"kubevirt_cdi_upload_pods_high_restart",
"The number of CDI upload server pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_clone_dv_unusual_restartcount_total",
"Total restart count in CDI Data Volume cloner pod",
"Counter",
"kubevirt_cdi_clone_pods_high_restart",
"The number of CDI clone pods with high restart count",
"Gauge",
},
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
},
{
MetricOpts{
"kubevirt_cdi_dataimportcron_outdated_total",
"kubevirt_cdi_dataimportcron_outdated_aggregated",
"Total count of outdated DataImportCron imports",
"Counter",
"Gauge",
},
"sum(kubevirt_cdi_dataimportcron_outdated or vector(0))",
},

View File

@ -297,7 +297,7 @@ var _ = Describe("Controller", func() {
rule = obj.(*promv1.PrometheusRule)
cdiDownAlert := promv1.Rule{
Alert: "CDIOperatorDown",
Expr: intstr.FromString("kubevirt_cdi_operator_up_total == 0"),
Expr: intstr.FromString("kubevirt_cdi_operator_up == 0"),
For: "5m",
Annotations: map[string]string{
"summary": "CDI operator is down",

View File

@ -147,7 +147,7 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
return []promv1.Rule{
generateAlertRule(
"CDIOperatorDown",
"kubevirt_cdi_operator_up_total == 0",
"kubevirt_cdi_operator_up == 0",
"5m",
map[string]string{
"summary": "CDI operator is down",
@ -177,10 +177,10 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
),
generateAlertRule(
"CDIDataVolumeUnusualRestartCount",
"kubevirt_cdi_import_dv_unusual_restartcount_total > 0 or kubevirt_cdi_upload_dv_unusual_restartcount_total > 0 or kubevirt_cdi_clone_dv_unusual_restartcount_total > 0",
"kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0",
"5m",
map[string]string{
"summary": "Cluster has DataVolumes (PVC population request) with an unusual restart count, meaning they are probably failing and need to be investigated",
"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataVolumeUnusualRestartCount"),
},
map[string]string{
@ -192,7 +192,7 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
),
generateAlertRule(
"CDIStorageProfilesIncomplete",
"kubevirt_cdi_incomplete_storageprofiles_total > 0",
"kubevirt_cdi_incomplete_storageprofiles > 0",
"5m",
map[string]string{
"summary": "Incomplete StorageProfiles exist, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
@ -207,7 +207,7 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
),
generateAlertRule(
"CDIDataImportCronOutdated",
"kubevirt_cdi_dataimportcron_outdated_total > 0",
"kubevirt_cdi_dataimportcron_outdated_aggregated > 0",
"15m",
map[string]string{
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",

View File

@ -47,7 +47,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
waitForIncompleteMetricInitialization := func() {
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
}, 2*time.Minute, 1*time.Second).ShouldNot(Equal(-1))
}
@ -60,7 +60,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
cdiPods = getCDIPods(f)
waitForIncompleteMetricInitialization()
originalMetricVal = getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
originalMetricVal = getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
})
AfterEach(func() {
@ -82,7 +82,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
}
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
}, 5*time.Minute, 5*time.Second).Should(BeNumerically("==", originalMetricVal))
})
@ -150,7 +150,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
expectedIncomplete := originalMetricVal + numAddedStorageClasses
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", expectedIncomplete))
By("Check that the CDIStorageProfilesIncomplete alert is triggered")
@ -167,7 +167,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
Expect(err).ToNot(HaveOccurred())
expectedIncomplete--
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", expectedIncomplete))
}
})
@ -185,13 +185,13 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
By("Metric stays the same because we don't support this provisioner")
Consistently(func() int {
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
}, metricPollingTimeout, metricPollingInterval).Should(Equal(originalMetricVal))
})
It("[test_id:7964] DataImportCron failing metric expected value when patching DesiredDigest annotation with junk sha256 value", func() {
numCrons := 2
originalCronMetricVal := getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_total")
originalCronMetricVal := getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_aggregated")
expectedFailingCrons := originalCronMetricVal + numCrons
reg, err := getDataVolumeSourceRegistry(f)
@ -227,7 +227,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
}, dataImportCronTimeout, pollingInterval).Should(BeNil())
By(fmt.Sprintf("Ensuring metric value incremented to %d", originalCronMetricVal+i))
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_total")
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_aggregated")
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", originalCronMetricVal+i))
}
By("Ensure metric value decrements when crons are cleaned up")
@ -235,7 +235,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
err = f.CdiClient.CdiV1beta1().DataImportCrons(f.Namespace.Name).Delete(context.TODO(), fmt.Sprintf("cron-test-%d", i), metav1.DeleteOptions{})
Expect(err).ToNot(HaveOccurred())
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_total")
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_aggregated")
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", expectedFailingCrons-i))
}
})
@ -250,9 +250,9 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
return dep.Status.Replicas == 0
}, 20*time.Second, 1*time.Second).Should(BeTrue())
By("Waiting for kubevirt_cdi_operator_up_total metric to be 0")
By("Waiting for kubevirt_cdi_operator_up metric to be 0")
Eventually(func() int {
return getMetricValue(f, "kubevirt_cdi_operator_up_total")
return getMetricValue(f, "kubevirt_cdi_operator_up")
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", 0))
By("Waiting for CDIOperatorDown alert to be triggered")
@ -303,7 +303,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
func dataVolumeUnusualRestartTest(f *framework.Framework) {
By("Test metric for unusual restart count")
Eventually(func() bool {
return getMetricValue(f, "kubevirt_cdi_import_dv_unusual_restartcount_total") == 1
return getMetricValue(f, "kubevirt_cdi_import_pods_high_restart") == 1
}, 2*time.Minute, 1*time.Second).Should(BeTrue())
By("checking that the CDIDataVolumeUnusualRestartCount alert is triggered")

View File

@ -26,13 +26,11 @@ import (
dto "github.com/prometheus/client_model/go"
)
// excludedMetrics defines the metrics to ignore,
// open issue:https://github.com/kubevirt/containerized-data-importer/issues/2773
// Do not add metrics to this list!
// This list should be used only in very rare cases where the naming conventions explained in the best practices
// (https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines)
// must be ignored. Open issue: https://github.com/kubevirt/containerized-data-importer/issues/2515.
var excludedMetrics = map[string]struct{}{
"clone_progress": {},
"kubevirt_cdi_operator_up_total": {},
"kubevirt_cdi_incomplete_storageprofiles_total": {},
"clone_progress": {},
}
func recordRulesDescToMetricList(mdl []monitoring.RecordRulesDesc) []monitoring.MetricOpts {