mirror of
https://github.com/kubevirt/containerized-data-importer.git
synced 2025-06-03 06:30:22 +00:00
Update metric names to fit metrics naming conventions. This fix updates the metric names to meet the metrics naming conventions. The old metric names will no longer be available after this fix. (#2850)
Signed-off-by: Aviv Litman <alitman@redhat.com> Co-authored-by: Aviv Litman <alitman@redhat.com>
This commit is contained in:
parent
1b6f7ced4b
commit
efbaa78054
@ -5,22 +5,22 @@ All metrics documented here are auto-generated by the utility tool `tools/metric
|
||||
## Containerized Data Importer Metrics List
|
||||
### clone_progress
|
||||
The clone progress in percentage. Type: Counter.
|
||||
### kubevirt_cdi_clone_dv_unusual_restartcount_total
|
||||
Total restart count in CDI Data Volume cloner pod. Type: Counter.
|
||||
### kubevirt_cdi_clone_pods_high_restart
|
||||
The number of CDI clone pods with high restart count. Type: Gauge.
|
||||
### kubevirt_cdi_cr_ready
|
||||
CDI CR Ready. Type: Gauge.
|
||||
CDI install ready. Type: Gauge.
|
||||
### kubevirt_cdi_dataimportcron_outdated
|
||||
DataImportCron has an outdated import. Type: Gauge.
|
||||
### kubevirt_cdi_dataimportcron_outdated_total
|
||||
Total count of outdated DataImportCron imports. Type: Counter.
|
||||
### kubevirt_cdi_import_dv_unusual_restartcount_total
|
||||
Total restart count in CDI Data Volume importer pod. Type: Counter.
|
||||
### kubevirt_cdi_incomplete_storageprofiles_total
|
||||
### kubevirt_cdi_dataimportcron_outdated_aggregated
|
||||
Total count of outdated DataImportCron imports. Type: Gauge.
|
||||
### kubevirt_cdi_import_pods_high_restart
|
||||
The number of CDI import pods with high restart count. Type: Gauge.
|
||||
### kubevirt_cdi_incomplete_storageprofiles
|
||||
Total number of incomplete and hence unusable StorageProfile. Type: Gauge.
|
||||
### kubevirt_cdi_operator_up_total
|
||||
### kubevirt_cdi_operator_up
|
||||
CDI operator status. Type: Gauge.
|
||||
### kubevirt_cdi_upload_dv_unusual_restartcount_total
|
||||
Total restart count in CDI Data Volume upload server pod. Type: Counter.
|
||||
### kubevirt_cdi_upload_pods_high_restart
|
||||
The number of CDI upload server pods with high restart count. Type: Gauge.
|
||||
## Developing new metrics
|
||||
After developing new metrics or changing old ones, please run `make generate-doc` to regenerate this document.
|
||||
|
||||
|
@ -44,13 +44,13 @@ var MetricOptsList = map[MetricsKey]MetricOpts{
|
||||
Type: "Gauge",
|
||||
},
|
||||
IncompleteProfile: {
|
||||
Name: "kubevirt_cdi_incomplete_storageprofiles_total",
|
||||
Name: "kubevirt_cdi_incomplete_storageprofiles",
|
||||
Help: "Total number of incomplete and hence unusable StorageProfile",
|
||||
Type: "Gauge",
|
||||
},
|
||||
ReadyGauge: {
|
||||
Name: "kubevirt_cdi_cr_ready",
|
||||
Help: "CDI CR Ready",
|
||||
Help: "CDI install ready",
|
||||
Type: "Gauge",
|
||||
},
|
||||
}
|
||||
@ -60,7 +60,7 @@ func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
|
||||
return []RecordRulesDesc{
|
||||
{
|
||||
MetricOpts{
|
||||
"kubevirt_cdi_operator_up_total",
|
||||
"kubevirt_cdi_operator_up",
|
||||
"CDI operator status",
|
||||
"Gauge",
|
||||
},
|
||||
@ -68,33 +68,33 @@ func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
|
||||
},
|
||||
{
|
||||
MetricOpts{
|
||||
"kubevirt_cdi_import_dv_unusual_restartcount_total",
|
||||
"Total restart count in CDI Data Volume importer pod",
|
||||
"Counter",
|
||||
"kubevirt_cdi_import_pods_high_restart",
|
||||
"The number of CDI import pods with high restart count",
|
||||
"Gauge",
|
||||
},
|
||||
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
|
||||
},
|
||||
{
|
||||
MetricOpts{
|
||||
"kubevirt_cdi_upload_dv_unusual_restartcount_total",
|
||||
"Total restart count in CDI Data Volume upload server pod",
|
||||
"Counter",
|
||||
"kubevirt_cdi_upload_pods_high_restart",
|
||||
"The number of CDI upload server pods with high restart count",
|
||||
"Gauge",
|
||||
},
|
||||
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
|
||||
},
|
||||
{
|
||||
MetricOpts{
|
||||
"kubevirt_cdi_clone_dv_unusual_restartcount_total",
|
||||
"Total restart count in CDI Data Volume cloner pod",
|
||||
"Counter",
|
||||
"kubevirt_cdi_clone_pods_high_restart",
|
||||
"The number of CDI clone pods with high restart count",
|
||||
"Gauge",
|
||||
},
|
||||
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
|
||||
},
|
||||
{
|
||||
MetricOpts{
|
||||
"kubevirt_cdi_dataimportcron_outdated_total",
|
||||
"kubevirt_cdi_dataimportcron_outdated_aggregated",
|
||||
"Total count of outdated DataImportCron imports",
|
||||
"Counter",
|
||||
"Gauge",
|
||||
},
|
||||
"sum(kubevirt_cdi_dataimportcron_outdated or vector(0))",
|
||||
},
|
||||
|
@ -297,7 +297,7 @@ var _ = Describe("Controller", func() {
|
||||
rule = obj.(*promv1.PrometheusRule)
|
||||
cdiDownAlert := promv1.Rule{
|
||||
Alert: "CDIOperatorDown",
|
||||
Expr: intstr.FromString("kubevirt_cdi_operator_up_total == 0"),
|
||||
Expr: intstr.FromString("kubevirt_cdi_operator_up == 0"),
|
||||
For: "5m",
|
||||
Annotations: map[string]string{
|
||||
"summary": "CDI operator is down",
|
||||
|
@ -147,7 +147,7 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
|
||||
return []promv1.Rule{
|
||||
generateAlertRule(
|
||||
"CDIOperatorDown",
|
||||
"kubevirt_cdi_operator_up_total == 0",
|
||||
"kubevirt_cdi_operator_up == 0",
|
||||
"5m",
|
||||
map[string]string{
|
||||
"summary": "CDI operator is down",
|
||||
@ -177,10 +177,10 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
|
||||
),
|
||||
generateAlertRule(
|
||||
"CDIDataVolumeUnusualRestartCount",
|
||||
"kubevirt_cdi_import_dv_unusual_restartcount_total > 0 or kubevirt_cdi_upload_dv_unusual_restartcount_total > 0 or kubevirt_cdi_clone_dv_unusual_restartcount_total > 0",
|
||||
"kubevirt_cdi_import_pods_high_restart > 0 or kubevirt_cdi_upload_pods_high_restart > 0 or kubevirt_cdi_clone_pods_high_restart > 0",
|
||||
"5m",
|
||||
map[string]string{
|
||||
"summary": "Cluster has DataVolumes (PVC population request) with an unusual restart count, meaning they are probably failing and need to be investigated",
|
||||
"summary": "Some CDI population workloads have an unusual restart count, meaning they are probably failing and need to be investigated",
|
||||
"runbook_url": fmt.Sprintf(runbookURLTemplate, "CDIDataVolumeUnusualRestartCount"),
|
||||
},
|
||||
map[string]string{
|
||||
@ -192,7 +192,7 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
|
||||
),
|
||||
generateAlertRule(
|
||||
"CDIStorageProfilesIncomplete",
|
||||
"kubevirt_cdi_incomplete_storageprofiles_total > 0",
|
||||
"kubevirt_cdi_incomplete_storageprofiles > 0",
|
||||
"5m",
|
||||
map[string]string{
|
||||
"summary": "Incomplete StorageProfiles exist, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
|
||||
@ -207,7 +207,7 @@ func getAlertRules(runbookURLTemplate string) []promv1.Rule {
|
||||
),
|
||||
generateAlertRule(
|
||||
"CDIDataImportCronOutdated",
|
||||
"kubevirt_cdi_dataimportcron_outdated_total > 0",
|
||||
"kubevirt_cdi_dataimportcron_outdated_aggregated > 0",
|
||||
"15m",
|
||||
map[string]string{
|
||||
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
|
||||
|
@ -47,7 +47,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
|
||||
waitForIncompleteMetricInitialization := func() {
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
|
||||
}, 2*time.Minute, 1*time.Second).ShouldNot(Equal(-1))
|
||||
}
|
||||
|
||||
@ -60,7 +60,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
cdiPods = getCDIPods(f)
|
||||
|
||||
waitForIncompleteMetricInitialization()
|
||||
originalMetricVal = getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
|
||||
originalMetricVal = getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
|
||||
})
|
||||
|
||||
AfterEach(func() {
|
||||
@ -82,7 +82,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
}
|
||||
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
|
||||
}, 5*time.Minute, 5*time.Second).Should(BeNumerically("==", originalMetricVal))
|
||||
})
|
||||
|
||||
@ -150,7 +150,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
|
||||
expectedIncomplete := originalMetricVal + numAddedStorageClasses
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
|
||||
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", expectedIncomplete))
|
||||
|
||||
By("Check that the CDIStorageProfilesIncomplete alert is triggered")
|
||||
@ -167,7 +167,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
expectedIncomplete--
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
|
||||
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", expectedIncomplete))
|
||||
}
|
||||
})
|
||||
@ -185,13 +185,13 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
|
||||
By("Metric stays the same because we don't support this provisioner")
|
||||
Consistently(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_incomplete_storageprofiles")
|
||||
}, metricPollingTimeout, metricPollingInterval).Should(Equal(originalMetricVal))
|
||||
})
|
||||
|
||||
It("[test_id:7964] DataImportCron failing metric expected value when patching DesiredDigest annotation with junk sha256 value", func() {
|
||||
numCrons := 2
|
||||
originalCronMetricVal := getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_total")
|
||||
originalCronMetricVal := getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_aggregated")
|
||||
expectedFailingCrons := originalCronMetricVal + numCrons
|
||||
|
||||
reg, err := getDataVolumeSourceRegistry(f)
|
||||
@ -227,7 +227,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
}, dataImportCronTimeout, pollingInterval).Should(BeNil())
|
||||
By(fmt.Sprintf("Ensuring metric value incremented to %d", originalCronMetricVal+i))
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_aggregated")
|
||||
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", originalCronMetricVal+i))
|
||||
}
|
||||
By("Ensure metric value decrements when crons are cleaned up")
|
||||
@ -235,7 +235,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
err = f.CdiClient.CdiV1beta1().DataImportCrons(f.Namespace.Name).Delete(context.TODO(), fmt.Sprintf("cron-test-%d", i), metav1.DeleteOptions{})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_dataimportcron_outdated_aggregated")
|
||||
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", expectedFailingCrons-i))
|
||||
}
|
||||
})
|
||||
@ -250,9 +250,9 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
return dep.Status.Replicas == 0
|
||||
}, 20*time.Second, 1*time.Second).Should(BeTrue())
|
||||
|
||||
By("Waiting for kubevirt_cdi_operator_up_total metric to be 0")
|
||||
By("Waiting for kubevirt_cdi_operator_up metric to be 0")
|
||||
Eventually(func() int {
|
||||
return getMetricValue(f, "kubevirt_cdi_operator_up_total")
|
||||
return getMetricValue(f, "kubevirt_cdi_operator_up")
|
||||
}, metricPollingTimeout, metricPollingInterval).Should(BeNumerically("==", 0))
|
||||
|
||||
By("Waiting for CDIOperatorDown alert to be triggered")
|
||||
@ -303,7 +303,7 @@ var _ = Describe("[Destructive] Monitoring Tests", func() {
|
||||
func dataVolumeUnusualRestartTest(f *framework.Framework) {
|
||||
By("Test metric for unusual restart count")
|
||||
Eventually(func() bool {
|
||||
return getMetricValue(f, "kubevirt_cdi_import_dv_unusual_restartcount_total") == 1
|
||||
return getMetricValue(f, "kubevirt_cdi_import_pods_high_restart") == 1
|
||||
}, 2*time.Minute, 1*time.Second).Should(BeTrue())
|
||||
|
||||
By("checking that the CDIDataVolumeUnusualRestartCount alert is triggered")
|
||||
|
@ -26,13 +26,11 @@ import (
|
||||
dto "github.com/prometheus/client_model/go"
|
||||
)
|
||||
|
||||
// excludedMetrics defines the metrics to ignore,
|
||||
// open issue:https://github.com/kubevirt/containerized-data-importer/issues/2773
|
||||
// Do not add metrics to this list!
|
||||
// This should be used only for very rare cases where the naming conventions that are explained in the best practices:
|
||||
// https://sdk.operatorframework.io/docs/best-practices/observability-best-practices/#metrics-guidelines
|
||||
// should be ignored, open issue: https://github.com/kubevirt/containerized-data-importer/issues/2515.
|
||||
var excludedMetrics = map[string]struct{}{
|
||||
"clone_progress": {},
|
||||
"kubevirt_cdi_operator_up_total": {},
|
||||
"kubevirt_cdi_incomplete_storageprofiles_total": {},
|
||||
"clone_progress": {},
|
||||
}
|
||||
|
||||
func recordRulesDescToMetricList(mdl []monitoring.RecordRulesDesc) []monitoring.MetricOpts {
|
||||
|
Loading…
Reference in New Issue
Block a user