mirror of
https://github.com/kubevirt/containerized-data-importer.git
synced 2025-06-03 06:30:22 +00:00

As of now, the OpenShift UI does not display the runbook URL field for our alerts (the docs and Google will, though), so let's make sure we provide a better entry point by being verbose in what is actually visible to the user. Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>
429 lines
12 KiB
Go
429 lines
12 KiB
Go
/*
|
|
Copyright 2018 The CDI Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package controller
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"reflect"
|
|
"strconv"
|
|
|
|
promv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
|
|
"github.com/go-logr/logr"
|
|
rbacv1 "k8s.io/api/rbac/v1"
|
|
"k8s.io/apimachinery/pkg/api/errors"
|
|
"k8s.io/apimachinery/pkg/api/meta"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/runtime"
|
|
"k8s.io/apimachinery/pkg/util/intstr"
|
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
|
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
|
|
"sigs.k8s.io/controller-runtime/pkg/source"
|
|
|
|
"kubevirt.io/containerized-data-importer/pkg/common"
|
|
"kubevirt.io/containerized-data-importer/pkg/controller"
|
|
"kubevirt.io/containerized-data-importer/pkg/util"
|
|
sdk "kubevirt.io/controller-lifecycle-operator-sdk/pkg/sdk"
|
|
)
|
|
|
|
const (
|
|
ruleName = "prometheus-cdi-rules"
|
|
rbacName = "cdi-monitoring"
|
|
monitorName = "service-monitor-cdi"
|
|
defaultMonitoringNs = "monitoring"
|
|
runbookURLBasePath = "https://kubevirt.io/monitoring/runbooks/"
|
|
severityAlertLabelKey = "severity"
|
|
partOfAlertLabelKey = "kubernetes_operator_part_of"
|
|
partOfAlertLabelValue = "kubevirt"
|
|
componentAlertLabelKey = "kubernetes_operator_component"
|
|
componentAlertLabelValue = common.CDILabelValue
|
|
)
|
|
|
|
func ensurePrometheusResourcesExist(c client.Client, scheme *runtime.Scheme, owner metav1.Object) error {
|
|
namespace := owner.GetNamespace()
|
|
|
|
cr, err := controller.GetActiveCDI(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if cr == nil {
|
|
return fmt.Errorf("no active CDI")
|
|
}
|
|
installerLabels := util.GetRecommendedInstallerLabelsFromCr(cr)
|
|
|
|
prometheusResources := []client.Object{
|
|
newPrometheusRule(namespace),
|
|
newPrometheusServiceMonitor(namespace),
|
|
newPrometheusRole(namespace),
|
|
newPrometheusRoleBinding(namespace),
|
|
}
|
|
|
|
for _, desired := range prometheusResources {
|
|
if err := sdk.SetLastAppliedConfiguration(desired, LastAppliedConfigAnnotation); err != nil {
|
|
return err
|
|
}
|
|
util.SetRecommendedLabels(desired, installerLabels, "cdi-operator")
|
|
if err := controllerutil.SetControllerReference(owner, desired, scheme); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := c.Create(context.TODO(), desired); err != nil {
|
|
if errors.IsAlreadyExists(err) {
|
|
current := sdk.NewDefaultInstance(desired)
|
|
nn := client.ObjectKeyFromObject(desired)
|
|
if err := c.Get(context.TODO(), nn, current); err != nil {
|
|
return err
|
|
}
|
|
current, err = sdk.StripStatusFromObject(current)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
currentObjCopy := current.DeepCopyObject()
|
|
sdk.MergeLabelsAndAnnotations(desired, current)
|
|
merged, err := sdk.MergeObject(desired, current, LastAppliedConfigAnnotation)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !reflect.DeepEqual(currentObjCopy, merged) {
|
|
if err := c.Update(context.TODO(), merged); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string) (bool, error) {
|
|
rule := &promv1.PrometheusRule{}
|
|
key := client.ObjectKey{Namespace: namespace, Name: ruleName}
|
|
if err := c.Get(context.TODO(), key, rule); err != nil {
|
|
if meta.IsNoMatchError(err) {
|
|
logger.V(3).Info("No match error for PrometheusRule, must not have prometheus deployed")
|
|
return false, nil
|
|
} else if !errors.IsNotFound(err) {
|
|
return false, err
|
|
}
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
// RecordRulesDesc describes a single CDI Prometheus recording rule.
type RecordRulesDesc struct {
	// Name is the name the rule is recorded under.
	Name string
	// Expr is the expression evaluated for the rule.
	Expr string
	// Description is a human-readable explanation of the rule.
	Description string
}
|
|
|
|
// GetRecordRulesDesc returns CDI Prometheus Record Rules
|
|
func GetRecordRulesDesc(namespace string) []RecordRulesDesc {
|
|
return []RecordRulesDesc{
|
|
{
|
|
"kubevirt_cdi_operator_up_total",
|
|
fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace),
|
|
"CDI operator status",
|
|
},
|
|
{
|
|
"kubevirt_cdi_import_dv_unusual_restartcount_total",
|
|
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.ImporterPodName, common.ImporterPodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
|
|
"Total restart count in CDI Data Volume importer pod",
|
|
},
|
|
{
|
|
"kubevirt_cdi_upload_dv_unusual_restartcount_total",
|
|
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'%s-.*', container='%s'} > %s)", common.UploadPodName, common.UploadServerPodname, strconv.Itoa(common.UnusualRestartCountThreshold)),
|
|
"Total restart count in CDI Data Volume upload server pod",
|
|
},
|
|
{
|
|
"kubevirt_cdi_clone_dv_unusual_restartcount_total",
|
|
fmt.Sprintf("count(kube_pod_container_status_restarts_total{pod=~'.*%s', container='%s'} > %s)", common.ClonerSourcePodNameSuffix, common.ClonerSourcePodName, strconv.Itoa(common.UnusualRestartCountThreshold)),
|
|
"Total restart count in CDI Data Volume cloner pod",
|
|
},
|
|
{
|
|
"kubevirt_cdi_dataimportcron_outdated_total",
|
|
"sum(kubevirt_cdi_dataimportcron_outdated or vector(0))",
|
|
"Total count of outdated DataImportCron imports",
|
|
},
|
|
}
|
|
}
|
|
|
|
func getRecordRules(namespace string) []promv1.Rule {
|
|
var recordRules []promv1.Rule
|
|
|
|
for _, rrd := range GetRecordRulesDesc(namespace) {
|
|
recordRules = append(recordRules, generateRecordRule(rrd.Name, rrd.Expr))
|
|
}
|
|
|
|
return recordRules
|
|
}
|
|
|
|
func getAlertRules() []promv1.Rule {
|
|
return []promv1.Rule{
|
|
generateAlertRule(
|
|
"CDIOperatorDown",
|
|
"kubevirt_cdi_operator_up_total == 0",
|
|
"5m",
|
|
map[string]string{
|
|
"summary": "CDI operator is down",
|
|
"runbook_url": runbookURLBasePath + "CDIOperatorDown",
|
|
},
|
|
map[string]string{
|
|
severityAlertLabelKey: "warning",
|
|
partOfAlertLabelKey: partOfAlertLabelValue,
|
|
componentAlertLabelKey: componentAlertLabelValue,
|
|
},
|
|
),
|
|
generateAlertRule(
|
|
"CDINotReady",
|
|
"kubevirt_cdi_cr_ready == 0",
|
|
"5m",
|
|
map[string]string{
|
|
"summary": "CDI is not available to use",
|
|
"runbook_url": runbookURLBasePath + "CDINotReady",
|
|
},
|
|
map[string]string{
|
|
severityAlertLabelKey: "warning",
|
|
partOfAlertLabelKey: partOfAlertLabelValue,
|
|
componentAlertLabelKey: componentAlertLabelValue,
|
|
},
|
|
),
|
|
generateAlertRule(
|
|
"CDIDataVolumeUnusualRestartCount",
|
|
"kubevirt_cdi_import_dv_unusual_restartcount_total > 0 or kubevirt_cdi_upload_dv_unusual_restartcount_total > 0 or kubevirt_cdi_clone_dv_unusual_restartcount_total > 0",
|
|
"5m",
|
|
map[string]string{
|
|
"summary": "Cluster has DataVolumes (PVC population request) with an unusual restart count, meaning they are probably failing and need to be investigated",
|
|
"runbook_url": runbookURLBasePath + "CDIDataVolumeUnusualRestartCount",
|
|
},
|
|
map[string]string{
|
|
severityAlertLabelKey: "warning",
|
|
partOfAlertLabelKey: partOfAlertLabelValue,
|
|
componentAlertLabelKey: componentAlertLabelValue,
|
|
},
|
|
),
|
|
generateAlertRule(
|
|
"CDIStorageProfilesIncomplete",
|
|
"kubevirt_cdi_incomplete_storageprofiles_total > 0",
|
|
"5m",
|
|
map[string]string{
|
|
"summary": "Incomplete StorageProfiles exist, accessMode/volumeMode cannot be inferred by CDI for PVC population request",
|
|
"runbook_url": runbookURLBasePath + "CDIStorageProfilesIncomplete",
|
|
},
|
|
map[string]string{
|
|
severityAlertLabelKey: "info",
|
|
partOfAlertLabelKey: partOfAlertLabelValue,
|
|
componentAlertLabelKey: componentAlertLabelValue,
|
|
},
|
|
),
|
|
generateAlertRule(
|
|
"CDIDataImportCronOutdated",
|
|
"kubevirt_cdi_dataimportcron_outdated_total > 0",
|
|
"15m",
|
|
map[string]string{
|
|
"summary": "DataImportCron (recurring polling of VM templates disk image sources, also known as golden images) PVCs are not being updated on the defined schedule",
|
|
"runbook_url": runbookURLBasePath + "CDIDataImportCronOutdated",
|
|
},
|
|
map[string]string{
|
|
severityAlertLabelKey: "info",
|
|
partOfAlertLabelKey: partOfAlertLabelValue,
|
|
componentAlertLabelKey: componentAlertLabelValue,
|
|
},
|
|
),
|
|
}
|
|
}
|
|
|
|
func newPrometheusRule(namespace string) *promv1.PrometheusRule {
|
|
return &promv1.PrometheusRule{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: ruleName,
|
|
Namespace: namespace,
|
|
Labels: map[string]string{
|
|
common.CDIComponentLabel: "",
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
Spec: promv1.PrometheusRuleSpec{
|
|
Groups: []promv1.RuleGroup{
|
|
{
|
|
Name: "cdi.rules",
|
|
Rules: append(getRecordRules(namespace), getAlertRules()...),
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func newPrometheusRole(namespace string) *rbacv1.Role {
|
|
return &rbacv1.Role{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: rbacName,
|
|
Namespace: namespace,
|
|
Labels: map[string]string{
|
|
common.CDIComponentLabel: "",
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
Rules: []rbacv1.PolicyRule{
|
|
{
|
|
APIGroups: []string{
|
|
"",
|
|
},
|
|
Resources: []string{
|
|
"services",
|
|
"endpoints",
|
|
"pods",
|
|
},
|
|
Verbs: []string{
|
|
"get", "list", "watch",
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func newPrometheusRoleBinding(namespace string) *rbacv1.RoleBinding {
|
|
monitoringNamespace := getMonitoringNamespace()
|
|
|
|
return &rbacv1.RoleBinding{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: rbacName,
|
|
Namespace: namespace,
|
|
Labels: map[string]string{
|
|
common.CDIComponentLabel: "",
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
RoleRef: rbacv1.RoleRef{
|
|
APIGroup: "rbac.authorization.k8s.io",
|
|
Kind: "Role",
|
|
Name: rbacName,
|
|
},
|
|
Subjects: []rbacv1.Subject{
|
|
{
|
|
Kind: "ServiceAccount",
|
|
Namespace: monitoringNamespace,
|
|
Name: "prometheus-k8s",
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func getMonitoringNamespace() string {
|
|
if ns := os.Getenv("MONITORING_NAMESPACE"); ns != "" {
|
|
return ns
|
|
}
|
|
|
|
return defaultMonitoringNs
|
|
}
|
|
|
|
func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor {
|
|
return &promv1.ServiceMonitor{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Namespace: namespace,
|
|
Name: monitorName,
|
|
Labels: map[string]string{
|
|
common.CDIComponentLabel: "",
|
|
"openshift.io/cluster-monitoring": "",
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
Spec: promv1.ServiceMonitorSpec{
|
|
Selector: metav1.LabelSelector{
|
|
MatchLabels: map[string]string{
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
NamespaceSelector: promv1.NamespaceSelector{
|
|
MatchNames: []string{namespace},
|
|
},
|
|
Endpoints: []promv1.Endpoint{
|
|
{
|
|
Port: "metrics",
|
|
Scheme: "http",
|
|
TLSConfig: &promv1.TLSConfig{
|
|
InsecureSkipVerify: true,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func generateAlertRule(alert, expr, duration string, annotations, labels map[string]string) promv1.Rule {
|
|
return promv1.Rule{
|
|
Alert: alert,
|
|
Expr: intstr.FromString(expr),
|
|
For: duration,
|
|
Annotations: annotations,
|
|
Labels: labels,
|
|
}
|
|
}
|
|
|
|
func generateRecordRule(record, expr string) promv1.Rule {
|
|
return promv1.Rule{
|
|
Record: record,
|
|
Expr: intstr.FromString(expr),
|
|
}
|
|
}
|
|
|
|
func (r *ReconcileCDI) watchPrometheusResources() error {
|
|
listObjs := []client.ObjectList{
|
|
&promv1.PrometheusRuleList{},
|
|
&promv1.ServiceMonitorList{},
|
|
}
|
|
|
|
objs := []client.Object{
|
|
&promv1.PrometheusRule{},
|
|
&promv1.ServiceMonitor{},
|
|
}
|
|
|
|
for i, listObj := range listObjs {
|
|
obj := objs[i]
|
|
err := r.uncachedClient.List(context.TODO(), listObj, &client.ListOptions{
|
|
Namespace: util.GetNamespace(),
|
|
Limit: 1,
|
|
})
|
|
if err == nil {
|
|
if err := r.controller.Watch(&source.Kind{Type: obj}, enqueueCDI(r.client)); err != nil {
|
|
return err
|
|
}
|
|
} else if meta.IsNoMatchError(err) {
|
|
log.Info("Not watching", "type", fmt.Sprintf("%T", obj))
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
|
|
objs = []client.Object{
|
|
&rbacv1.Role{},
|
|
&rbacv1.RoleBinding{},
|
|
}
|
|
|
|
for _, obj := range objs {
|
|
if err := r.controller.Watch(&source.Kind{Type: obj}, enqueueCDI(r.client)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|