mirror of
https://github.com/kubevirt/containerized-data-importer.git
synced 2025-06-03 06:30:22 +00:00

* Deploy alerts infra as part of our installation. Conditionally deploy the infrastructure that is needed to fire alerts for our users when bad things are happening to CDI. Testing with `KUBEVIRT_DEPLOY_PROMETHEUS=true` Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Watch and unit test all Prometheus-related resources Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Add gateway for changing monitoring namespace (RBAC purposes) Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Refactor test to check for exact alert name and firing state Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Align pattern of ensuring Prometheus resource exists for all Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Remove potentially noisy event Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Extract duplicate code to function Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com> * Don't use empty value for Prometheus label due to open issue https://github.com/prometheus-operator/prometheus-operator/issues/4325 Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>
415 lines
10 KiB
Go
415 lines
10 KiB
Go
/*
|
|
Copyright 2018 The CDI Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package controller
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"os"
|
|
"reflect"
|
|
|
|
promv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
|
|
"github.com/go-logr/logr"
|
|
rbacv1 "k8s.io/api/rbac/v1"
|
|
"k8s.io/apimachinery/pkg/api/errors"
|
|
"k8s.io/apimachinery/pkg/api/meta"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/util/intstr"
|
|
"kubevirt.io/containerized-data-importer/pkg/common"
|
|
"kubevirt.io/containerized-data-importer/pkg/controller"
|
|
"kubevirt.io/containerized-data-importer/pkg/util"
|
|
"sigs.k8s.io/controller-runtime/pkg/client"
|
|
"sigs.k8s.io/controller-runtime/pkg/source"
|
|
)
|
|
|
|
// Names of the Prometheus-related objects handled in this file, plus the
// fallback namespace of the monitoring stack.
const (
	// ruleName is the name of the PrometheusRule object.
	ruleName = "prometheus-cdi-rules"
	// rbacName is the name shared by the Role and the RoleBinding.
	rbacName = "cdi-monitoring"
	// monitorName is the name of the ServiceMonitor object.
	monitorName = "service-monitor-cdi"
	// defaultMonitoringNs is the monitoring namespace used when the
	// MONITORING_NAMESPACE environment variable is unset or empty.
	defaultMonitoringNs = "monitoring"
)
|
|
|
|
func ensurePrometheusRuleExists(logger logr.Logger, c client.Client, owner metav1.Object) error {
|
|
namespace := owner.GetNamespace()
|
|
if namespace == "" {
|
|
return fmt.Errorf("cluster scoped owner not supported")
|
|
}
|
|
|
|
cr, err := controller.GetActiveCDI(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if cr == nil {
|
|
return fmt.Errorf("no active CDI")
|
|
}
|
|
installerLabels := util.GetRecommendedInstallerLabelsFromCr(cr)
|
|
|
|
desiredRule := newPrometheusRule(namespace)
|
|
util.SetRecommendedLabels(desiredRule, installerLabels, "cdi-operator")
|
|
|
|
if deployed, err := isPrometheusDeployed(logger, c, namespace); err != nil {
|
|
return err
|
|
} else if !deployed {
|
|
return nil
|
|
}
|
|
|
|
if err := c.Create(context.TODO(), desiredRule); err != nil {
|
|
if errors.IsAlreadyExists(err) {
|
|
currentRule := &promv1.PrometheusRule{}
|
|
key := client.ObjectKey{Namespace: namespace, Name: ruleName}
|
|
if err := c.Get(context.TODO(), key, currentRule); err != nil {
|
|
return err
|
|
}
|
|
if !reflect.DeepEqual(currentRule.Spec, desiredRule.Spec) {
|
|
currentRule.Spec = desiredRule.Spec
|
|
if err := c.Update(context.TODO(), currentRule); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func ensurePrometheusRbacExists(logger logr.Logger, c client.Client, owner metav1.Object) error {
|
|
namespace := owner.GetNamespace()
|
|
if namespace == "" {
|
|
return fmt.Errorf("cluster scoped owner not supported")
|
|
}
|
|
|
|
cr, err := controller.GetActiveCDI(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if cr == nil {
|
|
return fmt.Errorf("no active CDI")
|
|
}
|
|
installerLabels := util.GetRecommendedInstallerLabelsFromCr(cr)
|
|
|
|
desiredRole := newPrometheusRole(namespace)
|
|
util.SetRecommendedLabels(desiredRole, installerLabels, "cdi-operator")
|
|
desiredRoleBinding := newPrometheusRoleBinding(namespace)
|
|
util.SetRecommendedLabels(desiredRoleBinding, installerLabels, "cdi-operator")
|
|
|
|
if deployed, err := isPrometheusDeployed(logger, c, namespace); err != nil {
|
|
return err
|
|
} else if !deployed {
|
|
return nil
|
|
}
|
|
|
|
key := client.ObjectKey{Namespace: namespace, Name: rbacName}
|
|
if err := c.Create(context.TODO(), desiredRole); err != nil {
|
|
if errors.IsAlreadyExists(err) {
|
|
currentRole := &rbacv1.Role{}
|
|
if err := c.Get(context.TODO(), key, currentRole); err != nil {
|
|
return err
|
|
}
|
|
if !reflect.DeepEqual(currentRole.Rules, desiredRole.Rules) {
|
|
currentRole.Rules = desiredRole.Rules
|
|
if err := c.Update(context.TODO(), currentRole); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
if err := c.Create(context.TODO(), desiredRoleBinding); err != nil {
|
|
if errors.IsAlreadyExists(err) {
|
|
currentRoleBinding := &rbacv1.RoleBinding{}
|
|
if err := c.Get(context.TODO(), key, currentRoleBinding); err != nil {
|
|
return err
|
|
}
|
|
if !reflect.DeepEqual(currentRoleBinding.Subjects, desiredRoleBinding.Subjects) {
|
|
currentRoleBinding.Subjects = desiredRoleBinding.Subjects
|
|
if err := c.Update(context.TODO(), currentRoleBinding); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func ensurePrometheusServiceMonitorExists(logger logr.Logger, c client.Client, owner metav1.Object) error {
|
|
namespace := owner.GetNamespace()
|
|
if namespace == "" {
|
|
return fmt.Errorf("cluster scoped owner not supported")
|
|
}
|
|
|
|
cr, err := controller.GetActiveCDI(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if cr == nil {
|
|
return fmt.Errorf("no active CDI")
|
|
}
|
|
installerLabels := util.GetRecommendedInstallerLabelsFromCr(cr)
|
|
|
|
desiredMonitor := newPrometheusServiceMonitor(namespace)
|
|
util.SetRecommendedLabels(desiredMonitor, installerLabels, "cdi-operator")
|
|
|
|
if deployed, err := isPrometheusDeployed(logger, c, namespace); err != nil {
|
|
return err
|
|
} else if !deployed {
|
|
return nil
|
|
}
|
|
|
|
key := client.ObjectKey{Namespace: namespace, Name: monitorName}
|
|
if err := c.Create(context.TODO(), desiredMonitor); err != nil {
|
|
if errors.IsAlreadyExists(err) {
|
|
currentMonitor := &promv1.ServiceMonitor{}
|
|
if err := c.Get(context.TODO(), key, currentMonitor); err != nil {
|
|
return err
|
|
}
|
|
if !reflect.DeepEqual(currentMonitor.Spec, desiredMonitor.Spec) {
|
|
currentMonitor.Spec = desiredMonitor.Spec
|
|
if err := c.Update(context.TODO(), currentMonitor); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string) (bool, error) {
|
|
rule := &promv1.PrometheusRule{}
|
|
key := client.ObjectKey{Namespace: namespace, Name: ruleName}
|
|
if err := c.Get(context.TODO(), key, rule); err != nil {
|
|
if meta.IsNoMatchError(err) {
|
|
logger.V(3).Info("No match error for PrometheusRule, must not have prometheus deployed")
|
|
return false, nil
|
|
} else if !errors.IsNotFound(err) {
|
|
return false, err
|
|
}
|
|
}
|
|
|
|
return true, nil
|
|
}
|
|
|
|
func newPrometheusRule(namespace string) *promv1.PrometheusRule {
|
|
return &promv1.PrometheusRule{
|
|
TypeMeta: metav1.TypeMeta{
|
|
APIVersion: promv1.SchemeGroupVersion.String(),
|
|
Kind: "PrometheusRule",
|
|
},
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: ruleName,
|
|
Namespace: namespace,
|
|
Labels: map[string]string{
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
Spec: promv1.PrometheusRuleSpec{
|
|
Groups: []promv1.RuleGroup{
|
|
{
|
|
Name: "cdi.rules",
|
|
Rules: []promv1.Rule{
|
|
generateRecordRule(
|
|
"cdi_num_up_operators",
|
|
fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace),
|
|
),
|
|
generateAlertRule(
|
|
"CdiOperatorDown",
|
|
"cdi_num_up_operators == 0",
|
|
"5m",
|
|
map[string]string{
|
|
"summary": "CDI operator is down",
|
|
},
|
|
map[string]string{
|
|
"severity": "warning",
|
|
},
|
|
),
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func newPrometheusRole(namespace string) *rbacv1.Role {
|
|
return &rbacv1.Role{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: rbacName,
|
|
Namespace: namespace,
|
|
Labels: map[string]string{
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
Rules: []rbacv1.PolicyRule{
|
|
{
|
|
APIGroups: []string{
|
|
"",
|
|
},
|
|
Resources: []string{
|
|
"services",
|
|
"endpoints",
|
|
"pods",
|
|
},
|
|
Verbs: []string{
|
|
"get", "list", "watch",
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func newPrometheusRoleBinding(namespace string) *rbacv1.RoleBinding {
|
|
monitoringNamespace := getMonitoringNamespace()
|
|
|
|
return &rbacv1.RoleBinding{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: rbacName,
|
|
Namespace: namespace,
|
|
Labels: map[string]string{
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
RoleRef: rbacv1.RoleRef{
|
|
APIGroup: "rbac.authorization.k8s.io",
|
|
Kind: "Role",
|
|
Name: rbacName,
|
|
},
|
|
Subjects: []rbacv1.Subject{
|
|
{
|
|
Kind: "ServiceAccount",
|
|
Namespace: monitoringNamespace,
|
|
Name: "prometheus-k8s",
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func getMonitoringNamespace() string {
|
|
if ns := os.Getenv("MONITORING_NAMESPACE"); ns != "" {
|
|
return ns
|
|
}
|
|
|
|
return defaultMonitoringNs
|
|
}
|
|
|
|
func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor {
|
|
return &promv1.ServiceMonitor{
|
|
TypeMeta: metav1.TypeMeta{
|
|
APIVersion: promv1.SchemeGroupVersion.String(),
|
|
Kind: "ServiceMonitor",
|
|
},
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Namespace: namespace,
|
|
Name: monitorName,
|
|
Labels: map[string]string{
|
|
"openshift.io/cluster-monitoring": "",
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
Spec: promv1.ServiceMonitorSpec{
|
|
Selector: metav1.LabelSelector{
|
|
MatchLabels: map[string]string{
|
|
common.PrometheusLabelKey: common.PrometheusLabelValue,
|
|
},
|
|
},
|
|
NamespaceSelector: promv1.NamespaceSelector{
|
|
MatchNames: []string{namespace},
|
|
},
|
|
Endpoints: []promv1.Endpoint{
|
|
{
|
|
Port: "metrics",
|
|
Scheme: "http",
|
|
TLSConfig: &promv1.TLSConfig{
|
|
InsecureSkipVerify: true,
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
func generateAlertRule(alert, expr, duration string, annotations, labels map[string]string) promv1.Rule {
|
|
return promv1.Rule{
|
|
Alert: alert,
|
|
Expr: intstr.FromString(expr),
|
|
For: duration,
|
|
Annotations: annotations,
|
|
Labels: labels,
|
|
}
|
|
}
|
|
|
|
func generateRecordRule(record, expr string) promv1.Rule {
|
|
return promv1.Rule{
|
|
Record: record,
|
|
Expr: intstr.FromString(expr),
|
|
}
|
|
}
|
|
|
|
func (r *ReconcileCDI) watchPrometheusResources() error {
|
|
var err error
|
|
|
|
err = r.controller.Watch(
|
|
&source.Kind{Type: &promv1.PrometheusRule{}},
|
|
enqueueCDI(r.client),
|
|
)
|
|
if err != nil {
|
|
if meta.IsNoMatchError(err) {
|
|
log.Info("Not watching PrometheusRules")
|
|
return nil
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
err = r.controller.Watch(
|
|
&source.Kind{Type: &promv1.ServiceMonitor{}},
|
|
enqueueCDI(r.client),
|
|
)
|
|
if err != nil {
|
|
if meta.IsNoMatchError(err) {
|
|
log.Info("Not watching ServiceMonitors")
|
|
return nil
|
|
}
|
|
|
|
return err
|
|
}
|
|
|
|
err = r.controller.Watch(
|
|
&source.Kind{Type: &rbacv1.Role{}},
|
|
enqueueCDI(r.client),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
err = r.controller.Watch(
|
|
&source.Kind{Type: &rbacv1.RoleBinding{}},
|
|
enqueueCDI(r.client),
|
|
)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|