containerized-data-importer/pkg/operator/controller/prometheus.go
akalenyu 50c93e8b0e
Deploy alerts infra as part of our installation (#1979)
* Deploy alerts infra as part of our installation

Conditionally deploy the infrastructure that is needed to fire alerts for our users
when bad things are happening to CDI.

Testing with `KUBEVIRT_DEPLOY_PROMETHEUS=true`

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Watch and unit test all prometheus related resources

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Add gateway for changing monitoring namespace (RBAC purposes)

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Refactor test to check for exact alert name and firing state

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Align pattern of ensuring prometheus resource exists for all

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Remove potential noisy event

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Extract duplicate code to function

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>

* Don't use empty value for prometheus label due to open issue

https://github.com/prometheus-operator/prometheus-operator/issues/4325

Signed-off-by: Alex Kalenyuk <akalenyu@redhat.com>
2021-10-26 21:26:07 +02:00

415 lines
10 KiB
Go

/*
Copyright 2018 The CDI Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package controller
import (
"context"
"fmt"
"os"
"reflect"
promv1 "github.com/coreos/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/go-logr/logr"
rbacv1 "k8s.io/api/rbac/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"kubevirt.io/containerized-data-importer/pkg/common"
"kubevirt.io/containerized-data-importer/pkg/controller"
"kubevirt.io/containerized-data-importer/pkg/util"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/source"
)
// Object names used for the monitoring resources built in this file.
const (
	// monitorName is the name of the ServiceMonitor object.
	monitorName = "service-monitor-cdi"
	// rbacName is shared by the Role and the RoleBinding that references it.
	rbacName = "cdi-monitoring"
	// ruleName is the name of the PrometheusRule object.
	ruleName = "prometheus-cdi-rules"
)

// defaultMonitoringNs is the namespace returned by getMonitoringNamespace when
// the MONITORING_NAMESPACE environment variable is unset or empty.
const defaultMonitoringNs = "monitoring"
// ensurePrometheusRuleExists creates the CDI PrometheusRule in the owner's
// namespace, or, when one already exists, overwrites its Spec if it differs
// from the desired Spec.
//
// An error is returned when the owner has no namespace, when the active CDI
// cannot be fetched or is nil, or when an API call fails. When
// isPrometheusDeployed reports false nothing is created and nil is returned.
func ensurePrometheusRuleExists(logger logr.Logger, c client.Client, owner metav1.Object) error {
	ns := owner.GetNamespace()
	if ns == "" {
		return fmt.Errorf("cluster scoped owner not supported")
	}

	activeCDI, err := controller.GetActiveCDI(c)
	if err != nil {
		return err
	}
	if activeCDI == nil {
		return fmt.Errorf("no active CDI")
	}

	recommended := util.GetRecommendedInstallerLabelsFromCr(activeCDI)
	want := newPrometheusRule(ns)
	util.SetRecommendedLabels(want, recommended, "cdi-operator")

	deployed, err := isPrometheusDeployed(logger, c, ns)
	if err != nil {
		return err
	}
	if !deployed {
		return nil
	}

	ctx := context.TODO()
	createErr := c.Create(ctx, want)
	if createErr == nil {
		return nil
	}
	if !errors.IsAlreadyExists(createErr) {
		return createErr
	}

	// The rule is already there: fetch it and reconcile only the Spec.
	have := &promv1.PrometheusRule{}
	if err := c.Get(ctx, client.ObjectKey{Namespace: ns, Name: ruleName}, have); err != nil {
		return err
	}
	if reflect.DeepEqual(have.Spec, want.Spec) {
		return nil
	}
	have.Spec = want.Spec
	return c.Update(ctx, have)
}
// ensurePrometheusRbacExists creates the monitoring Role and RoleBinding in
// the owner's namespace. For objects that already exist, the Role's Rules and
// the RoleBinding's Subjects are overwritten when they differ from the desired
// values; no other fields are compared.
//
// An error is returned when the owner has no namespace, when the active CDI
// cannot be fetched or is nil, or when an API call fails. When
// isPrometheusDeployed reports false nothing is created and nil is returned.
func ensurePrometheusRbacExists(logger logr.Logger, c client.Client, owner metav1.Object) error {
	ns := owner.GetNamespace()
	if ns == "" {
		return fmt.Errorf("cluster scoped owner not supported")
	}

	activeCDI, err := controller.GetActiveCDI(c)
	if err != nil {
		return err
	}
	if activeCDI == nil {
		return fmt.Errorf("no active CDI")
	}

	recommended := util.GetRecommendedInstallerLabelsFromCr(activeCDI)
	wantRole := newPrometheusRole(ns)
	util.SetRecommendedLabels(wantRole, recommended, "cdi-operator")
	wantBinding := newPrometheusRoleBinding(ns)
	util.SetRecommendedLabels(wantBinding, recommended, "cdi-operator")

	deployed, err := isPrometheusDeployed(logger, c, ns)
	if err != nil {
		return err
	}
	if !deployed {
		return nil
	}

	ctx := context.TODO()
	// Role and RoleBinding share the same name, hence the same lookup key.
	objKey := client.ObjectKey{Namespace: ns, Name: rbacName}

	switch err := c.Create(ctx, wantRole); {
	case err == nil:
		// Freshly created, nothing to reconcile.
	case !errors.IsAlreadyExists(err):
		return err
	default:
		haveRole := &rbacv1.Role{}
		if err := c.Get(ctx, objKey, haveRole); err != nil {
			return err
		}
		if !reflect.DeepEqual(haveRole.Rules, wantRole.Rules) {
			haveRole.Rules = wantRole.Rules
			if err := c.Update(ctx, haveRole); err != nil {
				return err
			}
		}
	}

	switch err := c.Create(ctx, wantBinding); {
	case err == nil:
		// Freshly created, nothing to reconcile.
	case !errors.IsAlreadyExists(err):
		return err
	default:
		haveBinding := &rbacv1.RoleBinding{}
		if err := c.Get(ctx, objKey, haveBinding); err != nil {
			return err
		}
		if !reflect.DeepEqual(haveBinding.Subjects, wantBinding.Subjects) {
			haveBinding.Subjects = wantBinding.Subjects
			if err := c.Update(ctx, haveBinding); err != nil {
				return err
			}
		}
	}

	return nil
}
// ensurePrometheusServiceMonitorExists creates the CDI ServiceMonitor in the
// owner's namespace, or, when one already exists, overwrites its Spec if it
// differs from the desired Spec.
//
// An error is returned when the owner has no namespace, when the active CDI
// cannot be fetched or is nil, or when an API call fails. When
// isPrometheusDeployed reports false nothing is created and nil is returned.
func ensurePrometheusServiceMonitorExists(logger logr.Logger, c client.Client, owner metav1.Object) error {
	ns := owner.GetNamespace()
	if ns == "" {
		return fmt.Errorf("cluster scoped owner not supported")
	}

	activeCDI, err := controller.GetActiveCDI(c)
	if err != nil {
		return err
	}
	if activeCDI == nil {
		return fmt.Errorf("no active CDI")
	}

	recommended := util.GetRecommendedInstallerLabelsFromCr(activeCDI)
	want := newPrometheusServiceMonitor(ns)
	util.SetRecommendedLabels(want, recommended, "cdi-operator")

	deployed, err := isPrometheusDeployed(logger, c, ns)
	if err != nil {
		return err
	}
	if !deployed {
		return nil
	}

	ctx := context.TODO()
	createErr := c.Create(ctx, want)
	if createErr == nil {
		return nil
	}
	if !errors.IsAlreadyExists(createErr) {
		return createErr
	}

	// The monitor is already there: fetch it and reconcile only the Spec.
	have := &promv1.ServiceMonitor{}
	if err := c.Get(ctx, client.ObjectKey{Namespace: ns, Name: monitorName}, have); err != nil {
		return err
	}
	if reflect.DeepEqual(have.Spec, want.Spec) {
		return nil
	}
	have.Spec = want.Spec
	return c.Update(ctx, have)
}
// isPrometheusDeployed probes for Prometheus by issuing a Get for the CDI
// PrometheusRule in the given namespace.
//
// Outcomes of the Get:
//   - success or a NotFound error: (true, nil)
//   - a no-match error: logged at verbosity 3 and reported as (false, nil)
//   - any other error: (false, err)
func isPrometheusDeployed(logger logr.Logger, c client.Client, namespace string) (bool, error) {
	lookup := client.ObjectKey{Namespace: namespace, Name: ruleName}
	err := c.Get(context.TODO(), lookup, &promv1.PrometheusRule{})

	switch {
	case err == nil:
		return true, nil
	case meta.IsNoMatchError(err):
		logger.V(3).Info("No match error for PrometheusRule, must not have prometheus deployed")
		return false, nil
	case errors.IsNotFound(err):
		// The kind is known, the object just has not been created yet.
		return true, nil
	default:
		return false, err
	}
}
// newPrometheusRule builds the desired PrometheusRule for the given namespace.
// It holds a single group, "cdi.rules", with two rules: a record rule
// "cdi_num_up_operators" whose expression is scoped to the namespace, and an
// alert rule "CdiOperatorDown" on that recorded value being 0, with a "5m"
// duration and "warning" severity.
func newPrometheusRule(namespace string) *promv1.PrometheusRule {
	upOperatorsExpr := fmt.Sprintf("sum(up{namespace='%s', pod=~'cdi-operator-.*'} or vector(0))", namespace)
	upOperatorsRecord := generateRecordRule("cdi_num_up_operators", upOperatorsExpr)

	operatorDownAlert := generateAlertRule(
		"CdiOperatorDown",
		"cdi_num_up_operators == 0",
		"5m",
		map[string]string{"summary": "CDI operator is down"},
		map[string]string{"severity": "warning"},
	)

	cdiGroup := promv1.RuleGroup{
		Name:  "cdi.rules",
		Rules: []promv1.Rule{upOperatorsRecord, operatorDownAlert},
	}

	return &promv1.PrometheusRule{
		TypeMeta: metav1.TypeMeta{
			Kind:       "PrometheusRule",
			APIVersion: promv1.SchemeGroupVersion.String(),
		},
		ObjectMeta: metav1.ObjectMeta{
			Namespace: namespace,
			Name:      ruleName,
			Labels: map[string]string{
				common.PrometheusLabelKey: common.PrometheusLabelValue,
			},
		},
		Spec: promv1.PrometheusRuleSpec{
			Groups: []promv1.RuleGroup{cdiGroup},
		},
	}
}
// newPrometheusRole builds the desired Role for the given namespace. Its one
// policy rule grants get, list and watch on services, endpoints and pods in
// the "" API group.
func newPrometheusRole(namespace string) *rbacv1.Role {
	readRule := rbacv1.PolicyRule{
		APIGroups: []string{""},
		Resources: []string{"services", "endpoints", "pods"},
		Verbs:     []string{"get", "list", "watch"},
	}

	objMeta := metav1.ObjectMeta{
		Namespace: namespace,
		Name:      rbacName,
		Labels: map[string]string{
			common.PrometheusLabelKey: common.PrometheusLabelValue,
		},
	}

	return &rbacv1.Role{
		ObjectMeta: objMeta,
		Rules:      []rbacv1.PolicyRule{readRule},
	}
}
// newPrometheusRoleBinding builds the desired RoleBinding for the given
// namespace. It references the Role named rbacName and has one subject: the
// "prometheus-k8s" ServiceAccount in the namespace returned by
// getMonitoringNamespace.
func newPrometheusRoleBinding(namespace string) *rbacv1.RoleBinding {
	prometheusAccount := rbacv1.Subject{
		Kind:      "ServiceAccount",
		Namespace: getMonitoringNamespace(),
		Name:      "prometheus-k8s",
	}

	roleRef := rbacv1.RoleRef{
		APIGroup: "rbac.authorization.k8s.io",
		Kind:     "Role",
		Name:     rbacName,
	}

	objMeta := metav1.ObjectMeta{
		Namespace: namespace,
		Name:      rbacName,
		Labels: map[string]string{
			common.PrometheusLabelKey: common.PrometheusLabelValue,
		},
	}

	return &rbacv1.RoleBinding{
		ObjectMeta: objMeta,
		RoleRef:    roleRef,
		Subjects:   []rbacv1.Subject{prometheusAccount},
	}
}
// getMonitoringNamespace returns the value of the MONITORING_NAMESPACE
// environment variable, falling back to defaultMonitoringNs when the variable
// is unset or empty.
func getMonitoringNamespace() string {
	configured := os.Getenv("MONITORING_NAMESPACE")
	if configured == "" {
		return defaultMonitoringNs
	}
	return configured
}
// newPrometheusServiceMonitor builds the desired ServiceMonitor for the given
// namespace. It selects objects carrying the CDI prometheus label, restricted
// to that namespace, and declares one endpoint: port "metrics", scheme "http",
// with a TLS config that sets InsecureSkipVerify.
func newPrometheusServiceMonitor(namespace string) *promv1.ServiceMonitor {
	metricsEndpoint := promv1.Endpoint{
		Port:   "metrics",
		Scheme: "http",
		TLSConfig: &promv1.TLSConfig{
			InsecureSkipVerify: true,
		},
	}

	monitorSpec := promv1.ServiceMonitorSpec{
		Selector: metav1.LabelSelector{
			MatchLabels: map[string]string{
				common.PrometheusLabelKey: common.PrometheusLabelValue,
			},
		},
		NamespaceSelector: promv1.NamespaceSelector{
			MatchNames: []string{namespace},
		},
		Endpoints: []promv1.Endpoint{metricsEndpoint},
	}

	objMeta := metav1.ObjectMeta{
		Name:      monitorName,
		Namespace: namespace,
		Labels: map[string]string{
			"openshift.io/cluster-monitoring": "",
			common.PrometheusLabelKey:         common.PrometheusLabelValue,
		},
	}

	return &promv1.ServiceMonitor{
		TypeMeta: metav1.TypeMeta{
			Kind:       "ServiceMonitor",
			APIVersion: promv1.SchemeGroupVersion.String(),
		},
		ObjectMeta: objMeta,
		Spec:       monitorSpec,
	}
}
// generateAlertRule returns a promv1.Rule with Alert, Expr, For, Annotations
// and Labels populated from the arguments. expr is wrapped with
// intstr.FromString; duration is stored in For unchanged.
func generateAlertRule(alert, expr, duration string, annotations, labels map[string]string) promv1.Rule {
	rule := promv1.Rule{Alert: alert, For: duration}
	rule.Expr = intstr.FromString(expr)
	rule.Labels = labels
	rule.Annotations = annotations
	return rule
}
// generateRecordRule returns a promv1.Rule with only Record and Expr set;
// expr is wrapped with intstr.FromString.
func generateRecordRule(record, expr string) promv1.Rule {
	var rule promv1.Rule
	rule.Record = record
	rule.Expr = intstr.FromString(expr)
	return rule
}
// watchPrometheusResources registers watches, in order, on PrometheusRule,
// ServiceMonitor, Role and RoleBinding, each handled by enqueueCDI(r.client).
//
// A no-match error while registering the PrometheusRule or ServiceMonitor
// watch is logged and ends the function with a nil error, so the watches that
// would follow are not registered. Any other error is returned to the caller.
func (r *ReconcileCDI) watchPrometheusResources() error {
	if err := r.controller.Watch(
		&source.Kind{Type: &promv1.PrometheusRule{}},
		enqueueCDI(r.client),
	); err != nil {
		if !meta.IsNoMatchError(err) {
			return err
		}
		log.Info("Not watching PrometheusRules")
		return nil
	}

	if err := r.controller.Watch(
		&source.Kind{Type: &promv1.ServiceMonitor{}},
		enqueueCDI(r.client),
	); err != nil {
		if !meta.IsNoMatchError(err) {
			return err
		}
		log.Info("Not watching ServiceMonitors")
		return nil
	}

	if err := r.controller.Watch(
		&source.Kind{Type: &rbacv1.Role{}},
		enqueueCDI(r.client),
	); err != nil {
		return err
	}

	return r.controller.Watch(
		&source.Kind{Type: &rbacv1.RoleBinding{}},
		enqueueCDI(r.client),
	)
}