diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 53b8d47a..a6b37dbe 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -39,6 +39,13 @@ rules: - services/status verbs: - get +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch - apiGroups: - "" resources: diff --git a/controllers/event_recorder.go b/controllers/event_recorder.go new file mode 100644 index 00000000..b5cf321a --- /dev/null +++ b/controllers/event_recorder.go @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controllers + +import ( + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" +) + +// noOpEventRecorder is a record.EventRecorder that discards all events. +// It is used as a default so that a reconciler's Recorder is never nil, +// allowing event call-sites to invoke it unconditionally. +type noOpEventRecorder struct{} + +var _ record.EventRecorder = noOpEventRecorder{} + +func (noOpEventRecorder) Event(_ runtime.Object, _, _, _ string) {} + +func (noOpEventRecorder) Eventf(_ runtime.Object, _, _, _ string, _ ...interface{}) {} + +func (noOpEventRecorder) AnnotatedEventf(_ runtime.Object, _ map[string]string, _, _, _ string, _ ...interface{}) { +} diff --git a/controllers/events_test.go b/controllers/events_test.go new file mode 100644 index 00000000..f2cdb56f --- /dev/null +++ b/controllers/events_test.go @@ -0,0 +1,298 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controllers + +import ( + "context" + "errors" + "strings" + "testing" + + solrv1beta1 "github.com/apache/solr-operator/api/v1beta1" + "github.com/apache/solr-operator/controllers/util" + "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" +) + +// errForcedPatchFailure is returned by the fake client to force the status patch to fail, +// so that the "could not patch readiness condition" event path is exercised. +var errForcedPatchFailure = errors.New("forced patch failure") + +// requireEvent asserts that one event was recorded whose type and reason match. +// record.FakeRecorder formats each event as " ". +func requireEvent(t *testing.T, rec *record.FakeRecorder, wantType, wantReason string) { + t.Helper() + select { + case got := <-rec.Events: + if !strings.HasPrefix(got, wantType+" "+wantReason+" ") { + t.Errorf("expected event of type %q with reason %q, got %q", wantType, wantReason, got) + } + default: + t.Fatalf("expected an event with reason %q to be recorded, but none was", wantReason) + } +} + +// requireNoEvent asserts that no event is currently buffered on the recorder. +func requireNoEvent(t *testing.T, rec *record.FakeRecorder) { + t.Helper() + select { + case got := <-rec.Events: + t.Fatalf("expected no event to be recorded, but got %q", got) + default: + } +} + +// solrCloudWithStorageRequest builds a minimal SolrCloud requesting the given persistent data size. +func solrCloudWithStorageRequest(size string) *solrv1beta1.SolrCloud { + return &solrv1beta1.SolrCloud{ + ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test"}, + Spec: solrv1beta1.SolrCloudSpec{ + StorageOptions: solrv1beta1.SolrDataStorageOptions{ + PersistentStorage: &solrv1beta1.SolrPersistentDataStorageOptions{ + PersistentVolumeClaimTemplate: solrv1beta1.PersistentVolumeClaimTemplate{ + Spec: corev1.PersistentVolumeClaimSpec{ + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: resource.MustParse(size), + }, + }, + }, + }, + }, + }, + }, + } +} + +// podWithReadinessGate builds a Pod that advertises the given readiness gate (and no status yet). +func podWithReadinessGate(condType corev1.PodConditionType) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "test"}, + Spec: corev1.PodSpec{ + ReadinessGates: []corev1.PodReadinessGate{{ConditionType: condType}}, + }, + } +} + +// failingStatusPatchClient returns a fake client whose Status().Patch always fails, so that +// readiness-condition patch failures (and their events) can be exercised deterministically. +func failingStatusPatchClient() client.Client { + return fake.NewClientBuilder(). + WithScheme(clientgoscheme.Scheme). + WithInterceptorFuncs(interceptor.Funcs{ + SubResourcePatch: func(_ context.Context, _ client.Client, _ string, _ client.Object, _ client.Patch, _ ...client.SubResourcePatchOption) error { + return errForcedPatchFailure + }, + }). + Build() +} + +// TestDeterminePvcExpansionEmitsErrorEventOnBadAnnotation verifies a PVCExpansionError warning is +// emitted when the existing minimum-size annotation recorded on the StatefulSet cannot be parsed. +func TestDeterminePvcExpansionEmitsErrorEventOnBadAnnotation(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Recorder: rec} + instance := solrCloudWithStorageRequest("5Gi") + sts := &appsv1.StatefulSet{} + sts.Annotations = map[string]string{util.StorageMinimumSizeAnnotation: "not-a-quantity"} + + clusterOp, _, err := determinePvcExpansionClusterOpLockIfNecessary(context.Background(), r, instance, sts, logr.Discard()) + if err == nil { + t.Error("expected an error parsing the existing PVC size annotation, got nil") + } + if clusterOp != nil { + t.Errorf("expected no cluster operation to be started, got %+v", clusterOp) + } + requireEvent(t, rec, corev1.EventTypeWarning, "PVCExpansionError") +} + +// TestDeterminePvcExpansionEmitsForbiddenEventOnShrink verifies a PVCExpansionForbidden warning is +// emitted (and no cluster op started) when the requested size is smaller than the existing size. +func TestDeterminePvcExpansionEmitsForbiddenEventOnShrink(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Recorder: rec} + instance := solrCloudWithStorageRequest("5Gi") + sts := &appsv1.StatefulSet{} + sts.Annotations = map[string]string{util.StorageMinimumSizeAnnotation: "10Gi"} + + clusterOp, _, err := determinePvcExpansionClusterOpLockIfNecessary(context.Background(), r, instance, sts, logr.Discard()) + if err != nil { + t.Errorf("did not expect an error for a shrink request, got %v", err) + } + if clusterOp != nil { + t.Errorf("expected no cluster operation for a shrink request, got %+v", clusterOp) + } + requireEvent(t, rec, corev1.EventTypeWarning, "PVCExpansionForbidden") +} + +// TestDeterminePvcExpansionNoEventWhenSizeUnchanged verifies that the steady-state path (requested +// size matches the recorded size) neither starts a cluster op nor records an event. +func TestDeterminePvcExpansionNoEventWhenSizeUnchanged(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Recorder: rec} + instance := solrCloudWithStorageRequest("5Gi") + sts := &appsv1.StatefulSet{} + sts.Annotations = map[string]string{util.StorageMinimumSizeAnnotation: "5Gi"} + + clusterOp, _, err := determinePvcExpansionClusterOpLockIfNecessary(context.Background(), r, instance, sts, logr.Discard()) + if err != nil { + t.Errorf("did not expect an error, got %v", err) + } + if clusterOp != nil { + t.Errorf("expected no cluster operation, got %+v", clusterOp) + } + requireNoEvent(t, rec) +} + +// TestHandleManagedScaleDownEmitsEventOnBadMetadata verifies a ClusterOperationError warning is +// emitted when the scale-down target stored in the cluster-operation metadata cannot be parsed. +func TestHandleManagedScaleDownEmitsEventOnBadMetadata(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Recorder: rec} + instance := &solrv1beta1.SolrCloud{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test"}} + clusterOp := &SolrClusterOp{Operation: ScaleDownLock, Metadata: "not-an-int"} + + _, _, _, err := handleManagedCloudScaleDown(context.Background(), r, instance, &appsv1.StatefulSet{}, clusterOp, nil, logr.Discard()) + if err == nil { + t.Error("expected an error parsing the scale-down metadata, got nil") + } + requireEvent(t, rec, corev1.EventTypeWarning, "ClusterOperationError") +} + +// TestInitializePodEmitsEventOnPatchFailure verifies that a failed readiness-condition patch while +// starting traffic on a pod surfaces a PodReadinessConditionUpdateFailed warning. +func TestInitializePodEmitsEventOnPatchFailure(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Client: failingStatusPatchClient(), Recorder: rec} + instance := &solrv1beta1.SolrCloud{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test"}} + pod := podWithReadinessGate(util.SolrIsNotStoppedReadinessCondition) + + if _, err := r.initializePod(context.Background(), instance, pod, logr.Discard()); err == nil { + t.Error("expected the forced patch failure to be returned, got nil") + } + requireEvent(t, rec, corev1.EventTypeWarning, "PodReadinessConditionUpdateFailed") +} + +// TestEnsurePodReadinessConditionsEmitsEventOnPatchFailure verifies that a failed readiness-condition +// patch while stopping traffic on a pod surfaces a PodReadinessConditionUpdateFailed warning. +func TestEnsurePodReadinessConditionsEmitsEventOnPatchFailure(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Client: failingStatusPatchClient(), Recorder: rec} + instance := &solrv1beta1.SolrCloud{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test"}} + + pod := podWithReadinessGate(util.SolrIsNotStoppedReadinessCondition) + // Seed an existing condition with a different reason so a change (and thus a patch) is required. + pod.Status.Conditions = []corev1.PodCondition{{ + Type: util.SolrIsNotStoppedReadinessCondition, + Status: corev1.ConditionTrue, + Reason: string(PodStarted), + }} + ensureConditions := map[corev1.PodConditionType]podReadinessConditionChange{ + util.SolrIsNotStoppedReadinessCondition: { + reason: PodUpdate, + message: "Pod is being deleted, traffic to the pod must be stopped", + status: false, + }, + } + + if _, err := EnsurePodReadinessConditions(context.Background(), r, instance, pod, ensureConditions, logr.Discard()); err == nil { + t.Error("expected the forced patch failure to be returned, got nil") + } + requireEvent(t, rec, corev1.EventTypeWarning, "PodReadinessConditionUpdateFailed") +} + +// TestHandleManagedScaleUpEmitsEventOnBadMetadata verifies a ClusterOperationError warning is +// emitted when the scale-up target stored in the cluster-operation metadata cannot be parsed. +func TestHandleManagedScaleUpEmitsEventOnBadMetadata(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Recorder: rec} + instance := &solrv1beta1.SolrCloud{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test"}} + clusterOp := &SolrClusterOp{Operation: ScaleUpLock, Metadata: "not-an-int"} + + if _, _, err := handleManagedCloudScaleUp(context.Background(), r, instance, &appsv1.StatefulSet{}, clusterOp, nil, logr.Discard()); err == nil { + t.Error("expected an error parsing the scale-up metadata, got nil") + } + requireEvent(t, rec, corev1.EventTypeWarning, "ClusterOperationError") +} + +// TestHandlePvcExpansionEmitsEventOnBadMetadata verifies a PVCExpansionError warning is emitted +// when the target PVC size stored in the cluster-operation metadata cannot be parsed. +func TestHandlePvcExpansionEmitsEventOnBadMetadata(t *testing.T) { + rec := record.NewFakeRecorder(8) + r := &SolrCloudReconciler{Recorder: rec} + instance := &solrv1beta1.SolrCloud{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "test"}} + clusterOp := &SolrClusterOp{Operation: PvcExpansionLock, Metadata: "not-a-quantity"} + + if _, _, err := handlePvcExpansion(context.Background(), r, instance, &appsv1.StatefulSet{}, clusterOp, logr.Discard()); err == nil { + t.Error("expected an error parsing the PVC expansion metadata, got nil") + } + requireEvent(t, rec, corev1.EventTypeWarning, "PVCExpansionError") +} + +// TestReconcileSolrCloudBackupEmitsCloudNotReadyEvent verifies a BackupCloudNotReady warning is +// emitted when a backup is attempted against a repository that the SolrCloud has not yet marked +// available. A GCS repository is used so that EnsureDirectoryForBackup is a no-op (no pod exec). +func TestReconcileSolrCloudBackupEmitsCloudNotReadyEvent(t *testing.T) { + scheme := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(scheme); err != nil { + t.Fatalf("could not add client-go types to scheme: %v", err) + } + if err := solrv1beta1.AddToScheme(scheme); err != nil { + t.Fatalf("could not add solr types to scheme: %v", err) + } + + solrCloud := &solrv1beta1.SolrCloud{ + ObjectMeta: metav1.ObjectMeta{Name: "cloud", Namespace: "test"}, + Spec: solrv1beta1.SolrCloudSpec{ + BackupRepositories: []solrv1beta1.SolrBackupRepository{{ + Name: "test-repo", + GCS: &solrv1beta1.GcsRepository{Bucket: "test-bucket"}, + }}, + }, + Status: solrv1beta1.SolrCloudStatus{ + BackupRepositoriesAvailable: map[string]bool{"test-repo": false}, + }, + } + backup := &solrv1beta1.SolrBackup{ + ObjectMeta: metav1.ObjectMeta{Name: "backup", Namespace: "test"}, + Spec: solrv1beta1.SolrBackupSpec{ + SolrCloud: "cloud", + RepositoryName: "test-repo", + }, + } + + rec := record.NewFakeRecorder(8) + r := &SolrBackupReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(solrCloud).Build(), + Recorder: rec, + } + + if _, _, err := r.reconcileSolrCloudBackup(context.Background(), backup, &backup.Status.IndividualSolrBackupStatus, logr.Discard()); err == nil { + t.Error("expected a 'cloud not ready' error, got nil") + } + requireEvent(t, rec, corev1.EventTypeWarning, "BackupCloudNotReady") +} diff --git a/controllers/reconcile_events_test.go b/controllers/reconcile_events_test.go new file mode 100644 index 00000000..8655e8e1 --- /dev/null +++ b/controllers/reconcile_events_test.go @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package controllers + +import ( + "context" + "fmt" + + solrv1beta1 "github.com/apache/solr-operator/api/v1beta1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// expectEvent waits until at least one Event of the given type and reason has been recorded against +// the given object. Events are matched on the involved object's UID so that events left over from +// other specs (cleanupTest does not delete Events) cannot cause a false positive. +func expectEvent(ctx context.Context, involvedObject client.Object, eventType, reason string, additionalOffset ...int) { + EventuallyWithOffset(resolveOffset(additionalOffset), func(g Gomega) { + eventList := &corev1.EventList{} + g.Expect(k8sClient.List(ctx, eventList, client.InNamespace(involvedObject.GetNamespace()))).To(Succeed()) + found := false + for i := range eventList.Items { + e := &eventList.Items[i] + if e.InvolvedObject.UID == involvedObject.GetUID() && e.Type == eventType && e.Reason == reason { + found = true + break + } + } + g.Expect(found).To(BeTrue(), fmt.Sprintf("expected a %q event with reason %q on %s/%s", eventType, reason, involvedObject.GetNamespace(), involvedObject.GetName())) + }).Should(Succeed()) +} + +var _ = FDescribe("SolrCloud controller - Events", func() { + var ( + solrCloud *solrv1beta1.SolrCloud + ) + + BeforeEach(func() { + solrCloud = &solrv1beta1.SolrCloud{ + ObjectMeta: metav1.ObjectMeta{ + Name: "events", + Namespace: "default", + }, + Spec: solrv1beta1.SolrCloudSpec{ + ZookeeperRef: &solrv1beta1.ZookeeperRef{ + ConnectionInfo: &solrv1beta1.ZookeeperConnectionInfo{ + InternalConnectionString: "host:7271", + }, + }, + }, + } + }) + + AfterEach(func(ctx context.Context) { + cleanupTest(ctx, solrCloud) + }) + + FContext("with a scheduled restart configured", func() { + BeforeEach(func() { + solrCloud.Spec.UpdateStrategy = solrv1beta1.SolrUpdateStrategy{ + RestartSchedule: "@every 10h", + } + }) + + FIt("records a RestartScheduled event on the SolrCloud", func(ctx context.Context) { + By("creating the SolrCloud") + Expect(k8sClient.Create(ctx, solrCloud)).To(Succeed()) + + By("waiting for the SolrCloud to be fully defaulted") + expectSolrCloudWithChecks(ctx, solrCloud, func(g Gomega, found *solrv1beta1.SolrCloud) { + g.Expect(found.WithDefaults(logger)).To(BeFalse(), "The SolrCloud spec should not need to be defaulted eventually") + }) + + By("checking that a RestartScheduled event was recorded") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "RestartScheduled") + }) + }) + + FContext("with a provided ConfigMap that is missing the required keys", func() { + var providedConfigMap *corev1.ConfigMap + + BeforeEach(func() { + providedConfigMap = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-provided-config", + Namespace: solrCloud.Namespace, + }, + // Neither "solr.xml" nor "log4j2.xml" is present, which is invalid. + Data: map[string]string{"unrelated-key": "unrelated-value"}, + } + solrCloud.Spec.CustomSolrKubeOptions = solrv1beta1.CustomSolrKubeOptions{ + ConfigMapOptions: &solrv1beta1.ConfigMapOptions{ + ProvidedConfigMap: providedConfigMap.Name, + }, + } + }) + + FIt("records an InvalidConfigMap event on the SolrCloud", func(ctx context.Context) { + By("creating the invalid ConfigMap") + Expect(k8sClient.Create(ctx, providedConfigMap)).To(Succeed()) + + By("creating the SolrCloud") + Expect(k8sClient.Create(ctx, solrCloud)).To(Succeed()) + + By("waiting for the SolrCloud to be fully defaulted") + expectSolrCloudWithChecks(ctx, solrCloud, func(g Gomega, found *solrv1beta1.SolrCloud) { + g.Expect(found.WithDefaults(logger)).To(BeFalse(), "The SolrCloud spec should not need to be defaulted eventually") + }) + + By("checking that an InvalidConfigMap event was recorded") + expectEvent(ctx, solrCloud, corev1.EventTypeWarning, "InvalidConfigMap") + }) + }) + + FContext("with a provided ConfigMap whose solr.xml is missing the port placeholder", func() { + var providedConfigMap *corev1.ConfigMap + + BeforeEach(func() { + providedConfigMap = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-solrxml-config", + Namespace: solrCloud.Namespace, + }, + // solr.xml is present but does not contain the required port placeholder. + Data: map[string]string{"solr.xml": ""}, + } + solrCloud.Spec.CustomSolrKubeOptions = solrv1beta1.CustomSolrKubeOptions{ + ConfigMapOptions: &solrv1beta1.ConfigMapOptions{ + ProvidedConfigMap: providedConfigMap.Name, + }, + } + }) + + FIt("records an InvalidConfigMap event on the SolrCloud", func(ctx context.Context) { + By("creating the invalid ConfigMap") + Expect(k8sClient.Create(ctx, providedConfigMap)).To(Succeed()) + + By("creating the SolrCloud") + Expect(k8sClient.Create(ctx, solrCloud)).To(Succeed()) + + By("waiting for the SolrCloud to be fully defaulted") + expectSolrCloudWithChecks(ctx, solrCloud, func(g Gomega, found *solrv1beta1.SolrCloud) { + g.Expect(found.WithDefaults(logger)).To(BeFalse(), "The SolrCloud spec should not need to be defaulted eventually") + }) + + By("checking that an InvalidConfigMap event was recorded") + expectEvent(ctx, solrCloud, corev1.EventTypeWarning, "InvalidConfigMap") + }) + }) +}) + +var _ = FDescribe("SolrPrometheusExporter controller - Events", func() { + var ( + solrPrometheusExporter *solrv1beta1.SolrPrometheusExporter + ) + + BeforeEach(func() { + solrPrometheusExporter = &solrv1beta1.SolrPrometheusExporter{ + ObjectMeta: metav1.ObjectMeta{ + Name: "events", + Namespace: "default", + }, + Spec: solrv1beta1.SolrPrometheusExporterSpec{ + SolrReference: solrv1beta1.SolrReference{ + Cloud: &solrv1beta1.SolrCloudReference{ + ZookeeperConnectionInfo: &solrv1beta1.ZookeeperConnectionInfo{ + InternalConnectionString: "host:2181", + }, + }, + }, + RestartSchedule: "@every 10h", + }, + } + }) + + AfterEach(func(ctx context.Context) { + cleanupTest(ctx, solrPrometheusExporter) + }) + + FIt("records a RestartScheduled event on the SolrPrometheusExporter", func(ctx context.Context) { + By("creating the SolrPrometheusExporter") + Expect(k8sClient.Create(ctx, solrPrometheusExporter)).To(Succeed()) + + By("waiting for the SolrPrometheusExporter to be fully defaulted") + expectSolrPrometheusExporterWithChecks(ctx, solrPrometheusExporter, func(g Gomega, found *solrv1beta1.SolrPrometheusExporter) { + g.Expect(found.WithDefaults()).To(BeFalse(), "The SolrPrometheusExporter spec should not need to be defaulted eventually") + }) + + By("checking that a RestartScheduled event was recorded") + expectEvent(ctx, solrPrometheusExporter, corev1.EventTypeNormal, "RestartScheduled") + }) + + FContext("with a provided ConfigMap missing the required key", func() { + var providedConfigMap *corev1.ConfigMap + + BeforeEach(func() { + providedConfigMap = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "invalid-exporter-config", + Namespace: solrPrometheusExporter.Namespace, + }, + // The required exporter config key is absent, which is invalid. + Data: map[string]string{"unrelated-key": "unrelated-value"}, + } + solrPrometheusExporter.Spec.CustomKubeOptions = solrv1beta1.CustomExporterKubeOptions{ + ConfigMapOptions: &solrv1beta1.ConfigMapOptions{ + ProvidedConfigMap: providedConfigMap.Name, + }, + } + }) + + FIt("records an InvalidConfigMap event on the SolrPrometheusExporter", func(ctx context.Context) { + By("creating the invalid ConfigMap") + Expect(k8sClient.Create(ctx, providedConfigMap)).To(Succeed()) + + By("creating the SolrPrometheusExporter") + Expect(k8sClient.Create(ctx, solrPrometheusExporter)).To(Succeed()) + + By("checking that an InvalidConfigMap event was recorded") + expectEvent(ctx, solrPrometheusExporter, corev1.EventTypeWarning, "InvalidConfigMap") + }) + }) +}) diff --git a/controllers/solr_cluster_ops_util.go b/controllers/solr_cluster_ops_util.go index deecd216..20d509e9 100644 --- a/controllers/solr_cluster_ops_util.go +++ b/controllers/solr_cluster_ops_util.go @@ -169,19 +169,15 @@ func determinePvcExpansionClusterOpLockIfNecessary(ctx context.Context, r *SolrC if e != nil { err = e logger.Error(err, "Could not parse the existing minimum PVC size from the StatefulSet annotation", "annotation", util.StorageMinimumSizeAnnotation, "value", oldSizeStr) - if r.Recorder != nil { - r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionError", - "Could not parse the existing minimum data PVC size %q recorded on the StatefulSet: %v", oldSizeStr, e) - } + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionError", + "Could not parse the existing minimum data PVC size %q recorded on the StatefulSet: %v", oldSizeStr, e) return } // PVCs cannot be shrunk, so only proceed if the new size is strictly bigger than the recorded size. if newSize.Cmp(oldSize) <= 0 { logger.Info("Cannot shrink existing data PVCs; ignoring the decreased storage request", "currentSize", oldSize.String(), "requestedSize", newSize.String()) - if r.Recorder != nil { - r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionForbidden", - "Cannot shrink data PersistentVolumeClaims from %s to %s; PersistentVolumeClaims can only be expanded.", oldSize.String(), newSize.String()) - } + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionForbidden", + "Cannot shrink data PersistentVolumeClaims from %s to %s; PersistentVolumeClaims can only be expanded.", oldSize.String(), newSize.String()) return } // Pre-flight: make sure the storage class backing the data PVCs allows volume expansion. If it @@ -192,10 +188,8 @@ func determinePvcExpansionClusterOpLockIfNecessary(ctx context.Context, r *SolrC logger.Error(scErr, "Could not verify whether the storage class allows volume expansion; proceeding with the expansion attempt") } else if !allowed { logger.Info("Storage class does not allow volume expansion; ignoring the increased storage request", "storageClass", className, "currentSize", oldSize.String(), "requestedSize", newSize.String()) - if r.Recorder != nil { - r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionForbidden", - "Storage class %q does not allow volume expansion (allowVolumeExpansion); cannot expand data PersistentVolumeClaims from %s to %s.", className, oldSize.String(), newSize.String()) - } + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionForbidden", + "Storage class %q does not allow volume expansion (allowVolumeExpansion); cannot expand data PersistentVolumeClaims from %s to %s.", className, oldSize.String(), newSize.String()) return } clusterOp = &SolrClusterOp{ @@ -211,6 +205,8 @@ func handlePvcExpansion(ctx context.Context, r *SolrCloudReconciler, instance *s newSize, err = resource.ParseQuantity(clusterOp.Metadata) if err != nil { logger.Error(err, "Could not convert PvcExpansion metadata to a resource.Quantity, as it represents the new size of PVCs", "metadata", clusterOp.Metadata) + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionError", + "Could not parse the target PVC size %q from the cluster operation metadata: %v", clusterOp.Metadata, err) return } var resizeInfeasible bool @@ -235,11 +231,9 @@ func handlePvcExpansion(ctx context.Context, r *SolrCloudReconciler, instance *s // The storage backend has declared the requested size infeasible. There is nothing the // operator can do until the user lowers the requested size, so surface it as an event and // back off significantly instead of retrying tightly. - if r.Recorder != nil { - r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionInfeasible", - "The storage backend reported that expanding the data PersistentVolumeClaims to %s is infeasible (e.g. it exceeds backend or quota limits). Reduce the requested storage size to a feasible value to recover.", - newSize.String()) - } + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PVCExpansionInfeasible", + "The storage backend reported that expanding the data PersistentVolumeClaims to %s is infeasible (e.g. it exceeds backend or quota limits). Reduce the requested storage size to a feasible value to recover.", + newSize.String()) retryLaterDuration = time.Minute } else { retryLaterDuration = time.Second * 5 @@ -292,6 +286,8 @@ func determineScaleClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudRec Metadata: strconv.Itoa(desiredPods), } } else { + r.Recorder.Eventf(instance, corev1.EventTypeNormal, "ScalingUnmanaged", + "Scaling SolrCloud from %d to %d pods without managed replica migration", configuredPods, desiredPods) err = scaleCloudUnmanaged(ctx, r, statefulSet, desiredPods, logger) } } else if scaleDownOpIsQueued { @@ -312,8 +308,9 @@ func handleManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, in var scaleDownTo int if scaleDownTo, err = strconv.Atoi(clusterOp.Metadata); err != nil { logger.Error(err, "Could not convert ScaleDown metadata to int, as it represents the number of nodes to scale to", "metadata", clusterOp.Metadata) + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "ClusterOperationError", + "Could not parse the scale-down target %q from the cluster operation metadata: %v", clusterOp.Metadata, err) return - // TODO: Create event for the CRD. } if len(podList) <= scaleDownTo { @@ -365,7 +362,7 @@ func handleManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, in // cleanupManagedCloudScaleDown does the logic of cleaning-up an incomplete scale down operation. // This will remove any bad readinessConditions that the scaleDown might have set when trying to scaleDown pods. -func cleanupManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, podList []corev1.Pod, logger logr.Logger) (err error) { +func cleanupManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, podList []corev1.Pod, logger logr.Logger) (err error) { // First though, the scaleDown op might have set some pods to be "unready" before deletion. Undo that. // Before doing anything to the pod, make sure that the pods do not have a stopped readiness condition readinessConditions := map[corev1.PodConditionType]podReadinessConditionChange{ @@ -376,7 +373,7 @@ func cleanupManagedCloudScaleDown(ctx context.Context, r *SolrCloudReconciler, p }, } for _, pod := range podList { - if updatedPod, e := EnsurePodReadinessConditions(ctx, r, &pod, readinessConditions, logger); e != nil { + if updatedPod, e := EnsurePodReadinessConditions(ctx, r, instance, &pod, readinessConditions, logger); e != nil { err = e return } else { @@ -393,6 +390,8 @@ func handleManagedCloudScaleUp(ctx context.Context, r *SolrCloudReconciler, inst desiredPods, err = strconv.Atoi(clusterOp.Metadata) if err != nil { logger.Error(err, "Could not convert ScaleUp metadata to int, as it represents the number of nodes to scale to", "metadata", clusterOp.Metadata) + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "ClusterOperationError", + "Could not parse the scale-up target %q from the cluster operation metadata: %v", clusterOp.Metadata, err) return } configuredPods := int(*statefulSet.Spec.Replicas) @@ -487,6 +486,8 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler // a restart to get a working pod config. state, retryLater, apiError := util.GetNodeReplicaState(ctx, instance, statefulSet, hasReadyPod, logger) if apiError != nil { + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "ClusterStateError", + "Could not fetch the Solr cluster state needed to safely perform a rolling update: %v", apiError) return false, true, 0, nil, apiError } else if !retryLater { // If the cluster status has been successfully fetched, then add the pods scheduled for deletion @@ -526,7 +527,7 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler // cleanupManagedCloudRollingUpdate does the logic of cleaning-up an incomplete rolling update operation. // This will remove any bad readinessConditions that the rollingUpdate might have set when trying to restart pods. -func cleanupManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler, podList []corev1.Pod, logger logr.Logger) (err error) { +func cleanupManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, podList []corev1.Pod, logger logr.Logger) (err error) { // First though, the scaleDown op might have set some pods to be "unready" before deletion. Undo that. // Before doing anything to the pod, make sure that the pods do not have a stopped readiness condition er := EvictingReplicas @@ -546,7 +547,7 @@ func cleanupManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconcile }, } for _, pod := range podList { - if updatedPod, e := EnsurePodReadinessConditions(ctx, r, &pod, readinessConditions, logger); e != nil { + if updatedPod, e := EnsurePodReadinessConditions(ctx, r, instance, &pod, readinessConditions, logger); e != nil { err = e return } else { @@ -635,7 +636,7 @@ func evictAllPods(ctx context.Context, r *SolrCloudReconciler, instance *solrv1b } for i, pod := range podList { - if updatedPod, e := EnsurePodReadinessConditions(ctx, r, &pod, readinessConditions, logger); e != nil { + if updatedPod, e := EnsurePodReadinessConditions(ctx, r, instance, &pod, readinessConditions, logger); e != nil { err = e return } else { @@ -676,7 +677,7 @@ func evictSinglePod(ctx context.Context, r *SolrCloudReconciler, instance *solrv return !podHasReplicas, false, errors.New("Could not find pod " + podName + " when trying to migrate replicas to scale down pod.") } - if updatedPod, e := EnsurePodReadinessConditions(ctx, r, pod, readinessConditions, logger); e != nil { + if updatedPod, e := EnsurePodReadinessConditions(ctx, r, instance, pod, readinessConditions, logger); e != nil { err = e return } else { @@ -685,7 +686,7 @@ func evictSinglePod(ctx context.Context, r *SolrCloudReconciler, instance *solrv // Only evict from the pod if it contains replicas in the clusterState var canDeletePod bool - if err, canDeletePod, requestInProgress = util.EvictReplicasForPodIfNecessary(ctx, instance, pod, podHasReplicas, "scaleDown", logger); err != nil { + if err, canDeletePod, requestInProgress = util.EvictReplicasForPodIfNecessary(ctx, instance, pod, podHasReplicas, "scaleDown", r.Recorder, logger); err != nil { logger.Error(err, "Error while evicting replicas on Pod, when scaling down SolrCloud", "pod", pod.Name) } else if canDeletePod { // The pod previously had replicas, so loop back in the next reconcile to make sure that the pod doesn't diff --git a/controllers/solr_pod_lifecycle_util.go b/controllers/solr_pod_lifecycle_util.go index 84116ad3..c794f07b 100644 --- a/controllers/solr_pod_lifecycle_util.go +++ b/controllers/solr_pod_lifecycle_util.go @@ -64,7 +64,7 @@ func DeletePodForUpdate(ctx context.Context, r *SolrCloudReconciler, instance *s status: false, }, } - if updatedPod, e := EnsurePodReadinessConditions(ctx, r, pod, podStoppedReadinessConditions, logger); e != nil { + if updatedPod, e := EnsurePodReadinessConditions(ctx, r, instance, pod, podStoppedReadinessConditions, logger); e != nil { err = e return } else { @@ -75,7 +75,7 @@ func DeletePodForUpdate(ctx context.Context, r *SolrCloudReconciler, instance *s deletePod := false if PodConditionEquals(pod, util.SolrReplicasNotEvictedReadinessCondition, EvictingReplicas) { // Only evict pods that contain replicas in the clusterState - if evictError, canDeletePod, inProgTmp := util.EvictReplicasForPodIfNecessary(ctx, instance, pod, podHasReplicas, "podUpdate", logger); evictError != nil { + if evictError, canDeletePod, inProgTmp := util.EvictReplicasForPodIfNecessary(ctx, instance, pod, podHasReplicas, "podUpdate", r.Recorder, logger); evictError != nil { requestInProgress = true err = evictError logger.Error(err, "Error while evicting replicas on pod", "pod", pod.Name) @@ -107,15 +107,18 @@ func DeletePodForUpdate(ctx context.Context, r *SolrCloudReconciler, instance *s }) if err != nil { logger.Error(err, "Error while killing solr pod for update", "pod", pod.Name) + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PodUpdateError", + "Error while deleting pod %s for an update: %v", pod.Name, err) + } else { + r.Recorder.Eventf(instance, corev1.EventTypeNormal, "PodUpdate", + "Deleting pod %s so that it can be recreated with the updated SolrCloud specification", pod.Name) } - - // TODO: Create event for the CRD. } return } -func EnsurePodReadinessConditions(ctx context.Context, r *SolrCloudReconciler, pod *corev1.Pod, ensureConditions map[corev1.PodConditionType]podReadinessConditionChange, logger logr.Logger) (updatedPod *corev1.Pod, err error) { +func EnsurePodReadinessConditions(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, pod *corev1.Pod, ensureConditions map[corev1.PodConditionType]podReadinessConditionChange, logger logr.Logger) (updatedPod *corev1.Pod, err error) { updatedPod = pod.DeepCopy() needsUpdate := false @@ -137,7 +140,8 @@ func EnsurePodReadinessConditions(ctx context.Context, r *SolrCloudReconciler, p logger.Error(err, "Could not patch readiness condition(s) for pod to stop traffic", "pod", pod.Name) updatedPod = pod - // TODO: Create event for the CRD. + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PodReadinessConditionUpdateFailed", + "Could not patch readiness condition(s) on pod %s to stop traffic: %v", pod.Name, err) } } else { updatedPod = pod diff --git a/controllers/solrbackup_controller.go b/controllers/solrbackup_controller.go index 1e8c1880..7d9c2444 100644 --- a/controllers/solrbackup_controller.go +++ b/controllers/solrbackup_controller.go @@ -29,11 +29,13 @@ import ( "github.com/apache/solr-operator/controllers/util" "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -45,8 +47,9 @@ import ( // SolrBackupReconciler reconciles a SolrBackup object type SolrBackupReconciler struct { client.Client - Scheme *runtime.Scheme - Config *rest.Config + Scheme *runtime.Scheme + Config *rest.Config + Recorder record.EventRecorder } //+kubebuilder:rbac:groups="",resources=pods/exec,verbs=create @@ -129,6 +132,8 @@ func (r *SolrBackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) if err1 != nil { // TODO Should we be failing the backup for some sub-set of errors here? logger.Error(err1, "Error while taking SolrCloud backup") + r.Recorder.Eventf(backup, corev1.EventTypeWarning, "BackupError", + "Error while taking backup of SolrCloud %q: %v", backup.Spec.SolrCloud, err1) // Requeue after 10 seconds for errors. updateRequeueAfter(&requeueOrNot, time.Second*10) @@ -136,6 +141,14 @@ func (r *SolrBackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) // Set finish time now := metav1.Now() backup.Status.IndividualSolrBackupStatus.FinishTime = &now + + if successful := backup.Status.IndividualSolrBackupStatus.Successful; successful != nil && *successful { + r.Recorder.Eventf(backup, corev1.EventTypeNormal, "BackupSucceeded", + "The backup of SolrCloud %q completed successfully", backup.Spec.SolrCloud) + } else { + r.Recorder.Eventf(backup, corev1.EventTypeWarning, "BackupFailed", + "The backup of SolrCloud %q finished but was not successful", backup.Spec.SolrCloud) + } } else if solrCloud != nil { // When working with the collection backups, auto-requeue after 5 seconds // to check on the status of the async solr backup calls @@ -147,12 +160,16 @@ func (r *SolrBackupReconciler) Reconcile(ctx context.Context, req ctrl.Request) if backup.Status.IndividualSolrBackupStatus.Finished && backup.Spec.Recurrence.IsEnabled() { if nextBackupTime, err1 := util.ScheduleNextBackup(backup.Spec.Recurrence.Schedule, backup.Status.IndividualSolrBackupStatus.StartTime.Time); err1 != nil { logger.Error(err1, "Could not update backup scheduling due to bad cron schedule", "cron", backup.Spec.Recurrence.Schedule) + r.Recorder.Eventf(backup, corev1.EventTypeWarning, "BackupScheduleInvalid", + "Could not schedule the next recurring backup due to an invalid cron schedule %q: %v", backup.Spec.Recurrence.Schedule, err1) } else { convTime := metav1.NewTime(nextBackupTime) if backup.Status.NextScheduledTime == nil || convTime != *backup.Status.NextScheduledTime { // Only log out the message if there is a change in NextScheduled logger.Info("(Re)scheduling Next Backup", "time", nextBackupTime) backup.Status.NextScheduledTime = &convTime + r.Recorder.Eventf(backup, corev1.EventTypeNormal, "BackupRescheduled", + "Scheduling the next recurring backup of SolrCloud %q for %s", backup.Spec.SolrCloud, nextBackupTime.UTC().Format(time.RFC3339)) updateRequeueAfter(&requeueOrNot, backup.Status.NextScheduledTime.Sub(time.Now())) } } @@ -214,12 +231,16 @@ func (r *SolrBackupReconciler) reconcileSolrCloudBackup(ctx context.Context, bac // Make sure that all solr living Solr pods have the backupRepo configured if !solrCloud.Status.BackupRepositoriesAvailable[backupRepository.Name] { logger.Info("Cloud not ready for backup", "solrCloud", solrCloud.Name, "repository", backupRepository.Name) + r.Recorder.Eventf(backup, corev1.EventTypeWarning, "BackupCloudNotReady", + "SolrCloud %q is not yet ready for backups in the %q repository", solrCloud.Name, backupRepository.Name) return solrCloud, actionTaken, errors.NewServiceUnavailable(fmt.Sprintf("Cloud is not ready for backups in the %s repository", backupRepository.Name)) } // Only set the solr version at the start of the backup. This shouldn't change throughout the backup. currentBackupStatus.SolrVersion = solrCloud.Status.Version currentBackupStatus.StartTime = metav1.Now() + r.Recorder.Eventf(backup, corev1.EventTypeNormal, "BackupStarted", + "Started backup of SolrCloud %q to the %q repository", solrCloud.Name, backupRepository.Name) } collectionsToBackup := backup.Spec.Collections @@ -229,6 +250,8 @@ func (r *SolrBackupReconciler) reconcileSolrCloudBackup(ctx context.Context, bac collectionsToBackup, err = util.ListAllSolrCollections(ctx, solrCloud, logger) if err != nil { logger.Error(err, "Error listing collections", "solrCloud", solrCloud.Name) + r.Recorder.Eventf(backup, corev1.EventTypeWarning, "BackupCollectionListError", + "Error listing the collections to back up for SolrCloud %q: %v", solrCloud.Name, err) } } @@ -311,7 +334,9 @@ func reconcileSolrCollectionBackup(ctx context.Context, backup *solrv1beta1.Solr // SetupWithManager sets up the controller with the Manager. func (r *SolrBackupReconciler) SetupWithManager(mgr ctrl.Manager) (err error) { r.Config = mgr.GetConfig() - + if r.Recorder == nil { + r.Recorder = noOpEventRecorder{} + } ctrlBuilder := ctrl.NewControllerManagedBy(mgr). For(&solrv1beta1.SolrBackup{}) diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go index e94ef7bb..5a09b629 100644 --- a/controllers/solrcloud_controller.go +++ b/controllers/solrcloud_controller.go @@ -69,6 +69,7 @@ func UseZkCRD(useCRD bool) { //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;delete //+kubebuilder:rbac:groups="",resources=pods/status,verbs=get;patch +//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups="",resources=services/status,verbs=get //+kubebuilder:rbac:groups=apps,resources=statefulsets,verbs=get;list;watch;create;update;patch;delete @@ -172,7 +173,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if instance.Spec.SolrAddressability.External.UseExternalAddress { if ip == "" { // If we are using this IP in the hostAliases of the statefulSet, it needs to be set for every service before trying to update the statefulSet - // TODO: Make an event here + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "NodeServiceAddressNotReady", + "Waiting for the Kubernetes-assigned address (clusterIP) of node service %q before the StatefulSet can be reconciled, since the SolrCloud advertises its external address via host aliases", nodeName) blockReconciliationOfStatefulSet = true } else { hostNameIpMap[instance.AdvertisedNodeHost(nodeName)] = ip @@ -230,7 +232,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // if there's a user-provided config, it must have one of the expected keys if !hasLogXml && !hasSolrXml { - // TODO: Create event for the CRD. + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "InvalidConfigMap", + "User provided ConfigMap %s must have one of 'solr.xml' and/or 'log4j2.xml'", providedConfigMapName) return requeueOrNot, fmt.Errorf("user provided ConfigMap %s must have one of 'solr.xml' and/or 'log4j2.xml'", providedConfigMapName) } @@ -238,6 +241,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if hasSolrXml { // make sure the user-provided solr.xml is valid if !(strings.Contains(solrXml, "${solr.port.advertise:") || strings.Contains(solrXml, "${hostPort:")) { + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "InvalidConfigMap", + "Custom solr.xml in ConfigMap %s must contain a placeholder for either 'solr.port.advertise' or its deprecated alternative 'hostPort'", providedConfigMapName) return requeueOrNot, fmt.Errorf("custom solr.xml in ConfigMap %s must contain a placeholder for either 'solr.port.advertise', or its deprecated alternative 'hostPort', e.g. ${solr.port.advertise:80}", providedConfigMapName) @@ -256,6 +261,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } } else { + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "InvalidConfigMap", + "Provided ConfigMap %s has no data", providedConfigMapName) return requeueOrNot, fmt.Errorf("provided ConfigMap %s has no data", providedConfigMapName) } } @@ -349,7 +356,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if nextRestartAnnotation != "" { // Set the new restart time annotation expectedStatefulSet.Spec.Template.Annotations[util.SolrScheduledRestartAnnotation] = nextRestartAnnotation - // TODO: Create event for the CRD. + r.Recorder.Eventf(instance, corev1.EventTypeNormal, "RestartScheduled", + "Scheduling next restart of the Solr StatefulSet for %s, as configured by the updateStrategy.restartSchedule", nextRestartAnnotation) } else if existingRestartAnnotation, exists := foundStatefulSet.Spec.Template.Annotations[util.SolrScheduledRestartAnnotation]; exists { // Keep the existing nextRestart annotation if it exists and we aren't setting a new one. expectedStatefulSet.Spec.Template.Annotations[util.SolrScheduledRestartAnnotation] = existingRestartAnnotation @@ -508,6 +516,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( operationFound = false // This shouldn't happen, but we don't want to be stuck if it does. // Just remove the cluster Op, because the solr operator version running does not support it. + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "ClusterOperationUnsupported", + "Removing cluster operation %q because it is not supported by this version of the Solr Operator", string(clusterOp.Operation)) err = clearClusterOpLockWithPatch(ctx, r, statefulSet, "clusterOp not supported", logger) } if operationFound { @@ -521,7 +531,10 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( err = setNextClusterOpLockWithPatch(ctx, r, statefulSet, nextClusterOperation, string(clusterOp.Operation)+" complete", logger) } - // TODO: Create event for the CRD. + if err == nil { + r.Recorder.Eventf(instance, corev1.EventTypeNormal, "ClusterOperationComplete", + "Completed cluster operation %q on the SolrCloud", string(clusterOp.Operation)) + } } else if !requestInProgress { // If the cluster operation is in a stoppable place (not currently doing an async operation), and either: // - the operation hit an error and has taken more than 1 minute @@ -543,15 +556,16 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( // If the operation is being queued, first have the operation cleanup after itself switch clusterOp.Operation { case UpdateLock: - err = cleanupManagedCloudRollingUpdate(ctx, r, outOfDatePods.ScheduledForDeletion, logger) + err = cleanupManagedCloudRollingUpdate(ctx, r, instance, outOfDatePods.ScheduledForDeletion, logger) case ScaleDownLock: - err = cleanupManagedCloudScaleDown(ctx, r, podList, logger) + err = cleanupManagedCloudScaleDown(ctx, r, instance, podList, logger) } if err == nil { err = enqueueCurrentClusterOpForRetryWithPatch(ctx, r, statefulSet, string(clusterOp.Operation)+" "+queueForLaterReason, logger) } - // TODO: Create event for the CRD. + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "ClusterOperationRetry", + "Pausing cluster operation %q because it %s; it has been queued to be retried later", string(clusterOp.Operation), queueForLaterReason) } } } @@ -612,6 +626,8 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( logger.Error(err, "Error while patching StatefulSet to start locked clusterOp", clusterOp.Operation, "clusterOpMetadata", clusterOp.Metadata) } else { logger.Info("Started locked clusterOp", "clusterOp", clusterOp.Operation, "clusterOpMetadata", clusterOp.Metadata) + r.Recorder.Eventf(instance, corev1.EventTypeNormal, "ClusterOperationStarted", + "Starting cluster operation %q on the SolrCloud", string(clusterOp.Operation)) } } else { // No new clusterOperation has been started, retry the next queued clusterOp, if there are any operations in the retry queue. @@ -787,7 +803,7 @@ func (r *SolrCloudReconciler) initializePods(ctx context.Context, solrCloud *sol if !isOwnedByCurrentStatefulSet { continue } - if updatedPod, podError := r.initializePod(ctx, &pod, logger); podError != nil { + if updatedPod, podError := r.initializePod(ctx, solrCloud, &pod, logger); podError != nil { err = podError } else if updatedPod != nil { podList = append(podList, *updatedPod) @@ -797,7 +813,7 @@ func (r *SolrCloudReconciler) initializePods(ctx context.Context, solrCloud *sol } // InitializePod Initialize Status Conditions for a SolrCloud Pod -func (r *SolrCloudReconciler) initializePod(ctx context.Context, pod *corev1.Pod, logger logr.Logger) (updatedPod *corev1.Pod, err error) { +func (r *SolrCloudReconciler) initializePod(ctx context.Context, instance *solrv1beta1.SolrCloud, pod *corev1.Pod, logger logr.Logger) (updatedPod *corev1.Pod, err error) { shouldPatchPod := false updatedPod = pod.DeepCopy() @@ -815,7 +831,8 @@ func (r *SolrCloudReconciler) initializePod(ctx context.Context, pod *corev1.Pod // set the pod back to its original state since the patch failed updatedPod = pod - // TODO: Create event for the CRD. + r.Recorder.Eventf(instance, corev1.EventTypeWarning, "PodReadinessConditionUpdateFailed", + "Could not patch readiness condition(s) on pod %s to start traffic: %v", pod.Name, err) } } return @@ -1342,6 +1359,9 @@ func (r *SolrCloudReconciler) reconcileTLSConfig(instance *solrv1beta1.SolrCloud // SetupWithManager sets up the controller with the Manager. func (r *SolrCloudReconciler) SetupWithManager(mgr ctrl.Manager) error { + if r.Recorder == nil { + r.Recorder = noOpEventRecorder{} + } ctrlBuilder := ctrl.NewControllerManagedBy(mgr). For(&solrv1beta1.SolrCloud{}). Owns(&corev1.ConfigMap{}). diff --git a/controllers/solrprometheusexporter_controller.go b/controllers/solrprometheusexporter_controller.go index e9c886c2..1251208c 100644 --- a/controllers/solrprometheusexporter_controller.go +++ b/controllers/solrprometheusexporter_controller.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" @@ -43,7 +44,8 @@ import ( // SolrPrometheusExporterReconciler reconciles a SolrPrometheusExporter object type SolrPrometheusExporterReconciler struct { client.Client - Scheme *runtime.Scheme + Scheme *runtime.Scheme + Recorder record.EventRecorder } //+kubebuilder:rbac:groups=,resources=configmaps,verbs=get;list;watch;create;update;patch;delete @@ -104,10 +106,14 @@ func (r *SolrPrometheusExporterReconciler) Reconcile(ctx context.Context, req ct if ok { configXmlMd5 = fmt.Sprintf("%x", md5.Sum([]byte(configXml))) } else { + r.Recorder.Eventf(prometheusExporter, corev1.EventTypeWarning, "InvalidConfigMap", + "Provided ConfigMap %s must contain the required %q key", prometheusExporter.Spec.CustomKubeOptions.ConfigMapOptions.ProvidedConfigMap, configMapKey) return requeueOrNot, fmt.Errorf("required '%s' key not found in provided ConfigMap %s", configMapKey, prometheusExporter.Spec.CustomKubeOptions.ConfigMapOptions.ProvidedConfigMap) } } else { + r.Recorder.Eventf(prometheusExporter, corev1.EventTypeWarning, "InvalidConfigMap", + "Provided ConfigMap %s has no data", prometheusExporter.Spec.CustomKubeOptions.ConfigMapOptions.ProvidedConfigMap) return requeueOrNot, fmt.Errorf("provided ConfigMap %s has no data", prometheusExporter.Spec.CustomKubeOptions.ConfigMapOptions.ProvidedConfigMap) } @@ -224,7 +230,8 @@ func (r *SolrPrometheusExporterReconciler) Reconcile(ctx context.Context, req ct } // Set the new restart time annotation deploy.Spec.Template.Annotations[util.SolrScheduledRestartAnnotation] = nextRestartAnnotation - // TODO: Create event for the CRD. + r.Recorder.Eventf(prometheusExporter, corev1.EventTypeNormal, "RestartScheduled", + "Scheduling next restart of the Prometheus Exporter Deployment for %s, as configured by the restartSchedule", nextRestartAnnotation) } else if existingRestartAnnotation, exists := foundDeploy.Spec.Template.Annotations[util.SolrScheduledRestartAnnotation]; exists { if deploy.Spec.Template.Annotations == nil { deploy.Spec.Template.Annotations = make(map[string]string, 1) @@ -341,6 +348,9 @@ func (r *SolrPrometheusExporterReconciler) reconcileTLSConfig(prometheusExporter // SetupWithManager sets up the controller with the Manager. func (r *SolrPrometheusExporterReconciler) SetupWithManager(mgr ctrl.Manager) error { + if r.Recorder == nil { + r.Recorder = noOpEventRecorder{} + } ctrlBuilder := ctrl.NewControllerManagedBy(mgr). For(&solrv1beta1.SolrPrometheusExporter{}). Owns(&corev1.ConfigMap{}). diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 7b89ee81..81209a9f 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -108,17 +108,19 @@ var _ = BeforeSuite(func(ctx context.Context) { Expect((&SolrCloudReconciler{ Client: k8sManager.GetClient(), Scheme: k8sManager.GetScheme(), - Recorder: k8sManager.GetEventRecorderFor("solrcloud-controller"), + Recorder: k8sManager.GetEventRecorderFor("solr-operator"), }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&SolrPrometheusExporterReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + Recorder: k8sManager.GetEventRecorderFor("solr-operator"), }).SetupWithManager(k8sManager)).To(Succeed()) Expect((&SolrBackupReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + Recorder: k8sManager.GetEventRecorderFor("solr-operator"), }).SetupWithManager(k8sManager)).To(Succeed()) go func() { diff --git a/controllers/util/solr_update_util.go b/controllers/util/solr_update_util.go index 31235441..647ef134 100644 --- a/controllers/util/solr_update_util.go +++ b/controllers/util/solr_update_util.go @@ -27,6 +27,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/tools/record" "net/url" "sort" "strings" @@ -548,7 +549,7 @@ func GetManagedSolrNodeNames(solrCloud *solr.SolrCloud, currentlyConfiguredPodCo // EvictReplicasForPodIfNecessary takes a solr Pod and migrates all replicas off of that Pod. // For updates this will only be called for pods using ephemeral data. // For scale-down operations, this can be called for pods using ephemeral or persistent data. -func EvictReplicasForPodIfNecessary(ctx context.Context, solrCloud *solr.SolrCloud, pod *corev1.Pod, podHasReplicas bool, evictionReason string, logger logr.Logger) (err error, canDeletePod bool, requestInProgress bool) { +func EvictReplicasForPodIfNecessary(ctx context.Context, solrCloud *solr.SolrCloud, pod *corev1.Pod, podHasReplicas bool, evictionReason string, recorder record.EventRecorder, logger logr.Logger) (err error, canDeletePod bool, requestInProgress bool) { logger = logger.WithValues("evictionReason", evictionReason) // If the Cloud has 1 or zero pods, and this is the "-0" pod, then delete the data since we can't move it anywhere else // Otherwise, move the replicas to other pods @@ -581,6 +582,8 @@ func EvictReplicasForPodIfNecessary(ctx context.Context, solrCloud *solr.SolrClo } if err == nil { logger.Info("Migrating all replicas off of pod before deletion.", "requestId", requestId, "pod", pod.Name) + recorder.Eventf(solrCloud, corev1.EventTypeNormal, "ReplicaMigrationStarted", + "Migrating all replicas off of pod %s before deletion (%s)", pod.Name, evictionReason) requestInProgress = true } else { logger.Error(err, "Could not migrate all replicas off of pod before deletion. Will try again.") @@ -595,8 +598,12 @@ func EvictReplicasForPodIfNecessary(ctx context.Context, solrCloud *solr.SolrClo if asyncState == "completed" { canDeletePod = true logger.Info("Migration of all replicas off of pod before deletion complete. Pod can now be deleted.", "pod", pod.Name) + recorder.Eventf(solrCloud, corev1.EventTypeNormal, "ReplicaMigrationComplete", + "Migration of all replicas off of pod %s is complete; the pod can now be deleted (%s)", pod.Name, evictionReason) } else if asyncState == "failed" { logger.Info("Migration of all replicas off of pod before deletion failed. Will try again.", "pod", pod.Name, "message", message) + recorder.Eventf(solrCloud, corev1.EventTypeWarning, "ReplicaMigrationFailed", + "Migration of all replicas off of pod %s failed and will be retried: %s", pod.Name, message) } else { requestInProgress = true } diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml index a214d509..90794fb5 100644 --- a/helm/solr-operator/Chart.yaml +++ b/helm/solr-operator/Chart.yaml @@ -62,6 +62,13 @@ annotations: url: https://github.com/apache/solr-operator/issues/709 - name: Github PR url: https://github.com/apache/solr-operator/pull/712 + - kind: added + description: The Solr Operator now creates events for Solr resources, giving users much more transparency on what is happening behind the scenes. + links: + - name: Github Issue + url: https://github.com/apache/solr-operator/issues/120 + - name: Github PR + url: https://github.com/apache/solr-operator/pull/836 - kind: changed description: A container PostStart Hook is no longer used to create the ZooKeeper ChRoot, instead the initContainer will manage this links: diff --git a/helm/solr-operator/templates/role.yaml b/helm/solr-operator/templates/role.yaml index 6a267a08..861ad0c7 100644 --- a/helm/solr-operator/templates/role.yaml +++ b/helm/solr-operator/templates/role.yaml @@ -43,6 +43,13 @@ rules: - services/status verbs: - get +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch - apiGroups: - "" resources: diff --git a/main.go b/main.go index d504995b..27003dfd 100644 --- a/main.go +++ b/main.go @@ -201,22 +201,24 @@ func main() { if err = (&controllers.SolrCloudReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("solrcloud-controller"), + Recorder: mgr.GetEventRecorderFor("solr-operator"), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "SolrCloud") os.Exit(1) } if err = (&controllers.SolrPrometheusExporterReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("solr-operator"), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "SolrPrometheusExporter") os.Exit(1) } if err = (&controllers.SolrBackupReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Config: mgr.GetConfig(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Config: mgr.GetConfig(), + Recorder: mgr.GetEventRecorderFor("solr-operator"), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "SolrBackup") os.Exit(1) diff --git a/tests/e2e/backups_test.go b/tests/e2e/backups_test.go index 7c3c7e13..0e41de39 100644 --- a/tests/e2e/backups_test.go +++ b/tests/e2e/backups_test.go @@ -119,6 +119,10 @@ var _ = FDescribe("E2E - Backups", Ordered, func() { Expect(foundSolrBackup.Status.History).To(HaveLen(solrBackup.Spec.Recurrence.MaxSaved), "The SolrBackup does not have the correct number of saved backups in its status") Expect(foundSolrBackup.Status.History[len(foundSolrBackup.Status.History)-1].Successful).To(PointTo(BeTrue()), "The latest backup was not successful") + By("checking that recurring backup events were recorded on the SolrBackup") + expectEvent(ctx, solrBackup, corev1.EventTypeNormal, "BackupStarted") + expectEvent(ctx, solrBackup, corev1.EventTypeNormal, "BackupRescheduled") + lastBackupId := 0 checkBackup(ctx, solrCloud, solrBackup, func(g Gomega, collection string, backupListResponse *solr_api.SolrBackupListResponse) { g.Expect(backupListResponse.Backups).To(HaveLen(3), "The wrong number of recurring backups have been saved") @@ -156,6 +160,10 @@ var _ = FDescribe("E2E - Backups", Ordered, func() { g.Expect(backup.Status.Successful).To(PointTo(BeTrue()), "Backup did not successfully complete") }) + By("checking that backup lifecycle events were recorded on the SolrBackup") + expectEvent(ctx, solrBackup, corev1.EventTypeNormal, "BackupStarted") + expectEvent(ctx, solrBackup, corev1.EventTypeNormal, "BackupSucceeded") + checkBackup(ctx, solrCloud, solrBackup, func(g Gomega, collection string, backupListResponse *solr_api.SolrBackupListResponse) { g.Expect(backupListResponse.Backups).To(HaveLen(1), "A non-recurring backupList should have a length of 1") }) diff --git a/tests/e2e/resource_utils_test.go b/tests/e2e/resource_utils_test.go index 31b88c63..bef49efa 100644 --- a/tests/e2e/resource_utils_test.go +++ b/tests/e2e/resource_utils_test.go @@ -53,6 +53,26 @@ func resourceKey(parentResource client.Object, name string) types.NamespacedName return types.NamespacedName{Name: name, Namespace: parentResource.GetNamespace()} } +// expectEvent waits until at least one Kubernetes Event of the given type and reason has been +// recorded against the given object. Events are matched on the involved object's UID so that +// events left over from previous specs (Events are not garbage-collected with the object) cannot +// cause a false positive. +func expectEvent(ctx context.Context, parentResource client.Object, eventType string, reason string, additionalOffset ...int) { + EventuallyWithOffset(resolveOffset(additionalOffset), func(g Gomega) { + eventList := &corev1.EventList{} + g.Expect(k8sClient.List(ctx, eventList, client.InNamespace(parentResource.GetNamespace()))).To(Succeed()) + matched := false + for i := range eventList.Items { + e := &eventList.Items[i] + if e.InvolvedObject.UID == parentResource.GetUID() && e.Type == eventType && e.Reason == reason { + matched = true + break + } + } + g.Expect(matched).To(BeTrue(), "Expected a %q event with reason %q to be recorded on %s/%s", eventType, reason, parentResource.GetNamespace(), parentResource.GetName()) + }).Should(Succeed()) +} + func deleteAndWait(ctx context.Context, object client.Object, additionalOffset ...int) { key := resourceKey(object, object.GetName()) kinds, _, err := k8sClient.Scheme().ObjectKinds(object) diff --git a/tests/e2e/solrcloud_rolling_upgrade_test.go b/tests/e2e/solrcloud_rolling_upgrade_test.go index c56951dc..55b6d3bc 100644 --- a/tests/e2e/solrcloud_rolling_upgrade_test.go +++ b/tests/e2e/solrcloud_rolling_upgrade_test.go @@ -24,6 +24,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/controller-runtime/pkg/client" "time" @@ -172,6 +173,10 @@ var _ = FDescribe("E2E - SolrCloud - Rolling Upgrades", func() { By("checking that the collections can be queried after the restart") queryCollection(ctx, solrCloud, solrCollection1, 0) queryCollection(ctx, solrCloud, solrCollection2, 0) + + By("checking that rolling update cluster-operation events were recorded on the SolrCloud") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ClusterOperationStarted") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ClusterOperationComplete") }) }) }) diff --git a/tests/e2e/solrcloud_scaling_test.go b/tests/e2e/solrcloud_scaling_test.go index 26bba2ac..78ac80ff 100644 --- a/tests/e2e/solrcloud_scaling_test.go +++ b/tests/e2e/solrcloud_scaling_test.go @@ -137,6 +137,12 @@ var _ = FDescribe("E2E - SolrCloud - Scale Down", func() { expectNoPod(ctx, solrCloud, solrCloud.GetSolrPodName(1)) queryCollection(ctx, solrCloud, solrCollection1, 0) queryCollection(ctx, solrCloud, solrCollection2, 0) + + By("checking that managed scale-down events were recorded on the SolrCloud") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ClusterOperationStarted") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ReplicaMigrationStarted") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ReplicaMigrationComplete") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ClusterOperationComplete") }) }) @@ -306,6 +312,9 @@ var _ = FDescribe("E2E - SolrCloud - Scale Down", func() { expectNoPod(ctx, solrCloud, solrCloud.GetSolrPodName(1)) queryCollectionWithNoReplicaAvailable(ctx, solrCloud, solrCollection1) + + By("checking that an unmanaged scaling event was recorded on the SolrCloud") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ScalingUnmanaged") }) }) }) @@ -388,6 +397,10 @@ var _ = FDescribe("E2E - SolrCloud - Scale Up", func() { queryCollection(ctx, solrCloud, solrCollection1, 0) queryCollection(ctx, solrCloud, solrCollection2, 0) + + By("checking that managed scale-up events were recorded on the SolrCloud") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ClusterOperationStarted") + expectEvent(ctx, solrCloud, corev1.EventTypeNormal, "ClusterOperationComplete") }) })