From 13d10694ee5c52f1da6cdd91baf922787c7cbe08 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Fri, 9 Jan 2026 08:59:08 +0100 Subject: [PATCH 01/20] feat: chaos test implementation Based on the old bash tests. --- Makefile | 43 ++ docs/proposals/e2e-chaos-test-design.md | 501 ++++++++++++++++++++++++ scripts/report-test.py | 163 ++++++++ test/chaos/chaos_suite_test.go | 413 +++++++++++++++++++ test/chaos/framework/clients.go | 144 +++++++ test/chaos/framework/cluster.go | 259 ++++++++++++ test/chaos/framework/crd.go | 118 ++++++ test/chaos/framework/k6.go | 261 ++++++++++++ test/chaos/framework/namespace.go | 101 +++++ test/chaos/framework/operator.go | 134 +++++++ test/chaos/framework/operator_setup.go | 224 +++++++++++ test/chaos/framework/readiness.go | 154 ++++++++ test/chaos/framework/redis_chaos.go | 274 +++++++++++++ test/chaos/helpers_test.go | 73 ++++ test/chaos/k6.Dockerfile | 18 + test/chaos/k6scripts/test-300k.js | 57 +++ test/chaos/suite_test.go | 114 ++++++ 17 files changed, 3051 insertions(+) create mode 100644 docs/proposals/e2e-chaos-test-design.md create mode 100755 scripts/report-test.py create mode 100644 test/chaos/chaos_suite_test.go create mode 100644 test/chaos/framework/clients.go create mode 100644 test/chaos/framework/cluster.go create mode 100644 test/chaos/framework/crd.go create mode 100644 test/chaos/framework/k6.go create mode 100644 test/chaos/framework/namespace.go create mode 100644 test/chaos/framework/operator.go create mode 100644 test/chaos/framework/operator_setup.go create mode 100644 test/chaos/framework/readiness.go create mode 100644 test/chaos/framework/redis_chaos.go create mode 100644 test/chaos/helpers_test.go create mode 100644 test/chaos/k6.Dockerfile create mode 100644 test/chaos/k6scripts/test-300k.js create mode 100644 test/chaos/suite_test.go diff --git a/Makefile b/Makefile index f0cff40..c4eb169 100644 --- a/Makefile +++ b/Makefile @@ -588,3 +588,46 @@ test-e2e-cov: process-manifests-crd ginkgo ## Execute 
e2e application test with $(GINKGO_ENV) ginkgo \ -cover -covermode=count -coverprofile=$(TEST_COVERAGE_PROFILE_OUTPUT_FILE) -output-dir=$(TEST_COVERAGE_PROFILE_OUTPUT) \ $(GINKGO_OPTS) $(GINKGO_PACKAGES) + +##@ Chaos Testing + +K6_IMG ?= localhost:5001/redkey-k6:dev +CHAOS_DURATION ?= 10m +CHAOS_SEED ?= +CHAOS_TIMEOUT ?= 30m +CHAOS_PACKAGES ?= ./test/chaos +CHAOS_TEST_OUTPUT = .local/chaos-test.json + +.PHONY: k6-build +k6-build: ## Build k6 image with xk6-redis extension + $(info $(M) building k6 docker image with redis extension) + docker build -t $(K6_IMG) -f test/chaos/k6.Dockerfile test/chaos + +.PHONY: k6-push +k6-push: k6-build ## Push k6 image to local registry + $(info $(M) pushing k6 image) + docker push $(K6_IMG) + +.PHONY: test-chaos +test-chaos: process-manifests-crd ginkgo k6-push ## Execute chaos tests + $(info $(M) running chaos tests...) + @mkdir -p $(dir $(CHAOS_TEST_OUTPUT)) + $(GINKGO_ENV) K6_IMG=$(K6_IMG) CHAOS_DURATION=$(CHAOS_DURATION) \ + $(if $(CHAOS_SEED),CHAOS_SEED=$(CHAOS_SEED),) \ + ginkgo \ + --timeout=$(CHAOS_TIMEOUT) \ + --json-report=$(CHAOS_TEST_OUTPUT) \ + $(GINKGO_OPTS) \ + --procs=1 \ + $(CHAOS_PACKAGES) + +.PHONY: test-chaos-focus +test-chaos-focus: process-manifests-crd ginkgo k6-push ## Run specific chaos test by name + $(info $(M) running focused chaos test: $(FOCUS)) + $(GINKGO_ENV) K6_IMG=$(K6_IMG) CHAOS_DURATION=$(CHAOS_DURATION) \ + $(if $(CHAOS_SEED),CHAOS_SEED=$(CHAOS_SEED),) \ + ginkgo \ + --timeout=$(CHAOS_TIMEOUT) \ + --focus="$(FOCUS)" \ + --procs=1 \ + $(CHAOS_PACKAGES) diff --git a/docs/proposals/e2e-chaos-test-design.md b/docs/proposals/e2e-chaos-test-design.md new file mode 100644 index 0000000..b865498 --- /dev/null +++ b/docs/proposals/e2e-chaos-test-design.md @@ -0,0 +1,501 @@ +# E2E Chaos Test Design + +## Overview + +This document describes the design for end-to-end chaos tests for the Redis Kubernetes Operator (Redkey Operator). 
The +chaos tests replace the legacy bash-based tests with modern, maintainable Go + Ginkgo tests +executable via `make test-chaos`. + +**Key Constraints:** +- All new code goes in `/test/chaos/` (do not modify `/test/e2e/`) +- Code may be copied from `/test/e2e/` to `/test/chaos/` (annotate if done) +- When modifying Redis configuration, scale down both the **operator** AND **robin** deployments + +--- + +## 1. Objectives & Non-Goals + +### Objectives + +Build **end-to-end chaos tests** that validate **Redis cluster resilience** and **operator self-healing** under +disruptive events **while the cluster is under write/read load**. + +The suite must: +- Run via `make test-chaos` +- Be Go + Ginkgo v2 (same repo conventions as `test/e2e`) +- Treat the BASH tests as the **behavioral specification** +- Be deterministic enough for CI (randomness with seed capture + bounded actions) +- Use **Behavior Driven Development (BDD)** style with explicit, high-level test descriptions + +### Non-Goals + +- Not replacing the existing functional E2E suite in `test/e2e/` +- Not adding a full "chaos framework" abstraction layer +- Not implementing sophisticated fault injection (e.g., tc/netem) until pod-delete & topology-corruption tests are stable + +--- + +### 2.1 Key Behaviors from `test-chaos.sh` + +The main chaos loop validates: +1. **Continuous chaos loop**: Scale up → delete N random pods → wait ready → scale down → repeat +2. **k6 load running throughout** the chaos duration +3. 
**Recovery verification** after each disruption + +### 2.2 Test Scenarios Covered + +| Script | Behavior Validated | Failure Scenario | +|------------------------------------------|-------------------------------------------------------|--------------------------------------------------------------| +| `test-chaos.sh` | Continuous scale up/down with pod deletion under load | Random pod deletion, random scaling (1-N replicas), k6 load | +| `test-slot-busy.sh` | Slot ownership conflict resolution | Delete slots from all nodes, reassign to different owners | +| `test-migrating-in-the-air.sh` | Mid-migration slot recovery | Set slot to migrating/importing state, stop operator, resume | +| `test-with-a-master-becoming-a-slave.sh` | Primary → replica role change recovery | Delete slots, force replication of a primary | +| `test-scaling-up-with-load.sh` | Scale up during continuous k6 load | Scale to N replicas mid-test | +| `test-scaling-up-and-delete.sh` | Scale up followed by pod deletion | Scale to N, delete pods, verify recovery | + +### 2.3 Operator/Robin Outage Pattern + +From legacy scripts, when modifying Redis cluster state directly: +1. **Scale operator to 0** (stop reconciliation) +2. **Scale robin to 0** (stop sidecar management) +3. Perform disruptive action via `redis-cli` +4. **Scale robin to 1** (restore sidecar) +5. **Scale operator to 1** (resume reconciliation) +6. Verify cluster heals + +--- + +## 3. 
k6 Load Testing Strategy + +### 3.1 Workload Characteristics (`test-300k.js`) + +- Uses `k6/x/redis` (requires **xk6-redis** extension) +- Connects to Redis **cluster** via list of nodes from `REDIS_HOSTS` +- Each iteration: + - `SET` a key with TTL=30s + - Either `DEL` (~10%) or `GET` (~90%) + - Random sleep up to 100ms +- Stressors: + - Large values (up to ~300KB) → stresses rebalance/migration edge cases + - Steady churn with TTL → stresses cluster changes during active keyspace changes + +### 3.2 k6 Execution Mode + +k6 runs as a **Kubernetes Job** inside the same namespace as the cluster under test. + +Benefits: +- No port-forwarding / node IP assumptions +- Stable DNS/service discovery +- Logs captured in pod logs and emitted on test failure + +--- + +## 4. Chaos Test Scenarios + +### 4.1 BDD-Style Test Structure + +Each test case follows **Behavior Driven Development** style with explicit, high-level descriptions: + +```go +Describe("Chaos Under Load", func() { + It("survives continuous scaling and pod deletion while handling traffic", func() { + // Given: A ready Redis cluster with k6 load running + // When: Chaos loop executes (scale, delete pods, wait recovery) + // Then: Cluster remains healthy, k6 completes successfully + }) +}) +``` + +### 4.2 Scenario Definitions + +#### Scenario 1: Continuous Chaos Under Load (maps: `test-chaos.sh`) + +```gherkin +Given a 5-primary Redis cluster with k6 load running +When the chaos loop executes for the configured duration: + - Scale to random(5, 15) primaries + - Delete random(1, currentPrimaries/2) pods + - Wait for Ready status + - Scale down to random(3, 5) primaries + - Wait for Ready status +Then the cluster status is OK + All 16384 slots are assigned + k6 completes without fatal errors +``` + +#### Scenario 2: Chaos with Operator Deletion + +```gherkin +Given a ready Redis cluster with k6 load running +When chaos actions include: + - Delete operator pod (deployment recreates it) + - Delete random redis pods + 
- Scale cluster up/down + - Wait for Ready after each action +Then the operator recovers and heals the cluster + k6 completes successfully +``` + +#### Scenario 3: Chaos with Robin Deletion + +```gherkin +Given a ready Redis cluster with k6 load running +When chaos actions include: + - Delete robin pods from random redis pods + - Delete random redis pods + - Wait for Ready after each action +Then robin pods are recreated + Cluster heals to Ready status +``` + +#### Scenario 4: Full Chaos (Operator + Robin + Redis) + +```gherkin +Given a ready Redis cluster with k6 load running +When chaos actions include random combinations of: + - Delete operator pod + - Delete robin pods + - Delete redis pods + - Scale cluster up/down + - Wait for Ready between major disruptions +Then all components recover + Cluster reaches Ready status + k6 completes with acceptable error rate +``` + +#### Scenario 5: Slot Ownership Conflict Recovery + +```gherkin +Given a 5-primary Ready cluster +When operator and robin are scaled to 0 + Slot ownership is corrupted (CLUSTER DELSLOTS + SETSLOT inconsistent) + Robin is scaled to 1 + Operator is scaled to 1 +Then the cluster heals within timeout + Slot has exactly one owner + Status is Ready +``` + +#### Scenario 6: Mid-Migration Slot Recovery + +```gherkin +Given a 3-primary Ready cluster +When operator and robin are scaled to 0 + A slot is put in migrating/importing state across nodes + Robin is scaled to 1 + Operator is scaled to 1 +Then the cluster heals + No slots remain in migrating/importing state + Status is Ready +``` + +#### Scenario 7: Primary to Replica Demotion Recovery + +```gherkin +Given a 3-primary Ready cluster +When operator and robin are scaled to 0 + A primary node's slots are deleted + The node is forced to replicate another primary + Robin is scaled to 1 + Operator is scaled to 1 +Then the cluster heals + Original topology is restored (3 primaries) + Status is Ready +``` + +--- + +## 5. 
Readiness & Assertion Strategy + +### 5.1 Enhanced Readiness Gate + +The chaos tests require a stronger `WaitForReady` than the current E2E helper. This directly matches the bash `wait_redis_ready` definition: + +```go +// WaitForChaosReady waits for the Redis cluster to be fully healthy. +// It checks: +// 1. CR .status.status == "Ready" +// 2. All redis pods pass `redis-cli --cluster check` +// 3. No nodes show `fail` or `->` markers in `cluster nodes` output +func WaitForChaosReady(ctx context.Context, client client.Client, namespace, clusterName string, timeout time.Duration) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + // 1. Check CR status + cluster := &redkeyv1.RedkeyCluster{} + if err := client.Get(ctx, types.NamespacedName{Name: clusterName, Namespace: namespace}, cluster); err != nil { + return false, nil + } + if cluster.Status.Status != "Ready" { + return false, nil + } + + // 2. List redis pods + pods := &corev1.PodList{} + if err := client.List(ctx, pods, + client.InNamespace(namespace), + client.MatchingLabels{"redis.redkeycluster.operator/component": "redis"}); err != nil { + return false, nil + } + + // 3. For each pod, verify cluster health + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + return false, nil + } + if !clusterCheckPasses(ctx, namespace, pod.Name) { + return false, nil + } + if clusterNodesHasFailure(ctx, namespace, pod.Name) { + return false, nil + } + } + return true, nil + }) +} +``` + +### 5.2 k6 Success Criteria + +**Minimum**: Job completes with exit code 0. + +**Recommended** (future enhancement): Parse k6 summary for error-rate thresholds. + +--- + +## 6. 
Usage Examples + +| Command | Purpose | +|--------------------------------------------------|----------------------------------| +| `make test-chaos` | Run all chaos tests (sequential) | +| `make test-chaos-focus FOCUS="Continuous Chaos"` | Run single scenario | +| `make test-chaos CHAOS_DURATION=5m` | Short chaos duration | +| `make test-chaos CHAOS_SEED=12345` | Reproducible random seed | +| `make k6-build` | Build k6 image only | + +--- + +## 7. k6 Image Build Strategy + +### 7.1 Dockerfile (`test/chaos/k6.Dockerfile`) + +```dockerfile +FROM golang:1.24-alpine AS builder + +RUN go install go.k6.io/xk6/cmd/xk6@latest +RUN xk6 build \ + --with github.com/grafana/xk6-redis \ + --output /k6 + +FROM alpine:3.21 +COPY --from=builder /k6 /usr/bin/k6 +COPY k6scripts/ /scripts/ +ENTRYPOINT ["/usr/bin/k6"] +``` + +### 7.2 k6 Job Template + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: k6-load-{{ .Name }} +spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: k6 + image: {{ .K6Image }} + args: + - run + - /scripts/test-300k.js + - --duration + - {{ .Duration }} + - --vus + - {{ .VUs }} + env: + - name: REDIS_HOSTS + value: {{ .RedisHosts }} +``` + +--- + +## 8. 
File Structure + +``` +test/chaos/ +├── suite_test.go # Ginkgo suite bootstrap (minimal) +├── chaos_suite_test.go # Main chaos test scenarios (BDD style) +├── helpers.go # Test-level helper functions +├── framework/ +│ ├── cluster.go # RedkeyCluster creation helpers +│ ├── k6.go # k6 Job creation and monitoring +│ ├── namespace.go # Namespace creation/deletion helpers +│ ├── operator.go # Operator scaling helpers (ScaleOperatorDown/Up) +│ ├── operator_setup.go # Operator deployment setup (RBAC, ConfigMap, Deployment) +│ ├── readiness.go # Enhanced WaitForChaosReady + remote command execution +│ └── redis_chaos.go # Pod deletion, robin scaling, slot manipulation +├── k6.Dockerfile # k6 image build with xk6-redis +└── k6scripts/ + └── test-300k.js # Copied from OLD_BASH_TESTS/k6scripts +``` + +**Note**: Framework helpers are adapted from `test/e2e/framework/` with modifications for chaos-specific needs. Source files are annotated in comments. + +--- + +## 9. Test Execution Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Ginkgo Test Lifecycle │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ BeforeSuite: │ +│ 1. Verify CRD installed │ +│ 2. Verify k6 image available │ +│ 3. Initialize random seed (from CHAOS_SEED or GinkgoSeed) │ +│ │ +│ BeforeEach (per test): │ +│ 1. Create isolated namespace │ +│ 2. Deploy operator in namespace │ +│ 3. Create RedkeyCluster (N primaries + robin) │ +│ 4. Wait for Ready status │ +│ │ +│ Test Body (BDD style): │ +│ Given: Cluster is ready │ +│ When: [Optional] Start k6 load Job │ +│ Chaos Action Loop: │ +│ 1. PerformChaosAction() - random selection │ +│ 2. WaitForChaosReady() │ +│ 3. Repeat until duration expires │ +│ Then: Assertions │ +│ - Cluster status OK │ +│ - All 16384 slots assigned │ +│ - No nodes in fail state │ +│ - k6 Job succeeded (if started) │ +│ │ +│ AfterEach: │ +│ 1. Collect logs on failure (operator, k6, redis pods) │ +│ 2. 
Delete k6 Job (if exists) │ +│ 3. Delete RedkeyCluster (remove finalizers) │ +│ 4. Delete namespace │ +│ │ +│ AfterSuite: │ +│ 1. Cleanup any orphaned resources │ +│ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 10. Helper Functions + +### 10.1 Framework Helpers + +```go +// Cluster Setup +func CreateAndWaitForReadyCluster(ctx, client, namespace, name string, primaries int) error +func EnsureOperatorRunning(ctx, client, namespace string) error + +// Chaos Actions +func DeleteRandomRedisPods(ctx, client, namespace, clusterName string, count int) error +func DeleteOperatorPod(ctx, client, namespace string) error +func DeleteRobinPods(ctx, client, namespace, clusterName string, count int) error +func ScaleCluster(ctx, client, namespace, clusterName string, primaries int) error +func ScaleOperatorDown(ctx, client, namespace string) error +func ScaleOperatorUp(ctx, client, namespace string) error +func ScaleRobinDown(ctx, client, namespace, clusterName string) error +func ScaleRobinUp(ctx, client, namespace, clusterName string) error + +// Slot Corruption (requires operator/robin down) +func CorruptSlotOwnership(ctx, namespace, clusterName string, slot int) error +func SetSlotMigrating(ctx, namespace, clusterName string, slot int) error +func ForcePrimaryToReplica(ctx, namespace, clusterName string, podName string) error + +// k6 Load +func StartK6LoadJob(ctx, client, namespace, clusterName string, duration time.Duration, vus int) error +func WaitForK6JobCompletion(ctx, client, namespace, jobName string) error +func GetK6JobLogs(ctx, client, namespace, jobName string) (string, error) + +// Assertions +func WaitForChaosReady(ctx, client, namespace, clusterName string, timeout time.Duration) error +func AssertAllSlotsAssigned(ctx, namespace, clusterName string) error +func AssertNoNodesInFailState(ctx, namespace, clusterName string) error +``` + +### 10.2 Test-Level Helpers + +```go +// startK6OrFail starts a k6 load job and 
fails the test if it errors. +func startK6OrFail(c client.Client, namespace, clusterName string, duration time.Duration, vus int) string + +// cleanupK6Job safely deletes a k6 job, ignoring errors. +func cleanupK6Job(c client.Client, namespace, jobName string) + +// chaosLoop runs a chaos function repeatedly until the duration expires. +func chaosLoop(duration time.Duration, chaosFn func(iteration int)) + +// verifyClusterHealthy runs all cluster health checks. +func verifyClusterHealthy(c client.Client, namespace, clusterName string) + +// verifyK6Completed waits for k6 job to complete successfully. +func verifyK6Completed(c client.Client, namespace, jobName string, timeout time.Duration) +``` + +--- + +## 11. Architectural Decisions + +### 11.1 Design Rationale + +| Decision | Rationale | +|----------------------------------------|-----------------------------------------------------------------------------------------| +| **Sequential execution** (`--procs=1`) | Chaos tests modify shared cluster state; parallel execution would cause race conditions | +| **k6 as Job (not sidecar)** | Jobs have clear success/failure semantics; easier to check exit code | +| **Operator in-namespace** | Isolates tests completely; each test has its own operator instance | +| **Remote command via exec** | Direct redis-cli execution for slot manipulation; no external dependencies | +| **No DescribeTable for chaos** | Each scenario has unique setup/teardown; tables obscure failure points | +| **Explicit phase assertions** | Trace status transitions to catch improper healing behavior | +| **Scale down operator AND robin** | For topology corruption tests, both must stop to prevent interference | + +### 11.2 Anti-Patterns Avoided + +| Anti-Pattern | Alternative Used | Benefit | +|------------------------------------|----------------------------------------------|------------------------------------------| +| DescribeTable with complex entries | Separate `It()` blocks per scenario | Clear 
test intent, easier debugging | +| Anonymous function validators | Named helper functions in framework | Reusable, testable in isolation | +| Magic timeouts | Configurable via environment/Makefile | Adaptable to different environments | +| Shelling out to kubectl | controller-runtime client + client-go exec | Type-safe, no path dependencies | +| Hard-coded sleeps | `Eventually` / polling helpers with timeouts | Reliable, explicit timeout behavior | +| Host-local k6 | k6 runs in cluster via Job | Consistent environment, no network hacks | +| Unbounded randomness | Actions seeded with `GinkgoRandomSeed()` | Reproducible failures | + +--- + +## 12. Environment Variables + +| Variable | Default | Purpose | +|------------------|--------------------------------|----------------------------------------| +| `K6_IMG` | `localhost:5001/redkey-k6:dev` | k6 container image | +| `CHAOS_TIMEOUT` | `30m` | Maximum Ginkgo test timeout | +| `CHAOS_DURATION` | `10m` | k6 load duration / chaos loop duration | +| `CHAOS_SEED` | (auto) | Random seed for reproducibility | +| `CHAOS_VUS` | `10` | k6 virtual users | +| `OPERATOR_IMAGE` | From main Makefile | Operator image for tests | +| `ROBIN_IMAGE` | From main Makefile | Robin image for tests | + +--- + + ## 13. 
Migration Map + +| Legacy Script | New Ginkgo Test | Notes | +|------------------------------------------|----------------------------------------------|-----------------------------------| +| ` test-chaos.sh` | `It("survives continuous chaos under load")` | Main chaos loop with k6 | +| `test-slot-busy.sh` | `It("heals slot ownership conflicts")` | Operator/Robin scale down pattern | +| `test-migrating-in-the-air.sh` | `It("recovers from mid-migration slots")` | Operator/Robin scale down pattern | +| `test-with-a-master-becoming-a-slave.sh` | `It("recovers from forced role change")` | Operator/Robin scale down pattern | +| `test-scaling-up-with-load.sh` | `It("scales up under k6 load")` | k6 Job integration | +| `test-scaling-up-and-delete.sh` | Covered by main chaos loop | Pod deletion after scaling | +| `test-size-change.sh` | Covered by existing E2E tests | No migration needed | +| `test-template-change.sh` | Covered by existing E2E tests | No migration needed | diff --git a/scripts/report-test.py b/scripts/report-test.py new file mode 100755 index 0000000..dc8f0c1 --- /dev/null +++ b/scripts/report-test.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Convert Ginkgo JSON test results to Markdown format.""" + +import json +import sys +import pathlib + +def escape_md(text: str) -> str: + """Escape special characters for markdown.""" + return text.replace("*", "\\*").replace("_", "\\_").replace("`", "\\`") + + +def format_runtime(nanoseconds: int) -> str: + """Convert nanoseconds to human-readable format.""" + seconds = nanoseconds / 1e9 + if seconds < 60: + return f"{seconds:.2f}s" + minutes = int(seconds // 60) + secs = seconds % 60 + return f"{minutes}m {secs:.2f}s" + + +def generate_md_report(data: dict) -> str: + """Generate markdown report from Ginkgo JSON data.""" + suite = data[0] + lines = [] + + # Header + lines.append("# " + suite["SuiteDescription"] + " Results") + lines.append("") + lines.append("**Date:** " + suite["StartTime"][:10]) + 
lines.append("") + + # Summary + specs = suite["SpecReports"] + passed = sum(1 for s in specs if s.get("State") == "passed" and s.get("LeafNodeType") == "It") + failed = sum(1 for s in specs if s.get("State") == "failed") + total = suite["PreRunStats"]["TotalSpecs"] + + status = "PASS" if suite["SuiteSucceeded"] else "FAIL" + lines.append(f"## {status}: {suite['SuiteDescription']} ({passed}/{total} passed)") + lines.append("") + + for spec in specs: + node_type = spec.get("LeafNodeType", "Unknown") + state = spec.get("State", "unknown") + text = spec.get("LeafNodeText", "") + hierarchy = spec.get("ContainerHierarchyTexts") or [] + runtime = spec.get("RunTime", 0) + location = spec.get("LeafNodeLocation", {}) + file_name = location.get("FileName", "") + line_number = location.get("LineNumber", "") + failure = spec.get("Failure") + captured_output = spec.get("CapturedGinkgoWriterOutput", "") + + # Skip BeforeSuite/AfterSuite + if node_type != "It": + continue + + # Build test name from hierarchy + text + full_name = " > ".join(hierarchy + [text]) if hierarchy else text + status = "PASS" if state in ("passed", "skipped") else "FAIL" + + if status == "PASS": # We do not want PASS tests + continue + + lines.append(f"### {status}: {escape_md(full_name)}") + lines.append("") + + # Add test location and runtime + if file_name: + short_file = file_name.split("redkeyoperator/")[-1] + lines.append(f"- **Test:** [{short_file}:{line_number}]({file_name}#L{line_number})") + lines.append(f"- **Runtime:** {format_runtime(runtime)}") + lines.append("") + + # Add failure details + if failure: + lines.append("#### Failure") + lines.append("") + fail_loc = failure.get("Location", {}) + if fail_loc: + fail_file = fail_loc.get("FileName", "") + fail_line = fail_loc.get("LineNumber", "") + short_fail = fail_file.split("redkeyoperator/")[-1] + lines.append(f"- **Location:** [{short_fail}:{fail_line}]({fail_file}#L{fail_line})") + lines.append("") + + lines.append("##### Message") + 
lines.append("") + lines.append("```") + lines.append(failure.get("Message", "Unknown error")) + lines.append("```") + + # Add stack trace if available + stack_trace = fail_loc.get("FullStackTrace", "") + if stack_trace: + lines.append("") + lines.append("##### Stack Trace") + lines.append("") + lines.append("```") + lines.append(stack_trace) + lines.append("```") + + # Add captured output if present + if captured_output and captured_output.strip(): + lines.append("") + lines.append("#### Captured Output") + lines.append("") + lines.append("```") + lines.append(captured_output.strip()) + lines.append("```") + + lines.append("") + + return "\n".join(lines) + + +def unique(path: str) -> str: + path = pathlib.Path(path) + + if not path.exists(): + return path + + stem = path.stem + suffix = path.suffix + parent = path.parent + + i = 1 + while True: + candidate = parent / f"{stem}_{i}{suffix}" + if not candidate.exists(): + return candidate + i += 1 + + + + +def main(): + if len(sys.argv) < 2: + input_file = ".local/results.json" + else: + input_file = sys.argv[1] + + output_file = sys.argv[2] if len(sys.argv) > 2 else input_file.replace(".json", ".md") + + output_file = unique(output_file) + + print(f"Reading: {input_file}") + with open(input_file, "r") as f: + data = json.load(f) + + md_content = generate_md_report(data) + + print(f"Writing: {output_file}") + with open(output_file, "w") as f: + f.write(md_content) + + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go new file mode 100644 index 0000000..b0e0c36 --- /dev/null +++ b/test/chaos/chaos_suite_test.go @@ -0,0 +1,413 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package chaos + +import ( + "fmt" + "math/rand" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/inditextech/redkeyoperator/test/chaos/framework" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + corev1 "k8s.io/api/core/v1" +) + +const ( + clusterName = "redis-cluster" + defaultPrimaries = 5 + + // Chaos timing constants + chaosIterationDelay = 5 * time.Second // Delay between chaos iterations + chaosRateLimitDelay = 10 * time.Second // Delay for rate limiting between heavy operations + chaosReserveTime = 1 * time.Minute // Time reserved at end of chaos for final checks + k6CompletionBuffer = 5 * time.Minute // Buffer time for k6 job completion + operatorReadyTimeout = 2 * time.Minute // Timeout for operator to become ready + operatorPollInterval = 5 * time.Second // Poll interval for operator readiness + scaleAckTimeout = 30 * time.Second // Timeout for StatefulSet to acknowledge scale + scalePollInterval = 2 * time.Second // Poll interval for scale acknowledgment + diagnosticsLogTail = int64(100) // Number of log lines to capture for diagnostics + + // Scaling bounds + minPrimaries = 3 + maxPrimaries = 10 + defaultVUs = 10 // Number of virtual users for k6 load tests +) + +var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { + var ( + namespace *corev1.Namespace + k6JobName string + rng *rand.Rand + ) + + BeforeEach(func() { + var err error + + rng = rand.New(rand.NewSource(chaosSeed)) + GinkgoWriter.Printf("Using random seed: %d\n", chaosSeed) + + namespace, err = framework.CreateNamespace(ctx, k8sClientset, fmt.Sprintf("chaos-%d", GinkgoParallelProcess())) + Expect(err).NotTo(HaveOccurred(), "failed to create namespace") + + By("deploying operator in namespace") + Expect(framework.EnsureOperatorSetup(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + Eventually(func() bool { + dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redis-operator", metav1.GetOptions{}) + return err == nil && dep.Status.AvailableReplicas >= 1 + }, operatorReadyTimeout, 
operatorPollInterval).Should(BeTrue()) + + By("creating Redis cluster with 5 primaries") + Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries)).To(Succeed()) + + By("waiting for cluster to be ready") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + }) + + AfterEach(func() { + if CurrentSpecReport().Failed() { + collectDiagnostics(namespace.Name) + } + if k6JobName != "" { + _ = framework.DeleteK6Job(ctx, k8sClientset, namespace.Name, k6JobName) + } + _ = framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace) + }) + + // ================================================================================== + // Scenario 1: Continuous Chaos Under Load + // ================================================================================== + It("survives continuous scaling and pod deletion while handling traffic", func() { + By("starting k6 load job") + var err error + k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) + Expect(err).NotTo(HaveOccurred()) + + By("executing chaos loop") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + + By(fmt.Sprintf("iteration %d: scaling cluster up", iteration)) + newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, newSize)).To(Succeed()) + + // Poll for StatefulSet to acknowledge the scale instead of fixed sleep + Eventually(func() int32 { + replicas, _ := framework.GetStatefulSetReplicas(ctx, k8sClientset, namespace.Name, clusterName) + return replicas + }, scaleAckTimeout, scalePollInterval).Should(Equal(newSize)) + + By(fmt.Sprintf("iteration %d: deleting random redis pods", 
iteration)) + deleteCount := rng.Intn(int(newSize)/2) + 1 + deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, deleteCount, rng) + Expect(err).NotTo(HaveOccurred()) + GinkgoWriter.Printf("Deleted pods: %v\n", deleted) + + By(fmt.Sprintf("iteration %d: waiting for cluster recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + By(fmt.Sprintf("iteration %d: scaling cluster down", iteration)) + downSize := int32(rng.Intn(3) + minPrimaries) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, downSize)).To(Succeed()) + + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + } + + By("verifying final cluster state") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("verifying k6 job completed successfully") + Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + }) + + // ================================================================================== + // Scenario 2: Chaos with Operator Deletion + // ================================================================================== + It("recovers when operator pod is deleted during chaos", func() { + By("starting k6 load job") + var err error + k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) + Expect(err).NotTo(HaveOccurred()) + + By("executing chaos with operator deletion") + endTime := time.Now().Add(chaosDuration - 
chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + + By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) + Expect(framework.DeleteOperatorPod(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + _, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + + By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + // Rate limit between iterations + time.Sleep(chaosRateLimitDelay) + } + + By("verifying final cluster state") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + }) + + // ================================================================================== + // Scenario 3: Chaos with Robin Deletion + // ================================================================================== + It("recovers when robin pods are deleted during chaos", func() { + By("starting k6 load job") + var err error + k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) + Expect(err).NotTo(HaveOccurred()) + + By("executing chaos with robin deletion") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + + By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) + _, _ = framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, 
rng) + + By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + _, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + + By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + // Rate limit between iterations + time.Sleep(chaosRateLimitDelay) + } + + By("verifying final cluster state") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + }) + + // ================================================================================== + // Scenario 4: Full Chaos (Operator + Robin + Redis) + // ================================================================================== + It("recovers from full chaos deleting operator, robin, and redis pods", func() { + By("starting k6 load job") + var err error + k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) + Expect(err).NotTo(HaveOccurred()) + + By("executing full chaos") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Full chaos iteration %d ===\n", iteration) + + action := rng.Intn(4) + + switch action { + case 0: + By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) + Expect(framework.DeleteOperatorPod(ctx, k8sClientset, namespace.Name)).To(Succeed()) + case 1: + By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) + _, _ = framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + case 2: + By(fmt.Sprintf("iteration %d: deleting random redis 
pods", iteration)) + _, _ = framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + case 3: + By(fmt.Sprintf("iteration %d: scaling cluster", iteration)) + newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, newSize)).To(Succeed()) + } + + By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + // Rate limit between chaos actions + time.Sleep(chaosIterationDelay) + } + + By("verifying final cluster state") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + }) +}) + +var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), func() { + var ( + namespace *corev1.Namespace + ) + + BeforeEach(func() { + var err error + + namespace, err = framework.CreateNamespace(ctx, k8sClientset, fmt.Sprintf("chaos-topo-%d", GinkgoParallelProcess())) + Expect(err).NotTo(HaveOccurred(), "failed to create namespace") + + By("deploying operator in namespace") + Expect(framework.EnsureOperatorSetup(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + Eventually(func() bool { + dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redis-operator", metav1.GetOptions{}) + return err == nil && dep.Status.AvailableReplicas >= 1 + }, operatorReadyTimeout, operatorPollInterval).Should(BeTrue()) + + By("creating Redis cluster with 5 primaries") + 
Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries)).To(Succeed()) + + By("waiting for cluster to be ready") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + }) + + AfterEach(func() { + if CurrentSpecReport().Failed() { + collectDiagnostics(namespace.Name) + } + _ = framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace) + }) + + // ================================================================================== + // Scenario 5: Slot Ownership Conflict Recovery + // ================================================================================== + It("heals slot ownership conflicts when operator and robin restart", func() { + By("verifying cluster is ready") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + By("scaling operator to 0") + Expect(framework.ScaleOperatorDown(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + By("scaling robin to 0") + Expect(framework.ScaleRobinDown(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("corrupting slot ownership via redis-cli") + Expect(framework.CorruptSlotOwnership(ctx, k8sClientset, namespace.Name, clusterName, 0)).To(Succeed()) + + By("scaling robin to 1") + Expect(framework.ScaleRobinUp(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("scaling operator to 1") + Expect(framework.ScaleOperatorUp(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + By("waiting for cluster to heal") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + }) + + // ================================================================================== + // Scenario 6: 
Mid-Migration Slot Recovery + // ================================================================================== + It("recovers from mid-migration slots when operator and robin restart", func() { + By("verifying cluster is ready") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + By("scaling operator to 0") + Expect(framework.ScaleOperatorDown(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + By("scaling robin to 0") + Expect(framework.ScaleRobinDown(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("setting slot to migrating state via redis-cli") + Expect(framework.SetSlotMigrating(ctx, k8sClientset, namespace.Name, clusterName, 100)).To(Succeed()) + + By("scaling robin to 1") + Expect(framework.ScaleRobinUp(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("scaling operator to 1") + Expect(framework.ScaleOperatorUp(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + By("waiting for cluster to heal") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + }) + + // ================================================================================== + // Scenario 7: Primary to Replica Demotion Recovery + // ================================================================================== + It("recovers from forced primary to replica demotion", func() { + By("verifying cluster is ready") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + + targetPod := clusterName + "-0" + + By("scaling operator to 0") + Expect(framework.ScaleOperatorDown(ctx, k8sClientset, 
namespace.Name)).To(Succeed()) + + By("scaling robin to 0") + Expect(framework.ScaleRobinDown(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("forcing primary to become replica via redis-cli") + Expect(framework.ForcePrimaryToReplica(ctx, k8sClientset, namespace.Name, clusterName, targetPod)).To(Succeed()) + + By("scaling robin to 1") + Expect(framework.ScaleRobinUp(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + + By("scaling operator to 1") + Expect(framework.ScaleOperatorUp(ctx, k8sClientset, namespace.Name)).To(Succeed()) + + By("waiting for cluster to heal") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + }) +}) + +// collectDiagnostics collects logs and state for debugging failed tests. +func collectDiagnostics(namespace string) { + GinkgoWriter.Printf("\n=== COLLECTING DIAGNOSTICS FOR NAMESPACE %s ===\n", namespace) + + // Get cluster status + cluster, err := framework.GetRedkeyCluster(ctx, dynamicClient, namespace, clusterName) + if err == nil { + GinkgoWriter.Printf("Cluster status: %s\n", cluster.Status.Status) + GinkgoWriter.Printf("Cluster conditions: %+v\n", cluster.Status.Conditions) + } + + // List pods with status + pods, err := k8sClientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) + if err == nil { + GinkgoWriter.Printf("\nPods in namespace:\n") + for _, pod := range pods.Items { + GinkgoWriter.Printf(" %s: Phase=%s\n", pod.Name, pod.Status.Phase) + } + } + + // Capture operator pod logs + GinkgoWriter.Printf("\n--- Operator Pod Logs (last %d lines) ---\n", diagnosticsLogTail) + operatorLogs, err := framework.GetPodLogs(ctx, k8sClientset, namespace, framework.OperatorPodsSelector(), diagnosticsLogTail) + if err == nil { + GinkgoWriter.Printf("%s\n", operatorLogs) + } else { + GinkgoWriter.Printf("Failed 
to get operator logs: %v\n", err) + } + + // Capture first redis pod logs + GinkgoWriter.Printf("\n--- Redis Pod Logs (last %d lines, first pod) ---\n", diagnosticsLogTail) + redisLogs, err := framework.GetPodLogs(ctx, k8sClientset, namespace, framework.RedisPodsSelector(clusterName), diagnosticsLogTail) + if err == nil { + GinkgoWriter.Printf("%s\n", redisLogs) + } else { + GinkgoWriter.Printf("Failed to get redis logs: %v\n", err) + } + + GinkgoWriter.Printf("=== END DIAGNOSTICS ===\n\n") +} diff --git a/test/chaos/framework/clients.go b/test/chaos/framework/clients.go new file mode 100644 index 0000000..3b16959 --- /dev/null +++ b/test/chaos/framework/clients.go @@ -0,0 +1,144 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package framework + +import ( + "bytes" + "context" + "fmt" + "io" + "os" + "path/filepath" + "sync" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/client-go/tools/remotecommand" + "k8s.io/client-go/util/homedir" +) + +// Label selectors for Redis operator components. +const ( + redisComponentLabel = "redis.redkeycluster.operator/component" + clusterNameLabel = "redkey-cluster-name" +) + +// RedisPodsSelector returns the label selector for Redis pods in a cluster. +func RedisPodsSelector(clusterName string) string { + return fmt.Sprintf("%s=%s,%s=redis", clusterNameLabel, clusterName, redisComponentLabel) +} + +// RobinPodsSelector returns the label selector for Robin pods in a cluster. +func RobinPodsSelector(clusterName string) string { + return fmt.Sprintf("%s=%s,%s=robin", clusterNameLabel, clusterName, redisComponentLabel) +} + +// OperatorPodsSelector returns the label selector for operator pods. 
+func OperatorPodsSelector() string { + return "control-plane=redkey-operator" +} + +// Cached REST config for RemoteCommand. +var ( + cachedConfig *rest.Config + configOnce sync.Once + configErr error +) + +// getCachedConfig returns a cached REST config, creating it once. +func getCachedConfig() (*rest.Config, error) { + configOnce.Do(func() { + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + kubeconfig = filepath.Join(homedir.HomeDir(), ".kube", "config") + } + cachedConfig, configErr = clientcmd.BuildConfigFromFlags("", kubeconfig) + }) + return cachedConfig, configErr +} + +// RemoteCommand executes a command in a pod and returns stdout, stderr, error. +// Uses a cached REST config for efficiency. +func RemoteCommand(ctx context.Context, namespace, podName, command string) (string, string, error) { + config, err := getCachedConfig() + if err != nil { + return "", "", fmt.Errorf("get config: %w", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return "", "", fmt.Errorf("create clientset: %w", err) + } + + buf := &bytes.Buffer{} + errBuf := &bytes.Buffer{} + + request := clientset.CoreV1().RESTClient(). + Post(). + Namespace(namespace). + Resource("pods"). + Name(podName). + SubResource("exec"). + VersionedParams(&corev1.PodExecOptions{ + Stdout: true, + Stderr: true, + TTY: false, + Command: []string{"/bin/sh", "-c", command}, + }, scheme.ParameterCodec) + + executor, err := remotecommand.NewSPDYExecutor(config, "POST", request.URL()) + if err != nil { + return "", "", fmt.Errorf("create executor: %w", err) + } + + err = executor.StreamWithContext(ctx, remotecommand.StreamOptions{ + Stdin: nil, + Stdout: buf, + Stderr: errBuf, + Tty: false, + }) + if err != nil { + return buf.String(), errBuf.String(), err + } + return buf.String(), errBuf.String(), nil +} + +// GetPodLogs returns the last N lines of logs from pods matching the selector. 
+func GetPodLogs(ctx context.Context, clientset kubernetes.Interface, namespace, labelSelector string, tailLines int64) (string, error) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: labelSelector, + }) + if err != nil { + return "", err + } + + if len(pods.Items) == 0 { + return "", fmt.Errorf("no pods found matching %s", labelSelector) + } + + // Get logs from the first pod + pod := pods.Items[0] + opts := &corev1.PodLogOptions{ + TailLines: &tailLines, + } + + req := clientset.CoreV1().Pods(namespace).GetLogs(pod.Name, opts) + stream, err := req.Stream(ctx) + if err != nil { + return "", fmt.Errorf("stream logs from %s: %w", pod.Name, err) + } + defer stream.Close() + + buf := new(bytes.Buffer) + if _, err := io.Copy(buf, stream); err != nil { + return "", fmt.Errorf("read logs: %w", err) + } + + return buf.String(), nil +} diff --git a/test/chaos/framework/cluster.go b/test/chaos/framework/cluster.go new file mode 100644 index 0000000..754df20 --- /dev/null +++ b/test/chaos/framework/cluster.go @@ -0,0 +1,259 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +// NOTE: This file is adapted from test/e2e/framework/redisclient.go for chaos tests. +// It contains the Redis cluster creation and management functions. 
+package framework + +import ( + "context" + "fmt" + "os" + "time" + + redkeyv1 "github.com/inditextech/redkeyoperator/api/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" + "k8s.io/utils/ptr" +) + +const ( + defaultConfig = `save "" +appendonly no +maxmemory 70mb` + defaultTimeout = 60 * time.Minute + defaultRedisImage = "redis:8.4.0" + defaultRobinImage = "localhost:5001/redkey-robin:dev" + version = "6.0.2" +) + +// GetRedisImage returns the redis image from environment or default. +func GetRedisImage() string { + if img := os.Getenv("REDIS_IMAGE"); img != "" { + return img + } + return defaultRedisImage +} + +// GetRobinImage returns the robin image from environment or default. +func GetRobinImage() string { + if img := os.Getenv("ROBIN_IMAGE"); img != "" { + return img + } + return defaultRobinImage +} + +// CreateRedkeyCluster creates a RedkeyCluster CR using dynamic client. +func CreateRedkeyCluster(ctx context.Context, dc dynamic.Interface, namespace, name string, primaries int32) error { + key := types.NamespacedName{Namespace: namespace, Name: name} + rc := buildRedkeyCluster(key, primaries, 0, "", GetRedisImage(), true, true, redkeyv1.Pdb{}, redkeyv1.RedkeyClusterOverrideSpec{}) + return EnsureRedkeyCluster(ctx, dc, rc) +} + +// GetStatefulSetReplicas returns the current replica count for a cluster's StatefulSet. 
+func GetStatefulSetReplicas(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) (int32, error) { + sts, err := clientset.AppsV1().StatefulSets(namespace).Get(ctx, clusterName, metav1.GetOptions{}) + if err != nil { + return 0, err + } + if sts.Spec.Replicas == nil { + return 0, nil + } + return *sts.Spec.Replicas, nil +} + +// buildRedkeyCluster constructs a RedkeyCluster object with the given parameters. +func buildRedkeyCluster( + key types.NamespacedName, + primaries, replicasPerPrimary int32, + storage, image string, + purgeKeys, ephemeral bool, + pdb redkeyv1.Pdb, + userOverride redkeyv1.RedkeyClusterOverrideSpec, +) *redkeyv1.RedkeyCluster { + rc := &redkeyv1.RedkeyCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: key.Name, + Namespace: key.Namespace, + Labels: map[string]string{"team": "team-a"}, + }, + Spec: redkeyv1.RedkeyClusterSpec{ + Auth: redkeyv1.RedisAuth{}, + Version: version, + Primaries: primaries, + Ephemeral: ephemeral, + Image: image, + Config: defaultConfig, + Resources: buildResources(), + PurgeKeysOnRebalance: &purgeKeys, + }, + } + + if storage != "" { + rc.Spec.DeletePVC = ptr.To(true) + rc.Spec.Ephemeral = false + rc.Spec.Storage = storage + rc.Spec.AccessModes = []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce} + } + + if pdb != (redkeyv1.Pdb{}) { + rc.Spec.Pdb = pdb + } + + if replicasPerPrimary > 0 { + rc.Spec.ReplicasPerPrimary = replicasPerPrimary + } + + // Build override with security context + var ov redkeyv1.RedkeyClusterOverrideSpec + if userOverride.StatefulSet != nil || userOverride.Service != nil { + ov = userOverride + } + + if ov.StatefulSet == nil { + ov.StatefulSet = &redkeyv1.PartialStatefulSet{ + Spec: &redkeyv1.PartialStatefulSetSpec{ + Template: &redkeyv1.PartialPodTemplateSpec{}, + }, + } + } + if ov.StatefulSet.Spec == nil { + ov.StatefulSet.Spec = &redkeyv1.PartialStatefulSetSpec{ + Template: &redkeyv1.PartialPodTemplateSpec{}, + } + } + if ov.StatefulSet.Spec.Template == 
nil { + ov.StatefulSet.Spec.Template = &redkeyv1.PartialPodTemplateSpec{} + } + podSpec := &ov.StatefulSet.Spec.Template.Spec + if podSpec.SecurityContext == nil { + podSpec.SecurityContext = &corev1.PodSecurityContext{} + } + podSpec.SecurityContext.RunAsNonRoot = ptr.To(true) + podSpec.SecurityContext.RunAsUser = ptr.To(int64(1001)) + podSpec.SecurityContext.RunAsGroup = ptr.To(int64(1001)) + podSpec.SecurityContext.FSGroup = ptr.To(int64(1001)) + + rc.Spec.Override = &ov + + // Robin configuration + robinImage := GetRobinImage() + rc.Spec.Robin = &redkeyv1.RobinSpec{ + Template: &corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "robin", + Image: robinImage, + ImagePullPolicy: corev1.PullIfNotPresent, + Ports: []corev1.ContainerPort{ + { + ContainerPort: 8080, + Name: "http", + Protocol: corev1.ProtocolTCP, + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: key.Name + "-robin-config", + MountPath: "/opt/conf/configmap", + }, + }, + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + SecurityContext: &corev1.SecurityContext{ + RunAsNonRoot: ptr.To(true), + RunAsUser: ptr.To(int64(1001)), + AllowPrivilegeEscalation: ptr.To(false), + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: key.Name + "-robin-config", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: key.Name + "-robin", + }, + DefaultMode: ptr.To(int32(420)), + }, + }, + }, + }, + }, + }, + Config: &redkeyv1.RobinConfig{ + Reconciler: &redkeyv1.RobinConfigReconciler{ + IntervalSeconds: ptr.To(30), + OperationCleanUpIntervalSeconds: ptr.To(30), + }, + Cluster: &redkeyv1.RobinConfigCluster{ + 
HealthProbePeriodSeconds: ptr.To(60), + HealingTimeSeconds: ptr.To(60), + MaxRetries: ptr.To(10), + BackOff: ptr.To(10), + }, + }, + } + + return rc +} + +// WaitForReady waits until the cluster status is Ready using dynamic client. +func WaitForReady(ctx context.Context, dc dynamic.Interface, key types.NamespacedName, timeout time.Duration) (*redkeyv1.RedkeyCluster, error) { + if timeout == 0 { + timeout = defaultTimeout + } + + var last string + if err := wait.PollUntilContextTimeout( + ctx, 3*time.Second, timeout, true, + func(ctx context.Context) (bool, error) { + rc, err := GetRedkeyCluster(ctx, dc, key.Namespace, key.Name) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return false, nil + } + last = rc.Status.Status + return last == redkeyv1.StatusReady, nil + }, + ); err != nil { + return nil, fmt.Errorf( + "timed out after %s waiting for Ready (last seen %q): %w", + timeout, last, err, + ) + } + + return GetRedkeyCluster(ctx, dc, key.Namespace, key.Name) +} + +func buildResources() *corev1.ResourceRequirements { + return &corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("256Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + } +} diff --git a/test/chaos/framework/crd.go b/test/chaos/framework/crd.go new file mode 100644 index 0000000..aeab143 --- /dev/null +++ b/test/chaos/framework/crd.go @@ -0,0 +1,118 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) 
+// +// SPDX-License-Identifier: Apache-2.0 + +package framework + +import ( + "context" + "encoding/json" + "fmt" + + redkeyv1 "github.com/inditextech/redkeyoperator/api/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" +) + +// RedkeyClusterGVR is the GroupVersionResource for RedkeyCluster CRD. +var RedkeyClusterGVR = schema.GroupVersionResource{ + Group: "redis.inditex.dev", + Version: "v1", + Resource: "redkeyclusters", +} + +// GetRedkeyCluster retrieves a RedkeyCluster by name. +func GetRedkeyCluster(ctx context.Context, dc dynamic.Interface, namespace, name string) (*redkeyv1.RedkeyCluster, error) { + unstr, err := dc.Resource(RedkeyClusterGVR).Namespace(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return nil, err + } + return unstructuredToRedkeyCluster(unstr) +} + +// CreateRedkeyClusterCR creates a RedkeyCluster CR using dynamic client. +func CreateRedkeyClusterCR(ctx context.Context, dc dynamic.Interface, rc *redkeyv1.RedkeyCluster) error { + unstr, err := redkeyClusterToUnstructured(rc) + if err != nil { + return fmt.Errorf("convert to unstructured: %w", err) + } + + _, err = dc.Resource(RedkeyClusterGVR).Namespace(rc.Namespace).Create(ctx, unstr, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("create RedkeyCluster %s/%s: %w", rc.Namespace, rc.Name, err) + } + return nil +} + +// UpdateRedkeyClusterCR updates an existing RedkeyCluster CR. 
+func UpdateRedkeyClusterCR(ctx context.Context, dc dynamic.Interface, rc *redkeyv1.RedkeyCluster) error { + unstr, err := redkeyClusterToUnstructured(rc) + if err != nil { + return fmt.Errorf("convert to unstructured: %w", err) + } + + _, err = dc.Resource(RedkeyClusterGVR).Namespace(rc.Namespace).Update(ctx, unstr, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("update RedkeyCluster %s/%s: %w", rc.Namespace, rc.Name, err) + } + return nil +} + +// EnsureRedkeyCluster creates or updates a RedkeyCluster CR. +func EnsureRedkeyCluster(ctx context.Context, dc dynamic.Interface, rc *redkeyv1.RedkeyCluster) error { + existing, err := GetRedkeyCluster(ctx, dc, rc.Namespace, rc.Name) + if err != nil { + if errors.IsNotFound(err) { + return CreateRedkeyClusterCR(ctx, dc, rc) + } + return err + } + + // Update existing - preserve resourceVersion + rc.ResourceVersion = existing.ResourceVersion + return UpdateRedkeyClusterCR(ctx, dc, rc) +} + +// ScaleRedkeyCluster updates the Primaries field of a RedkeyCluster. +func ScaleRedkeyCluster(ctx context.Context, dc dynamic.Interface, namespace, name string, primaries int32) error { + rc, err := GetRedkeyCluster(ctx, dc, namespace, name) + if err != nil { + return fmt.Errorf("get RedkeyCluster: %w", err) + } + + rc.Spec.Primaries = primaries + return UpdateRedkeyClusterCR(ctx, dc, rc) +} + +// redkeyClusterToUnstructured converts a RedkeyCluster to unstructured. +func redkeyClusterToUnstructured(rc *redkeyv1.RedkeyCluster) (*unstructured.Unstructured, error) { + // Ensure TypeMeta is set + rc.TypeMeta = metav1.TypeMeta{ + APIVersion: "redis.inditex.dev/v1", + Kind: "RedkeyCluster", + } + + data, err := json.Marshal(rc) + if err != nil { + return nil, err + } + + obj := &unstructured.Unstructured{} + if err := json.Unmarshal(data, &obj.Object); err != nil { + return nil, err + } + return obj, nil +} + +// unstructuredToRedkeyCluster converts unstructured to RedkeyCluster. 
+func unstructuredToRedkeyCluster(unstr *unstructured.Unstructured) (*redkeyv1.RedkeyCluster, error) { + rc := &redkeyv1.RedkeyCluster{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(unstr.Object, rc); err != nil { + return nil, err + } + return rc, nil +} diff --git a/test/chaos/framework/k6.go b/test/chaos/framework/k6.go new file mode 100644 index 0000000..4bafa27 --- /dev/null +++ b/test/chaos/framework/k6.go @@ -0,0 +1,261 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package framework + +import ( + "context" + "fmt" + "io" + "os" + "strings" + "time" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/utils/ptr" +) + +const ( + defaultK6Image = "localhost:5001/redkey-k6:dev" + k6JobTimeout = 30 * time.Minute + k6StartupTimeout = 2 * time.Minute + defaultK6VUs = 10 + k6ScriptConfigMap = "k6-scripts" +) + +// GetK6Image returns the k6 image from environment or default. +func GetK6Image() string { + if img := os.Getenv("K6_IMG"); img != "" { + return img + } + return defaultK6Image +} + +// StartK6LoadJob creates and starts a k6 Job for load testing. +// Returns the job name. 
+func StartK6LoadJob(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, duration time.Duration, vus int) (string, error) { + if vus <= 0 { + vus = defaultK6VUs + } + + // Get Redis pod IPs for REDIS_HOSTS + redisHosts, err := getRedisHosts(ctx, clientset, namespace, clusterName) + if err != nil { + return "", fmt.Errorf("failed to get redis hosts: %w", err) + } + + jobName := fmt.Sprintf("k6-load-%s", clusterName) + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: jobName, + Namespace: namespace, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(0)), + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "app": "k6-load", + "redkey-cluster-name": clusterName, + }, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + Containers: []corev1.Container{ + { + Name: "k6", + Image: GetK6Image(), + Args: []string{ + "run", + "/scripts/test-300k.js", + "--duration", formatDuration(duration), + "--vus", fmt.Sprintf("%d", vus), + }, + Env: []corev1.EnvVar{ + { + Name: "REDIS_HOSTS", + Value: redisHosts, + }, + }, + }, + }, + }, + }, + }, + } + + // Delete existing job if present + propagation := metav1.DeletePropagationForeground + _ = clientset.BatchV1().Jobs(namespace).Delete(ctx, jobName, metav1.DeleteOptions{ + PropagationPolicy: &propagation, + }) + + // Wait for deletion + _ = wait.PollUntilContextTimeout(ctx, time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { + _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{}) + return errors.IsNotFound(err), nil + }) + + if _, err := clientset.BatchV1().Jobs(namespace).Create(ctx, job, metav1.CreateOptions{}); err != nil { + return "", fmt.Errorf("failed to create k6 job: %w", err) + } + + // Wait for job pod to start + err = wait.PollUntilContextTimeout(ctx, 2*time.Second, k6StartupTimeout, true, func(ctx context.Context) (bool, error) { + pods, err := 
clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app=k6-load", + }) + if err != nil { + return false, nil + } + for _, pod := range pods.Items { + if pod.Status.Phase == corev1.PodRunning { + return true, nil + } + } + return false, nil + }) + if err != nil { + return jobName, fmt.Errorf("k6 job pod did not start: %w", err) + } + + return jobName, nil +} + +// WaitForK6JobCompletion waits for the k6 job to complete successfully. +func WaitForK6JobCompletion(ctx context.Context, clientset kubernetes.Interface, namespace, jobName string, timeout time.Duration) error { + if timeout == 0 { + timeout = k6JobTimeout + } + + return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { + job, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return false, fmt.Errorf("k6 job not found") + } + return false, nil + } + + for _, condition := range job.Status.Conditions { + if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue { + return true, nil + } + if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { + return false, fmt.Errorf("k6 job failed: %s", condition.Message) + } + } + + return false, nil + }) +} + +// DeleteK6Job deletes the k6 job and its pods. +func DeleteK6Job(ctx context.Context, clientset kubernetes.Interface, namespace, jobName string) error { + if jobName == "" { + return nil + } + + propagation := metav1.DeletePropagationForeground + err := clientset.BatchV1().Jobs(namespace).Delete(ctx, jobName, metav1.DeleteOptions{ + PropagationPolicy: &propagation, + }) + if errors.IsNotFound(err) { + return nil + } + return err +} + +// GetK6JobLogs returns the logs from the k6 job pod. 
+func GetK6JobLogs(ctx context.Context, clientset kubernetes.Interface, namespace, jobName string) (string, error) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: fmt.Sprintf("app=k6-load,job-name=%s", jobName), + }) + if err != nil { + return "", err + } + + if len(pods.Items) == 0 { + // Try alternative label selector + pods, err = clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: "app=k6-load", + }) + if err != nil { + return "", err + } + } + + if len(pods.Items) == 0 { + return "", fmt.Errorf("no k6 pods found") + } + + // Get logs from the first pod using proper log API + pod := pods.Items[0] + tailLines := int64(1000) + opts := &corev1.PodLogOptions{ + TailLines: &tailLines, + } + req := clientset.CoreV1().Pods(namespace).GetLogs(pod.Name, opts) + stream, err := req.Stream(ctx) + if err != nil { + return fmt.Sprintf("Pod %s completed but failed to get logs: %v", pod.Name, err), nil + } + defer stream.Close() + + var buf strings.Builder + if _, err := io.Copy(&buf, stream); err != nil { + return fmt.Sprintf("Pod %s completed but failed to read logs: %v", pod.Name, err), nil + } + + return buf.String(), nil +} + +// getRedisHosts returns a comma-separated list of redis host:port for k6. 
+func getRedisHosts(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) (string, error) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: fmt.Sprintf("redkey-cluster-name=%s,redis.redkeycluster.operator/component=redis", clusterName), + }) + if err != nil { + return "", err + } + + if len(pods.Items) == 0 { + return "", fmt.Errorf("no redis pods found") + } + + var hosts []string + for _, pod := range pods.Items { + if pod.Status.PodIP != "" { + hosts = append(hosts, fmt.Sprintf("%s:6379", pod.Status.PodIP)) + } + } + + if len(hosts) == 0 { + return "", fmt.Errorf("no redis pod IPs found") + } + + return strings.Join(hosts, ","), nil +} + +// formatDuration formats a duration for k6 (e.g., "10m", "1h30m"). +func formatDuration(d time.Duration) string { + if d < time.Minute { + return fmt.Sprintf("%ds", int(d.Seconds())) + } + if d < time.Hour { + return fmt.Sprintf("%dm", int(d.Minutes())) + } + hours := int(d.Hours()) + minutes := int(d.Minutes()) % 60 + if minutes == 0 { + return fmt.Sprintf("%dh", hours) + } + return fmt.Sprintf("%dh%dm", hours, minutes) +} diff --git a/test/chaos/framework/namespace.go b/test/chaos/framework/namespace.go new file mode 100644 index 0000000..395dfdf --- /dev/null +++ b/test/chaos/framework/namespace.go @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +// NOTE: This file is adapted from test/e2e/framework/namespace.go for chaos tests. 
package framework

import (
	"context"
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/util/retry"
)

const (
	// defaultNamespaceWait bounds how long we wait for a namespace to appear
	// after creation and to disappear after deletion.
	defaultNamespaceWait = 100 * time.Second
	// defaultNamespacePoll is the interval between existence checks.
	defaultNamespacePoll = 1 * time.Second
)

// CreateNamespace creates a namespace with a GenerateName prefix and waits for it to be ready.
func CreateNamespace(ctx context.Context, clientset kubernetes.Interface, prefix string) (*corev1.Namespace, error) {
	ns := &corev1.Namespace{
		ObjectMeta: metav1.ObjectMeta{
			// GenerateName lets the API server append a random suffix, so
			// repeated or parallel runs never collide on namespace names.
			GenerateName: prefix + "-",
		},
	}
	created, err := clientset.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{})
	if err != nil {
		return nil, err
	}

	// Poll until the namespace is visible via Get; transient errors are
	// swallowed and retried until the timeout.
	err = wait.PollUntilContextTimeout(ctx, defaultNamespacePoll, defaultNamespaceWait, true, func(ctx context.Context) (bool, error) {
		_, err := clientset.CoreV1().Namespaces().Get(ctx, created.Name, metav1.GetOptions{})
		return err == nil, nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to wait for namespace %s to be ready: %w", created.Name, err)
	}
	return created, nil
}

// DeleteNamespace tears down everything in the namespace, including
// RedkeyCluster CRs with finalizers, then deletes the namespace itself.
// A nil namespace is a no-op. The order matters: finalizers are stripped
// before deleting, otherwise namespace deletion would block forever on a
// CR whose operator (possibly scaled down by the chaos test) never
// removes its finalizer.
func DeleteNamespace(ctx context.Context, clientset kubernetes.Interface, dc dynamic.Interface, ns *corev1.Namespace) error {
	if ns == nil {
		return nil
	}

	// 1) Remove any RedkeyCluster CRs so their finalizers don't stall namespace deletion
	rcList, err := dc.Resource(RedkeyClusterGVR).Namespace(ns.Name).List(ctx, metav1.ListOptions{})
	if err != nil && !errors.IsNotFound(err) {
		return fmt.Errorf("list RedkeyClusters in %s: %w", ns.Name, err)
	}

	if rcList != nil {
		for _, item := range rcList.Items {
			name := item.GetName()

			// Strip finalizers with retry: the operator may be updating the
			// CR concurrently, so this read-modify-write can hit conflicts.
			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
				rc, err := dc.Resource(RedkeyClusterGVR).Namespace(ns.Name).Get(ctx, name, metav1.GetOptions{})
				if err != nil {
					return err
				}
				rc.SetFinalizers(nil)
				_, err = dc.Resource(RedkeyClusterGVR).Namespace(ns.Name).Update(ctx, rc, metav1.UpdateOptions{})
				return err
			})
			if err != nil && !errors.IsNotFound(err) {
				return fmt.Errorf("removing finalizers from %s/%s: %w", ns.Name, name, err)
			}

			// Delete the CR immediately
			if err := dc.Resource(RedkeyClusterGVR).Namespace(ns.Name).Delete(ctx, name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
				return fmt.Errorf("deleting RedkeyCluster %s/%s: %w", ns.Name, name, err)
			}
		}
	}

	// 2) Delete the namespace
	if err := clientset.CoreV1().Namespaces().Delete(ctx, ns.Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
		return fmt.Errorf("deleting namespace %s: %w", ns.Name, err)
	}

	// 3) Wait for the namespace to actually disappear
	err = wait.PollUntilContextTimeout(ctx, defaultNamespacePoll, defaultNamespaceWait, true, func(ctx context.Context) (bool, error) {
		_, err := clientset.CoreV1().Namespaces().Get(ctx, ns.Name, metav1.GetOptions{})
		return errors.IsNotFound(err), nil
	})
	if err != nil {
		return fmt.Errorf("namespace %s should be gone: %w", ns.Name, err)
	}
	return nil
}
diff --git
a/test/chaos/framework/operator.go b/test/chaos/framework/operator.go new file mode 100644 index 0000000..ca60b7f --- /dev/null +++ b/test/chaos/framework/operator.go @@ -0,0 +1,134 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package framework + +import ( + "context" + "fmt" + "os" + "time" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes" + "k8s.io/utils/ptr" +) + +const ( + operatorDeploymentName = "redis-operator" + operatorScaleTimeout = 2 * time.Minute + + defaultOperatorImage = "localhost:5001/redkey-operator:dev" +) + +// GetOperatorImage returns the operator image from environment or default. +func GetOperatorImage() string { + if img := os.Getenv("OPERATOR_IMAGE"); img != "" { + return img + } + return defaultOperatorImage +} + +// ScaleOperatorDown scales the operator deployment to 0 replicas and waits for termination. +func ScaleOperatorDown(ctx context.Context, clientset kubernetes.Interface, namespace string) error { + return scaleDeploymentNative(ctx, clientset, namespace, operatorDeploymentName, 0) +} + +// ScaleOperatorUp scales the operator deployment to 1 replica and waits for readiness. +func ScaleOperatorUp(ctx context.Context, clientset kubernetes.Interface, namespace string) error { + return scaleDeploymentNative(ctx, clientset, namespace, operatorDeploymentName, 1) +} + +// DeleteOperatorPod deletes the operator pod (the deployment will recreate it). 
+func DeleteOperatorPod(ctx context.Context, clientset kubernetes.Interface, namespace string) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: OperatorPodsSelector(), + }) + if err != nil { + return fmt.Errorf("failed to list operator pods: %w", err) + } + + if len(pods.Items) == 0 { + return nil + } + + for _, pod := range pods.Items { + if err := clientset.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete operator pod %s: %w", pod.Name, err) + } + } + + // Wait for at least one operator pod to be ready again + return wait.PollUntilContextTimeout(ctx, 2*time.Second, operatorScaleTimeout, true, func(ctx context.Context) (bool, error) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: OperatorPodsSelector(), + }) + if err != nil { + return false, nil + } + + for _, pod := range pods.Items { + if pod.Status.Phase == corev1.PodRunning && isPodReady(&pod) { + return true, nil + } + } + return false, nil + }) +} + +// scaleDeploymentNative scales a deployment to the desired replica count using native client-go. 
+func scaleDeploymentNative(ctx context.Context, clientset kubernetes.Interface, namespace, name string, replicas int32) error { + dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get deployment %s: %w", name, err) + } + + dep.Spec.Replicas = ptr.To(replicas) + if _, err := clientset.AppsV1().Deployments(namespace).Update(ctx, dep, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to scale deployment %s: %w", name, err) + } + + if replicas == 0 { + return waitForDeploymentScaleDownNative(ctx, clientset, namespace, name) + } + return waitForDeploymentReadyNative(ctx, clientset, namespace, name, replicas) +} + +// waitForDeploymentScaleDownNative waits until no pods exist for the deployment. +func waitForDeploymentScaleDownNative(ctx context.Context, clientset kubernetes.Interface, namespace, name string) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, operatorScaleTimeout, true, func(ctx context.Context) (bool, error) { + dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return true, nil + } + return false, nil + } + return dep.Status.Replicas == 0, nil + }) +} + +// waitForDeploymentReadyNative waits until the deployment has the desired ready replicas. +func waitForDeploymentReadyNative(ctx context.Context, clientset kubernetes.Interface, namespace, name string, replicas int32) error { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, operatorScaleTimeout, true, func(ctx context.Context) (bool, error) { + dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return false, nil + } + return dep.Status.ReadyReplicas >= replicas, nil + }) +} + +// isPodReady returns true if all containers in the pod are ready. 
+func isPodReady(pod *corev1.Pod) bool { + for _, condition := range pod.Status.Conditions { + if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { + return true + } + } + return false +} diff --git a/test/chaos/framework/operator_setup.go b/test/chaos/framework/operator_setup.go new file mode 100644 index 0000000..2538ad8 --- /dev/null +++ b/test/chaos/framework/operator_setup.go @@ -0,0 +1,224 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +// NOTE: This file is adapted from test/e2e/operator_test.go for chaos tests. +// It contains the operator setup functions needed to deploy the operator in the test namespace. +package framework + +import ( + "context" + "fmt" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/utils/ptr" +) + +// EnsureOperatorSetup creates the ServiceAccount, RBAC, ConfigMap and Deployment for the operator. 
func EnsureOperatorSetup(ctx context.Context, clientset kubernetes.Interface, namespace string) error {
	// Idempotent: every Create tolerates AlreadyExists so the setup can be
	// re-run against a namespace that already hosts the operator.

	// Create ServiceAccount
	if _, err := clientset.CoreV1().ServiceAccounts(namespace).Create(ctx, newServiceAccount(namespace), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure ServiceAccount: %w", err)
	}

	// Create Roles
	if _, err := clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "leader-election-role", leaderElectionPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure leader-election-role: %w", err)
	}
	if _, err := clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "redis-operator-role", operatorPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure redis-operator-role: %w", err)
	}

	// Create RoleBindings
	if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "leader-election-rolebinding", "leader-election-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure leader-election-rolebinding: %w", err)
	}
	if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "redis-operator-rolebinding", "redis-operator-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure redis-operator-rolebinding: %w", err)
	}

	// Create ConfigMap
	if _, err := clientset.CoreV1().ConfigMaps(namespace).Create(ctx, newConfigMap(namespace), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure ConfigMap: %w", err)
	}

	// Create Deployment
	if _, err := clientset.AppsV1().Deployments(namespace).Create(ctx, newOperatorDeployment(namespace), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) {
		return fmt.Errorf("ensure Deployment: %w", err)
	}

	return nil
}

// newServiceAccount builds the operator's ServiceAccount for namespace ns.
func newServiceAccount(ns string) *corev1.ServiceAccount {
	return &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "redis-operator-sa",
			Namespace: ns,
		},
	}
}

// newRole builds a namespaced Role with the given rules.
func newRole(ns, name string, rules []rbacv1.PolicyRule) *rbacv1.Role {
	return &rbacv1.Role{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
		Rules:      rules,
	}
}

// newRoleBinding binds roleName to the operator's ServiceAccount in ns.
func newRoleBinding(ns, name, roleName string) *rbacv1.RoleBinding {
	return &rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns},
		RoleRef:    rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: roleName},
		Subjects:   []rbacv1.Subject{{Kind: "ServiceAccount", Name: "redis-operator-sa", Namespace: ns}},
	}
}

// newConfigMap builds the operator's controller-manager configuration.
func newConfigMap(ns string) *corev1.ConfigMap {
	return &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{Name: "redis-operator-config", Namespace: ns},
		Data: map[string]string{
			"redis_operator_config.yaml": `apiVersion: controller-runtime.sigs.k8s.io/v1alpha1
kind: ControllerManagerConfig
health:
  healthProbeBindAddress: ":8081"
metrics:
  bindAddress: "127.0.0.1:8080"
leaderElection:
  leaderElect: true
  resourceName: db95d8a6.inditex.com
`,
		},
	}
}

// newOperatorDeployment builds the single-replica operator Deployment.
// The DOMAIN/ENVIRONMENT/TENANT label values are literal placeholders
// carried over from the e2e manifests; the tests do not select on them.
func newOperatorDeployment(ns string) *appsv1.Deployment {
	replicas := int32(1)
	return &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "redis-operator",
			Namespace: ns,
			Labels:    map[string]string{"control-plane": "redkey-operator"},
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"control-plane": "redkey-operator"}},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{
						"control-plane": "redkey-operator",
						"domain":        "DOMAIN",
						"environment":   "ENVIRONMENT",
						"layer":         "middleware-redkeyoperator",
						"slot":          "default",
						"tenant":        "TENANT",
						"type":          "middleware",
					},
				},
				Spec: corev1.PodSpec{
					ServiceAccountName:            "redis-operator-sa",
					SecurityContext:               &corev1.PodSecurityContext{RunAsNonRoot: ptr.To(true)},
					TerminationGracePeriodSeconds: ptr.To(int64(10)),
					Containers:                    []corev1.Container{newOperatorContainer(ns)},
				},
			},
		},
	}
}

// newOperatorContainer builds the manager container, watching only ns.
func newOperatorContainer(ns string) corev1.Container {
	return corev1.Container{
		Name:    "redis-operator",
		Image:   GetOperatorImage(),
		Command: []string{"/manager"},
		Args:    []string{"--leader-elect", "--max-concurrent-reconciles", "10"},
		// WATCH_NAMESPACE confines the operator to the test namespace.
		Env:             []corev1.EnvVar{{Name: "WATCH_NAMESPACE", Value: ns}},
		ImagePullPolicy: corev1.PullIfNotPresent,
		SecurityContext: &corev1.SecurityContext{AllowPrivilegeEscalation: ptr.To(false)},
		Resources: corev1.ResourceRequirements{
			Limits:   corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("500m"), corev1.ResourceMemory: resource.MustParse("500Mi")},
			Requests: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("300m"), corev1.ResourceMemory: resource.MustParse("250Mi")},
		},
	}
}

// leaderElectionPolicyRules returns the rules needed for leader election
// (leases, the legacy configmap lock, and events).
func leaderElectionPolicyRules() []rbacv1.PolicyRule {
	return []rbacv1.PolicyRule{
		{
			APIGroups: []string{"coordination.k8s.io"},
			Resources: []string{"leases"},
			Verbs:     []string{"get", "list", "watch", "create", "update", "patch"},
		},
		{
			APIGroups: []string{""},
			Resources: []string{"configmaps"},
			Verbs:     []string{"get", "list", "watch", "create", "update", "patch"},
		},
		{
			APIGroups: []string{""},
			Resources: []string{"events"},
			Verbs:     []string{"create", "patch"},
		},
	}
}

// operatorPolicyRules returns the rules the operator needs to manage
// RedkeyClusters and the workloads it creates for them.
func operatorPolicyRules() []rbacv1.PolicyRule {
	return []rbacv1.PolicyRule{
		{
			APIGroups: []string{""},
			Resources: []string{"configmaps", "pods", "services"},
			Verbs:     []string{"get", "list", "watch"},
		},
		{
			APIGroups: []string{""},
			Resources: []string{"configmaps", "services"},
			Verbs:     []string{"create", "update", "delete", "get", "list"},
		},
		{
			APIGroups: []string{""},
			Resources: []string{"events"},
			Verbs:     []string{"create", "patch"},
		},
		{
			APIGroups: []string{""},
			Resources: []string{"secrets"},
			Verbs:     []string{"get"},
		},
		{
			APIGroups: []string{""},
			Resources: []string{"persistentvolumeclaims"},
			Verbs:     []string{"get", "list", "watch", "delete", "deletecollection"},
		},
		{
			APIGroups: []string{"apps"},
			Resources: []string{"deployments", "statefulsets"},
			Verbs:     []string{"create", "delete", "get", "list", "patch", "update", "watch"},
		},
		{
			APIGroups: []string{"policy"},
			Resources: []string{"poddisruptionbudgets"},
			Verbs:     []string{"create", "delete", "get", "list", "patch", "update", "watch"},
		},
		{
			APIGroups: []string{"redis.inditex.dev"},
			Resources: []string{"redkeyclusters"},
			Verbs:     []string{"create", "delete", "get", "list", "patch", "update", "watch"},
		},
		{
			APIGroups: []string{"redis.inditex.dev"},
			Resources: []string{"redkeyclusters/finalizers"},
			Verbs:     []string{"update"},
		},
		{
			APIGroups: []string{"redis.inditex.dev"},
			Resources: []string{"redkeyclusters/status"},
			Verbs:     []string{"get", "patch", "update"},
		},
	}
}
diff --git a/test/chaos/framework/readiness.go b/test/chaos/framework/readiness.go
new file mode 100644
index 0000000..83365ed
--- /dev/null
+++ b/test/chaos/framework/readiness.go
@@ -0,0 +1,154 @@
// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.)
//
// SPDX-License-Identifier: Apache-2.0

// Package framework provides helper functions for chaos tests.
+package framework + +import ( + "context" + "fmt" + "strings" + "time" + + redkeyv1 "github.com/inditextech/redkeyoperator/api/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" +) + +const ( + defaultChaosReadyTimeout = 10 * time.Minute + pollInterval = 2 * time.Second + maxConsecutiveErrors = 10 +) + +// WaitForChaosReady waits for the Redis cluster to be fully healthy. +// Checks: CR status == Ready, redis-cli --cluster check passes, no fail/migrating states. +func WaitForChaosReady(ctx context.Context, dc dynamic.Interface, clientset kubernetes.Interface, namespace, clusterName string, timeout time.Duration) error { + if timeout == 0 { + timeout = defaultChaosReadyTimeout + } + + var consecutiveErrors int + var lastErr error + + return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) { + // 1. Check CR status + cluster, err := GetRedkeyCluster(ctx, dc, namespace, clusterName) + if err != nil { + consecutiveErrors++ + lastErr = err + if !errors.IsNotFound(err) && consecutiveErrors > maxConsecutiveErrors { + return false, fmt.Errorf("persistent error getting cluster (after %d attempts): %w", consecutiveErrors, lastErr) + } + return false, nil + } + consecutiveErrors = 0 + + if cluster.Status.Status != redkeyv1.StatusReady { + return false, nil + } + + // 2. List redis pods + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return false, nil + } + + if len(pods.Items) == 0 { + return false, nil + } + + // 3. 
For each running pod, verify cluster health + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + return false, nil + } + + if !clusterCheckPasses(ctx, namespace, pod.Name) { + return false, nil + } + + if clusterNodesHasFailure(ctx, namespace, pod.Name) { + return false, nil + } + } + return true, nil + }) +} + +// clusterCheckPasses runs redis-cli --cluster check and returns true if it succeeds. +func clusterCheckPasses(ctx context.Context, namespace, podName string) bool { + stdout, _, err := RemoteCommand(ctx, namespace, podName, "redis-cli --cluster check localhost:6379") + if err != nil { + return false + } + return !strings.Contains(stdout, "[ERR]") +} + +// clusterNodesHasFailure checks if any node is in fail state or has migrating slots. +func clusterNodesHasFailure(ctx context.Context, namespace, podName string) bool { + stdout, _, err := RemoteCommand(ctx, namespace, podName, "redis-cli cluster nodes") + if err != nil { + return true + } + return strings.Contains(stdout, "fail") || strings.Contains(stdout, "->") +} + +// AssertAllSlotsAssigned verifies that all 16384 slots are assigned. +func AssertAllSlotsAssigned(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return err + } + + if len(pods.Items) == 0 { + return fmt.Errorf("no redis pods found") + } + + stdout, _, err := RemoteCommand(ctx, namespace, pods.Items[0].Name, "redis-cli cluster info") + if err != nil { + return fmt.Errorf("failed to get cluster info: %w", err) + } + + if !strings.Contains(stdout, "cluster_slots_ok:16384") { + return fmt.Errorf("not all slots assigned: %s", stdout) + } + + return nil +} + +// AssertNoNodesInFailState verifies no nodes are in fail state. 
+func AssertNoNodesInFailState(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return err + } + + if len(pods.Items) == 0 { + return fmt.Errorf("no redis pods found") + } + + for _, pod := range pods.Items { + stdout, _, err := RemoteCommand(ctx, namespace, pod.Name, "redis-cli cluster nodes") + if err != nil { + return fmt.Errorf("failed to get cluster nodes from %s: %w", pod.Name, err) + } + + if strings.Contains(stdout, "fail") { + return fmt.Errorf("node in fail state detected in pod %s: %s", pod.Name, stdout) + } + } + + return nil +} diff --git a/test/chaos/framework/redis_chaos.go b/test/chaos/framework/redis_chaos.go new file mode 100644 index 0000000..698906c --- /dev/null +++ b/test/chaos/framework/redis_chaos.go @@ -0,0 +1,274 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package framework + +import ( + "context" + "fmt" + "math/rand" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/util/retry" + "k8s.io/utils/ptr" +) + +const ( + robinScaleTimeout = 2 * time.Minute +) + +// DeleteRandomRedisPods deletes N random redis pods from the cluster. 
+func DeleteRandomRedisPods(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, count int, rng *rand.Rand) ([]string, error) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return nil, fmt.Errorf("failed to list redis pods: %w", err) + } + + if len(pods.Items) == 0 { + return nil, fmt.Errorf("no redis pods found to delete") + } + + // Limit count to available pods (never delete all) + maxDelete := len(pods.Items) - 1 + if maxDelete < 1 { + maxDelete = 1 + } + if count > maxDelete { + count = maxDelete + } + + // Shuffle and pick N pods + indices := rng.Perm(len(pods.Items)) + + var deleted []string + for i := 0; i < count && i < len(indices); i++ { + pod := pods.Items[indices[i]] + if err := clientset.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + return deleted, fmt.Errorf("failed to delete pod %s: %w", pod.Name, err) + } + deleted = append(deleted, pod.Name) + } + + return deleted, nil +} + +// DeleteRobinPods deletes N random robin pods from the cluster. 
+func DeleteRobinPods(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, count int, rng *rand.Rand) ([]string, error) { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RobinPodsSelector(clusterName), + }) + if err != nil { + return nil, fmt.Errorf("failed to list robin pods: %w", err) + } + + if len(pods.Items) == 0 { + return nil, nil + } + + // Shuffle and pick N pods + indices := rng.Perm(len(pods.Items)) + + var deleted []string + for i := 0; i < count && i < len(indices); i++ { + pod := pods.Items[indices[i]] + if err := clientset.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { + return deleted, fmt.Errorf("failed to delete robin pod %s: %w", pod.Name, err) + } + deleted = append(deleted, pod.Name) + } + + return deleted, nil +} + +// ScaleCluster scales the Redis cluster to the specified number of primaries. +func ScaleCluster(ctx context.Context, dc dynamic.Interface, namespace, clusterName string, primaries int32) error { + return retry.RetryOnConflict(retry.DefaultRetry, func() error { + return ScaleRedkeyCluster(ctx, dc, namespace, clusterName, primaries) + }) +} + +// ScaleRobinDown scales robin deployment to 0 replicas. +func ScaleRobinDown(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) error { + return scaleRobinDeploymentNative(ctx, clientset, namespace, clusterName, 0) +} + +// ScaleRobinUp scales robin deployment to 1 replica. +func ScaleRobinUp(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) error { + return scaleRobinDeploymentNative(ctx, clientset, namespace, clusterName, 1) +} + +// scaleRobinDeploymentNative finds and scales the robin deployment using native client-go. 
+func scaleRobinDeploymentNative(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, replicas int32) error { + robinDepName := clusterName + "-robin" + + dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, robinDepName, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) && replicas == 0 { + return nil + } + return fmt.Errorf("failed to get robin deployment %s: %w", robinDepName, err) + } + + dep.Spec.Replicas = ptr.To(replicas) + if _, err := clientset.AppsV1().Deployments(namespace).Update(ctx, dep, metav1.UpdateOptions{}); err != nil { + return fmt.Errorf("failed to scale robin deployment %s: %w", robinDepName, err) + } + + if replicas == 0 { + return wait.PollUntilContextTimeout(ctx, 2*time.Second, robinScaleTimeout, true, func(ctx context.Context) (bool, error) { + dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, robinDepName, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + return true, nil + } + return false, nil + } + return dep.Status.Replicas == 0, nil + }) + } + + return wait.PollUntilContextTimeout(ctx, 2*time.Second, robinScaleTimeout, true, func(ctx context.Context) (bool, error) { + dep, err := clientset.AppsV1().Deployments(namespace).Get(ctx, robinDepName, metav1.GetOptions{}) + if err != nil { + return false, nil + } + return dep.Status.ReadyReplicas >= replicas, nil + }) +} + +// CorruptSlotOwnership corrupts slot ownership by removing a slot and assigning it inconsistently. +// This requires operator and robin to be scaled down first. 
+func CorruptSlotOwnership(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, slot int) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return fmt.Errorf("failed to list redis pods: %w", err) + } + + if len(pods.Items) < 2 { + return fmt.Errorf("need at least 2 pods for slot corruption") + } + + // Get node IDs + nodeIDs := make([]string, 0, len(pods.Items)) + for _, pod := range pods.Items { + stdout, _, err := RemoteCommand(ctx, namespace, pod.Name, "redis-cli cluster nodes | grep myself | awk '{ print $1 }'") + if err != nil { + return fmt.Errorf("failed to get node ID from %s: %w", pod.Name, err) + } + nodeIDs = append(nodeIDs, trimNewline(stdout)) + } + + // Delete slot from all nodes + for _, pod := range pods.Items { + _, _, _ = RemoteCommand(ctx, namespace, pod.Name, fmt.Sprintf("redis-cli cluster delslots %d", slot)) + } + + // Assign slot to different nodes inconsistently (first two nodes) + _, _, err = RemoteCommand(ctx, namespace, pods.Items[0].Name, fmt.Sprintf("redis-cli cluster setslot %d node %s", slot, nodeIDs[0])) + if err != nil { + return fmt.Errorf("failed to setslot on first node: %w", err) + } + + _, _, err = RemoteCommand(ctx, namespace, pods.Items[1].Name, fmt.Sprintf("redis-cli cluster setslot %d node %s", slot, nodeIDs[1])) + if err != nil { + return fmt.Errorf("failed to setslot on second node: %w", err) + } + + return nil +} + +// SetSlotMigrating puts a slot in migrating/importing state across nodes. +// This requires operator and robin to be scaled down first. 
+func SetSlotMigrating(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, slot int) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return fmt.Errorf("failed to list redis pods: %w", err) + } + + if len(pods.Items) < 2 { + return fmt.Errorf("need at least 2 pods for slot migration corruption") + } + + // Get node IDs + nodeIDs := make([]string, 0, len(pods.Items)) + for _, pod := range pods.Items { + stdout, _, err := RemoteCommand(ctx, namespace, pod.Name, "redis-cli cluster nodes | grep myself | awk '{ print $1 }'") + if err != nil { + return fmt.Errorf("failed to get node ID from %s: %w", pod.Name, err) + } + nodeIDs = append(nodeIDs, trimNewline(stdout)) + } + + // Set slot as migrating from node0 to node1 + _, _, err = RemoteCommand(ctx, namespace, pods.Items[0].Name, fmt.Sprintf("redis-cli cluster setslot %d migrating %s", slot, nodeIDs[1])) + if err != nil { + return fmt.Errorf("failed to set slot migrating: %w", err) + } + + // Set slot as importing on node1 from node0 + _, _, err = RemoteCommand(ctx, namespace, pods.Items[1].Name, fmt.Sprintf("redis-cli cluster setslot %d importing %s", slot, nodeIDs[0])) + if err != nil { + return fmt.Errorf("failed to set slot importing: %w", err) + } + + return nil +} + +// ForcePrimaryToReplica forces a primary node to become a replica of another primary. +// This requires operator and robin to be scaled down first. 
+func ForcePrimaryToReplica(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName, podName string) error { + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return fmt.Errorf("failed to list redis pods: %w", err) + } + + // Find another primary to replicate + var targetNodeID string + for _, pod := range pods.Items { + if pod.Name == podName { + continue + } + stdout, _, err := RemoteCommand(ctx, namespace, pod.Name, "redis-cli cluster nodes | grep myself | awk '{ print $1 }'") + if err != nil { + continue + } + targetNodeID = trimNewline(stdout) + break + } + + if targetNodeID == "" { + return fmt.Errorf("no target node found for replication") + } + + // Delete all slots from the pod + _, _, _ = RemoteCommand(ctx, namespace, podName, "redis-cli cluster flushslots") + + // Make it replicate the target + _, _, err = RemoteCommand(ctx, namespace, podName, fmt.Sprintf("redis-cli cluster replicate %s", targetNodeID)) + if err != nil { + return fmt.Errorf("failed to replicate: %w", err) + } + + return nil +} + +func trimNewline(s string) string { + for len(s) > 0 && (s[len(s)-1] == '\n' || s[len(s)-1] == '\r') { + s = s[:len(s)-1] + } + return s +} diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go new file mode 100644 index 0000000..c45961e --- /dev/null +++ b/test/chaos/helpers_test.go @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package chaos + +import ( + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/inditextech/redkeyoperator/test/chaos/framework" +) + +// startK6OrFail starts a k6 load job and fails the test if it errors. 
+func startK6OrFail(namespace, clusterName string, duration time.Duration, vus int) string { + jobName, err := framework.StartK6LoadJob(ctx, k8sClientset, namespace, clusterName, duration, vus) + Expect(err).NotTo(HaveOccurred(), "failed to start k6 job") + return jobName +} + +// cleanupK6Job safely deletes a k6 job, ignoring errors. +func cleanupK6Job(namespace, jobName string) { + if jobName == "" { + return + } + _ = framework.DeleteK6Job(ctx, k8sClientset, namespace, jobName) +} + +// chaosLoop runs a chaos function repeatedly until the duration expires. +// Reserves time at the end for final checks. +func chaosLoop(duration time.Duration, chaosFn func(iteration int)) { + endTime := time.Now().Add(duration - chaosReserveTime) + iteration := 0 + + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + chaosFn(iteration) + } +} + +// verifyClusterHealthy runs all cluster health checks. +func verifyClusterHealthy(namespace, clusterName string) { + By("verifying cluster readiness") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + + By("verifying all slots assigned") + Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) + + By("verifying no nodes in fail state") + Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) +} + +// verifyK6Completed waits for k6 job to complete successfully. +func verifyK6Completed(namespace, jobName string, timeout time.Duration) { + By("verifying k6 job completed successfully") + Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace, jobName, timeout)).To(Succeed()) +} + +// waitForStatefulSetReplicas polls until the StatefulSet has the expected replica count. 
+func waitForStatefulSetReplicas(namespace, clusterName string, expectedReplicas int32) { + Eventually(func() int32 { + replicas, err := framework.GetStatefulSetReplicas(ctx, k8sClientset, namespace, clusterName) + if err != nil { + return -1 + } + return replicas + }, scaleAckTimeout, scalePollInterval).Should(Equal(expectedReplicas), + fmt.Sprintf("StatefulSet should have %d replicas", expectedReplicas)) +} diff --git a/test/chaos/k6.Dockerfile b/test/chaos/k6.Dockerfile new file mode 100644 index 0000000..8e6923b --- /dev/null +++ b/test/chaos/k6.Dockerfile @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +# +# SPDX-License-Identifier: Apache-2.0 + +FROM golang:1.25-alpine AS builder + +# install git and basic build tools so xk6 can fetch & build extensions +RUN apk add --no-cache git build-base ca-certificates + +RUN go install go.k6.io/xk6/cmd/xk6@latest +RUN xk6 build \ + --with github.com/grafana/xk6-redis \ + --output /k6 + +FROM alpine:3.23 +COPY --from=builder /k6 /usr/bin/k6 +COPY k6scripts/ /scripts/ +ENTRYPOINT ["/usr/bin/k6"] diff --git a/test/chaos/k6scripts/test-300k.js b/test/chaos/k6scripts/test-300k.js new file mode 100644 index 0000000..6f73c58 --- /dev/null +++ b/test/chaos/k6scripts/test-300k.js @@ -0,0 +1,57 @@ +/* + Test that imitates a redis-cluster that was causing rebalancing errors: + - Value Size: 1Byte - 300KBytes + - Timeout: 30s + - Some deletes + */ + +import redis from 'k6/x/redis'; +import { randomBytes } from 'k6/crypto'; +import { sleep } from 'k6'; + +const client = new redis.Client({ + cluster: { + nodes: __ENV.REDIS_HOSTS.split(',').map(node => `redis://${node}`), + }, +}); + +// Helper function to generate random-sized values +function generateRandomValue(maxBytes) { + const size = Math.floor(Math.random() * maxBytes) + 1; // Random size from 1 to maxBytes + return randomBytes(size).toString('base64'); +} + +export default function () { + const uniqueKey = 
`mykey_${__VU}_${__ITER}`; + const value = generateRandomValue(300000); + + // Set the key with a TTL of 30 seconds + client.set(uniqueKey, value, 30); + + // Randomly delete approximately 1 in 10 keys + if (Math.random() < 0.1) { + client.del(uniqueKey).then((deleted) => { + if (deleted === 1) { + console.log(`Key "${uniqueKey}" deleted.`); + } else { + console.warn(`Failed to delete key "${uniqueKey}".`); + } + }).catch((error) => { + console.error(`Error deleting key "${uniqueKey}": ${error}`); + }); + } else { + // Retrieve the value for the key + client.get(uniqueKey).then((retrievedValue) => { + if (retrievedValue === null) { + console.error(`Key "${uniqueKey}" not found!`); + } else { + console.log(`Retrieved value for key "${uniqueKey}": Length=${retrievedValue.length}`); + } + }).catch((error) => { + console.error(`Error retrieving key "${uniqueKey}": ${error}`); + }); + } + + // Sleep for a short random duration to simulate real-world load patterns + sleep(Math.random() * 0.1); // Sleep for up to 100ms +} diff --git a/test/chaos/suite_test.go b/test/chaos/suite_test.go new file mode 100644 index 0000000..db7f297 --- /dev/null +++ b/test/chaos/suite_test.go @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL, S.A. (INDITEX, S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + +package chaos + +import ( + "context" + "os" + "path/filepath" + "strconv" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +var ( + k8sClientset kubernetes.Interface + dynamicClient dynamic.Interface + ctx context.Context + cancel context.CancelFunc + chaosDuration time.Duration + chaosSeed int64 + chaosReadyTimeout = 10 * time.Minute +) + +func TestChaos(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Redkey Operator Chaos Test Suite", Label("chaos")) +} + +// SynchronizedBeforeSuite ensures cluster-level setup runs once across all +// parallel Ginkgo processes. The first process (process 1) performs the +// one-time setup, and all processes then create their own Kubernetes client. +var _ = SynchronizedBeforeSuite( + func() []byte { + By("verifying CRD directory exists (process 1)") + crdDir := filepath.Join("..", "..", "deployment") + _, err := os.Stat(crdDir) + Expect(err).NotTo(HaveOccurred(), "CRD directory %q must exist", crdDir) + + return nil + }, + func(_ []byte) { + By("creating Kubernetes clients") + + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + home, _ := os.UserHomeDir() + kubeconfig = filepath.Join(home, ".kube", "config") + } + + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + Expect(err).NotTo(HaveOccurred(), "failed to load kubeconfig from %s", kubeconfig) + Expect(cfg).NotTo(BeNil()) + + // Create native kubernetes clientset + k8sClientset, err = kubernetes.NewForConfig(cfg) + Expect(err).NotTo(HaveOccurred(), "failed to create Kubernetes clientset") + Expect(k8sClientset).NotTo(BeNil()) + + // Create dynamic client for CRD access + dynamicClient, err = dynamic.NewForConfig(cfg) + Expect(err).NotTo(HaveOccurred(), "failed to create dynamic client") + Expect(dynamicClient).NotTo(BeNil()) + + ctx, cancel = context.WithCancel(context.Background()) + + chaosDuration = parseDuration(os.Getenv("CHAOS_DURATION"), 10*time.Minute) + + if seedStr := os.Getenv("CHAOS_SEED"); seedStr != "" { + seed, err := 
strconv.ParseInt(seedStr, 10, 64) + if err == nil { + chaosSeed = seed + } else { + chaosSeed = GinkgoRandomSeed() + } + } else { + chaosSeed = GinkgoRandomSeed() + } + + GinkgoWriter.Printf("Chaos test configuration: duration=%v, seed=%d\n", chaosDuration, chaosSeed) + }, +) + +// SynchronizedAfterSuite ensures cleanup runs safely across all parallel processes. +var _ = SynchronizedAfterSuite( + func() { + By("cleaning up test context") + if cancel != nil { + cancel() + } + }, + func() { + By("final cleanup complete") + }, +) + +// parseDuration parses a duration string and returns a default if parsing fails. +func parseDuration(s string, defaultVal time.Duration) time.Duration { + if s == "" { + return defaultVal + } + d, err := time.ParseDuration(s) + if err != nil { + return defaultVal + } + return d +} From ce6be1b282cee579d48c8d415c491cb6754590ad Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 11 Mar 2026 14:00:54 +0100 Subject: [PATCH 02/20] test: fix deprecated naming in chaos --- Makefile | 2 +- .../redkey.inditex.dev_redkeyclusters.yaml | 3 --- test/chaos/chaos_suite_test.go | 24 ++++++++--------- test/chaos/framework/cluster.go | 4 +-- test/chaos/framework/crd.go | 4 +-- test/chaos/framework/operator.go | 2 +- test/chaos/framework/operator_setup.go | 26 +++++++++---------- 7 files changed, 31 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index c4eb169..db189cf 100644 --- a/Makefile +++ b/Makefile @@ -575,7 +575,7 @@ GINKGO_ENV ?= GOMAXPROCS=$(GOMAXPROCS) \ GINKGO_PACKAGES ?= ./test/e2e -.PHONY: test-e2e +.PHONY: install test-e2e test-e2e: process-manifests-crd ginkgo ## Execute e2e application test $(info $(M) running e2e tests...) 
@mkdir -p $(dir $(TEST_E2E_OUTPUT)) diff --git a/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml b/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml index a709b2e..36d269a 100644 --- a/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml +++ b/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml @@ -1,6 +1,3 @@ -# SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL S.A. (INDITEX S.A.) -# SPDX-License-Identifier: Apache-2.0 - --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index b0e0c36..bba3c85 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -13,8 +13,8 @@ import ( . "github.com/onsi/gomega" "github.com/inditextech/redkeyoperator/test/chaos/framework" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) const ( @@ -22,15 +22,15 @@ const ( defaultPrimaries = 5 // Chaos timing constants - chaosIterationDelay = 5 * time.Second // Delay between chaos iterations - chaosRateLimitDelay = 10 * time.Second // Delay for rate limiting between heavy operations - chaosReserveTime = 1 * time.Minute // Time reserved at end of chaos for final checks - k6CompletionBuffer = 5 * time.Minute // Buffer time for k6 job completion - operatorReadyTimeout = 2 * time.Minute // Timeout for operator to become ready - operatorPollInterval = 5 * time.Second // Poll interval for operator readiness - scaleAckTimeout = 30 * time.Second // Timeout for StatefulSet to acknowledge scale - scalePollInterval = 2 * time.Second // Poll interval for scale acknowledgment - diagnosticsLogTail = int64(100) // Number of log lines to capture for diagnostics + chaosIterationDelay = 5 * time.Second // Delay between chaos iterations + chaosRateLimitDelay = 10 * time.Second // Delay for rate limiting between heavy operations + chaosReserveTime = 1 * time.Minute // Time reserved at end 
of chaos for final checks + k6CompletionBuffer = 5 * time.Minute // Buffer time for k6 job completion + operatorReadyTimeout = 2 * time.Minute // Timeout for operator to become ready + operatorPollInterval = 5 * time.Second // Poll interval for operator readiness + scaleAckTimeout = 30 * time.Second // Timeout for StatefulSet to acknowledge scale + scalePollInterval = 2 * time.Second // Poll interval for scale acknowledgment + diagnosticsLogTail = int64(100) // Number of log lines to capture for diagnostics // Scaling bounds minPrimaries = 3 @@ -58,7 +58,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { Expect(framework.EnsureOperatorSetup(ctx, k8sClientset, namespace.Name)).To(Succeed()) Eventually(func() bool { - dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redis-operator", metav1.GetOptions{}) + dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redkey-operator", metav1.GetOptions{}) return err == nil && dep.Status.AvailableReplicas >= 1 }, operatorReadyTimeout, operatorPollInterval).Should(BeTrue()) @@ -268,7 +268,7 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun Expect(framework.EnsureOperatorSetup(ctx, k8sClientset, namespace.Name)).To(Succeed()) Eventually(func() bool { - dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redis-operator", metav1.GetOptions{}) + dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redkey-operator", metav1.GetOptions{}) return err == nil && dep.Status.AvailableReplicas >= 1 }, operatorReadyTimeout, operatorPollInterval).Should(BeTrue()) diff --git a/test/chaos/framework/cluster.go b/test/chaos/framework/cluster.go index 754df20..feacc7f 100644 --- a/test/chaos/framework/cluster.go +++ b/test/chaos/framework/cluster.go @@ -146,8 +146,8 @@ func buildRedkeyCluster( // Robin configuration robinImage := GetRobinImage() rc.Spec.Robin = &redkeyv1.RobinSpec{ - Template: 
&corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ + Template: &redkeyv1.PartialPodTemplateSpec{ + Spec: redkeyv1.PartialPodSpec{ Containers: []corev1.Container{ { Name: "robin", diff --git a/test/chaos/framework/crd.go b/test/chaos/framework/crd.go index aeab143..82b10a2 100644 --- a/test/chaos/framework/crd.go +++ b/test/chaos/framework/crd.go @@ -20,7 +20,7 @@ import ( // RedkeyClusterGVR is the GroupVersionResource for RedkeyCluster CRD. var RedkeyClusterGVR = schema.GroupVersionResource{ - Group: "redis.inditex.dev", + Group: "redkey.inditex.dev", Version: "v1", Resource: "redkeyclusters", } @@ -92,7 +92,7 @@ func ScaleRedkeyCluster(ctx context.Context, dc dynamic.Interface, namespace, na func redkeyClusterToUnstructured(rc *redkeyv1.RedkeyCluster) (*unstructured.Unstructured, error) { // Ensure TypeMeta is set rc.TypeMeta = metav1.TypeMeta{ - APIVersion: "redis.inditex.dev/v1", + APIVersion: "redkey.inditex.dev/v1", Kind: "RedkeyCluster", } diff --git a/test/chaos/framework/operator.go b/test/chaos/framework/operator.go index ca60b7f..6b59875 100644 --- a/test/chaos/framework/operator.go +++ b/test/chaos/framework/operator.go @@ -19,7 +19,7 @@ import ( ) const ( - operatorDeploymentName = "redis-operator" + operatorDeploymentName = "redkey-operator" operatorScaleTimeout = 2 * time.Minute defaultOperatorImage = "localhost:5001/redkey-operator:dev" diff --git a/test/chaos/framework/operator_setup.go b/test/chaos/framework/operator_setup.go index 2538ad8..9fecfb7 100644 --- a/test/chaos/framework/operator_setup.go +++ b/test/chaos/framework/operator_setup.go @@ -31,16 +31,16 @@ func EnsureOperatorSetup(ctx context.Context, clientset kubernetes.Interface, na if _, err := clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "leader-election-role", leaderElectionPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { return fmt.Errorf("ensure leader-election-role: %w", err) } - if _, err := 
clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "redis-operator-role", operatorPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("ensure redis-operator-role: %w", err) + if _, err := clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "redkey-operator-role", operatorPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("ensure redkey-operator-role: %w", err) } // Create RoleBindings if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "leader-election-rolebinding", "leader-election-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { return fmt.Errorf("ensure leader-election-rolebinding: %w", err) } - if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "redis-operator-rolebinding", "redis-operator-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { - return fmt.Errorf("ensure redis-operator-rolebinding: %w", err) + if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "redkey-operator-rolebinding", "redkey-operator-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("ensure redkey-operator-rolebinding: %w", err) } // Create ConfigMap @@ -59,7 +59,7 @@ func EnsureOperatorSetup(ctx context.Context, clientset kubernetes.Interface, na func newServiceAccount(ns string) *corev1.ServiceAccount { return &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ - Name: "redis-operator-sa", + Name: "redkey-operator-sa", Namespace: ns, }, } @@ -76,13 +76,13 @@ func newRoleBinding(ns, name, roleName string) *rbacv1.RoleBinding { return &rbacv1.RoleBinding{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: ns}, RoleRef: rbacv1.RoleRef{APIGroup: "rbac.authorization.k8s.io", Kind: "Role", Name: roleName}, - 
Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: "redis-operator-sa", Namespace: ns}}, + Subjects: []rbacv1.Subject{{Kind: "ServiceAccount", Name: "redkey-operator-sa", Namespace: ns}}, } } func newConfigMap(ns string) *corev1.ConfigMap { return &corev1.ConfigMap{ - ObjectMeta: metav1.ObjectMeta{Name: "redis-operator-config", Namespace: ns}, + ObjectMeta: metav1.ObjectMeta{Name: "redkey-operator-config", Namespace: ns}, Data: map[string]string{ "redis_operator_config.yaml": `apiVersion: controller-runtime.sigs.k8s.io/v1alpha1 kind: ControllerManagerConfig @@ -102,7 +102,7 @@ func newOperatorDeployment(ns string) *appsv1.Deployment { replicas := int32(1) return &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: "redis-operator", + Name: "redkey-operator", Namespace: ns, Labels: map[string]string{"control-plane": "redkey-operator"}, }, @@ -122,7 +122,7 @@ func newOperatorDeployment(ns string) *appsv1.Deployment { }, }, Spec: corev1.PodSpec{ - ServiceAccountName: "redis-operator-sa", + ServiceAccountName: "redkey-operator-sa", SecurityContext: &corev1.PodSecurityContext{RunAsNonRoot: ptr.To(true)}, TerminationGracePeriodSeconds: ptr.To(int64(10)), Containers: []corev1.Container{newOperatorContainer(ns)}, @@ -134,7 +134,7 @@ func newOperatorDeployment(ns string) *appsv1.Deployment { func newOperatorContainer(ns string) corev1.Container { return corev1.Container{ - Name: "redis-operator", + Name: "redkey-operator", Image: GetOperatorImage(), Command: []string{"/manager"}, Args: []string{"--leader-elect", "--max-concurrent-reconciles", "10"}, @@ -206,17 +206,17 @@ func operatorPolicyRules() []rbacv1.PolicyRule { Verbs: []string{"create", "delete", "get", "list", "patch", "update", "watch"}, }, { - APIGroups: []string{"redis.inditex.dev"}, + APIGroups: []string{"redkey.inditex.dev"}, Resources: []string{"redkeyclusters"}, Verbs: []string{"create", "delete", "get", "list", "patch", "update", "watch"}, }, { - APIGroups: []string{"redis.inditex.dev"}, + 
APIGroups: []string{"redkey.inditex.dev"}, Resources: []string{"redkeyclusters/finalizers"}, Verbs: []string{"update"}, }, { - APIGroups: []string{"redis.inditex.dev"}, + APIGroups: []string{"redkey.inditex.dev"}, Resources: []string{"redkeyclusters/status"}, Verbs: []string{"get", "patch", "update"}, }, From fc19b9e6e3cf53da036a7dcc802b838e444758e5 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 11 Mar 2026 14:13:42 +0100 Subject: [PATCH 03/20] test: eliminate false positives in chaos tests --- test/chaos/chaos_suite_test.go | 19 ++++++--- test/chaos/framework/k6.go | 30 ++++++++++++- test/chaos/framework/redis_chaos.go | 10 ++++- test/chaos/k6scripts/test-300k.js | 65 +++++++++++++++++++---------- 4 files changed, 92 insertions(+), 32 deletions(-) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index bba3c85..60c2d59 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -110,6 +110,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { deleteCount := rng.Intn(int(newSize)/2) + 1 deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, deleteCount, rng) Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") GinkgoWriter.Printf("Deleted pods: %v\n", deleted) By(fmt.Sprintf("iteration %d: waiting for cluster recovery", iteration)) @@ -152,8 +153,9 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { Expect(framework.DeleteOperatorPod(ctx, k8sClientset, namespace.Name)).To(Succeed()) By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - _, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod 
deletion") By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) @@ -185,11 +187,14 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - _, _ = framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + deletedRobin, err := framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deletedRobin).NotTo(BeEmpty(), "expected at least one robin pod deletion") By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - _, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + deletedRedis, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred()) + Expect(deletedRedis).NotTo(BeEmpty(), "expected at least one redis pod deletion") By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) @@ -228,10 +233,14 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { Expect(framework.DeleteOperatorPod(ctx, k8sClientset, namespace.Name)).To(Succeed()) case 1: By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - _, _ = framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + deleted, err := framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one robin pod deletion") case 2: By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - _, _ = 
framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") case 3: By(fmt.Sprintf("iteration %d: scaling cluster", iteration)) newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) diff --git a/test/chaos/framework/k6.go b/test/chaos/framework/k6.go index 4bafa27..9a59414 100644 --- a/test/chaos/framework/k6.go +++ b/test/chaos/framework/k6.go @@ -9,6 +9,7 @@ import ( "fmt" "io" "os" + "regexp" "strings" "time" @@ -27,8 +28,11 @@ const ( k6StartupTimeout = 2 * time.Minute defaultK6VUs = 10 k6ScriptConfigMap = "k6-scripts" + k6LogTailLines = int64(200) ) +var k6ErrorPattern = regexp.MustCompile(`(?m)\[K6_ERROR\]`) + // GetK6Image returns the k6 image from environment or default. func GetK6Image() string { if img := os.Getenv("K6_IMG"); img != "" { @@ -146,10 +150,21 @@ func WaitForK6JobCompletion(ctx context.Context, clientset kubernetes.Interface, for _, condition := range job.Status.Conditions { if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue { + logs, logsErr := GetK6JobLogs(ctx, clientset, namespace, jobName) + if logsErr != nil { + return false, fmt.Errorf("k6 job completed but logs could not be inspected: %w", logsErr) + } + if k6ErrorPattern.MatchString(logs) { + return false, fmt.Errorf("k6 job completed with application errors: %s", summarizeK6Logs(logs)) + } return true, nil } if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { - return false, fmt.Errorf("k6 job failed: %s", condition.Message) + logs, logsErr := GetK6JobLogs(ctx, clientset, namespace, jobName) + if logsErr != nil { + return false, fmt.Errorf("k6 job failed: %s (log inspection failed: %v)", condition.Message, logsErr) + } + return false, fmt.Errorf("k6 job 
failed: %s; logs: %s", condition.Message, summarizeK6Logs(logs)) } } @@ -198,7 +213,7 @@ func GetK6JobLogs(ctx context.Context, clientset kubernetes.Interface, namespace // Get logs from the first pod using proper log API pod := pods.Items[0] - tailLines := int64(1000) + tailLines := k6LogTailLines opts := &corev1.PodLogOptions{ TailLines: &tailLines, } @@ -259,3 +274,14 @@ func formatDuration(d time.Duration) string { } return fmt.Sprintf("%dh%dm", hours, minutes) } + +func summarizeK6Logs(logs string) string { + trimmed := strings.TrimSpace(logs) + if trimmed == "" { + return "" + } + if len(trimmed) > 1500 { + return trimmed[len(trimmed)-1500:] + } + return trimmed +} diff --git a/test/chaos/framework/redis_chaos.go b/test/chaos/framework/redis_chaos.go index 698906c..9ef8972 100644 --- a/test/chaos/framework/redis_chaos.go +++ b/test/chaos/framework/redis_chaos.go @@ -170,7 +170,10 @@ func CorruptSlotOwnership(ctx context.Context, clientset kubernetes.Interface, n // Delete slot from all nodes for _, pod := range pods.Items { - _, _, _ = RemoteCommand(ctx, namespace, pod.Name, fmt.Sprintf("redis-cli cluster delslots %d", slot)) + stdout, stderr, err := RemoteCommand(ctx, namespace, pod.Name, fmt.Sprintf("redis-cli cluster delslots %d", slot)) + if err != nil { + return fmt.Errorf("failed to delslots %d on %s: %w (stdout=%q stderr=%q)", slot, pod.Name, err, trimNewline(stdout), trimNewline(stderr)) + } } // Assign slot to different nodes inconsistently (first two nodes) @@ -255,7 +258,10 @@ func ForcePrimaryToReplica(ctx context.Context, clientset kubernetes.Interface, } // Delete all slots from the pod - _, _, _ = RemoteCommand(ctx, namespace, podName, "redis-cli cluster flushslots") + stdout, stderr, err := RemoteCommand(ctx, namespace, podName, "redis-cli cluster flushslots") + if err != nil { + return fmt.Errorf("failed to flushslots on %s: %w (stdout=%q stderr=%q)", podName, err, trimNewline(stdout), trimNewline(stderr)) + } // Make it replicate the target 
_, _, err = RemoteCommand(ctx, namespace, podName, fmt.Sprintf("redis-cli cluster replicate %s", targetNodeID)) diff --git a/test/chaos/k6scripts/test-300k.js b/test/chaos/k6scripts/test-300k.js index 6f73c58..54751a4 100644 --- a/test/chaos/k6scripts/test-300k.js +++ b/test/chaos/k6scripts/test-300k.js @@ -7,7 +7,13 @@ import redis from 'k6/x/redis'; import { randomBytes } from 'k6/crypto'; -import { sleep } from 'k6'; +import { check, sleep } from 'k6'; + +export const options = { + thresholds: { + checks: ['rate>0.99'], + }, +}; const client = new redis.Client({ cluster: { @@ -25,31 +31,44 @@ export default function () { const uniqueKey = `mykey_${__VU}_${__ITER}`; const value = generateRandomValue(300000); - // Set the key with a TTL of 30 seconds - client.set(uniqueKey, value, 30); + try { + const setResult = client.set(uniqueKey, value, 30); + const setOk = check(setResult, { + 'redis set succeeds': (result) => result === 'OK', + }); + if (!setOk) { + const message = `[K6_ERROR] set failed for ${uniqueKey}`; + console.error(message); + throw new Error(message); + } - // Randomly delete approximately 1 in 10 keys - if (Math.random() < 0.1) { - client.del(uniqueKey).then((deleted) => { - if (deleted === 1) { - console.log(`Key "${uniqueKey}" deleted.`); - } else { - console.warn(`Failed to delete key "${uniqueKey}".`); + // Randomly delete approximately 1 in 10 keys + if (Math.random() < 0.1) { + const deleted = client.del(uniqueKey); + const deleteOk = check(deleted, { + 'redis delete succeeds': (result) => result === 1, + }); + if (!deleteOk) { + const message = `[K6_ERROR] delete failed for ${uniqueKey}; result=${deleted}`; + console.error(message); + throw new Error(message); } - }).catch((error) => { - console.error(`Error deleting key "${uniqueKey}": ${error}`); - }); - } else { - // Retrieve the value for the key - client.get(uniqueKey).then((retrievedValue) => { - if (retrievedValue === null) { - console.error(`Key "${uniqueKey}" not found!`); - } else 
{ - console.log(`Retrieved value for key "${uniqueKey}": Length=${retrievedValue.length}`); + } else { + const retrievedValue = client.get(uniqueKey); + const getOk = check(retrievedValue, { + 'redis get returns original value': (result) => result === value, + }); + if (!getOk) { + const resultLength = retrievedValue ? retrievedValue.length : 'null'; + const message = `[K6_ERROR] get failed for ${uniqueKey}; length=${resultLength}`; + console.error(message); + throw new Error(message); } - }).catch((error) => { - console.error(`Error retrieving key "${uniqueKey}": ${error}`); - }); + } + } catch (error) { + const message = `[K6_ERROR] iteration failed for ${uniqueKey}: ${error}`; + console.error(message); + throw error; } // Sleep for a short random duration to simulate real-world load patterns From 7958c1aebf307b59f9bf26ee5afd49da3384cb01 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 11 Mar 2026 16:40:05 +0100 Subject: [PATCH 04/20] test: strengthen chaos tests --- test/chaos/chaos_suite_test.go | 4 +-- test/chaos/framework/operator.go | 48 +++++++++++++---------------- test/chaos/framework/readiness.go | 4 +-- test/chaos/framework/redis_chaos.go | 13 +++++++- 4 files changed, 38 insertions(+), 31 deletions(-) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index 60c2d59..b363696 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -150,7 +150,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) - Expect(framework.DeleteOperatorPod(ctx, k8sClientset, namespace.Name)).To(Succeed()) + Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace.Name)).To(Succeed()) By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred()) @@ -230,7 
+230,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { switch action { case 0: By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) - Expect(framework.DeleteOperatorPod(ctx, k8sClientset, namespace.Name)).To(Succeed()) + Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace.Name)).To(Succeed()) case 1: By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) deleted, err := framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) diff --git a/test/chaos/framework/operator.go b/test/chaos/framework/operator.go index 6b59875..d177f65 100644 --- a/test/chaos/framework/operator.go +++ b/test/chaos/framework/operator.go @@ -10,7 +10,7 @@ import ( "os" "time" - corev1 "k8s.io/api/core/v1" + appsv1 "k8s.io/api/apps/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -43,8 +43,9 @@ func ScaleOperatorUp(ctx context.Context, clientset kubernetes.Interface, namesp return scaleDeploymentNative(ctx, clientset, namespace, operatorDeploymentName, 1) } -// DeleteOperatorPod deletes the operator pod (the deployment will recreate it). -func DeleteOperatorPod(ctx context.Context, clientset kubernetes.Interface, namespace string) error { +// DeleteOperatorPods requests deletion of the current operator pods. +// It only verifies that the Kubernetes delete calls succeed. 
+func DeleteOperatorPods(ctx context.Context, clientset kubernetes.Interface, namespace string) error { pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: OperatorPodsSelector(), }) @@ -62,22 +63,7 @@ func DeleteOperatorPod(ctx context.Context, clientset kubernetes.Interface, name } } - // Wait for at least one operator pod to be ready again - return wait.PollUntilContextTimeout(ctx, 2*time.Second, operatorScaleTimeout, true, func(ctx context.Context) (bool, error) { - pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: OperatorPodsSelector(), - }) - if err != nil { - return false, nil - } - - for _, pod := range pods.Items { - if pod.Status.Phase == corev1.PodRunning && isPodReady(&pod) { - return true, nil - } - } - return false, nil - }) + return nil } // scaleDeploymentNative scales a deployment to the desired replica count using native client-go. @@ -108,7 +94,18 @@ func waitForDeploymentScaleDownNative(ctx context.Context, clientset kubernetes. } return false, nil } - return dep.Status.Replicas == 0, nil + + selector, err := deploymentPodSelector(dep) + if err != nil { + return false, err + } + + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector}) + if err != nil { + return false, nil + } + + return dep.Status.Replicas == 0 && dep.Status.ReadyReplicas == 0 && len(pods.Items) == 0, nil }) } @@ -124,11 +121,10 @@ func waitForDeploymentReadyNative(ctx context.Context, clientset kubernetes.Inte } // isPodReady returns true if all containers in the pod are ready. 
-func isPodReady(pod *corev1.Pod) bool { - for _, condition := range pod.Status.Conditions { - if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue { - return true - } +func deploymentPodSelector(dep *appsv1.Deployment) (string, error) { + selector, err := metav1.LabelSelectorAsSelector(dep.Spec.Selector) + if err != nil { + return "", fmt.Errorf("deployment %s has invalid selector: %w", dep.Name, err) } - return false + return selector.String(), nil } diff --git a/test/chaos/framework/readiness.go b/test/chaos/framework/readiness.go index 83365ed..bb11378 100644 --- a/test/chaos/framework/readiness.go +++ b/test/chaos/framework/readiness.go @@ -92,13 +92,13 @@ func clusterCheckPasses(ctx context.Context, namespace, podName string) bool { return !strings.Contains(stdout, "[ERR]") } -// clusterNodesHasFailure checks if any node is in fail state or has migrating slots. +// clusterNodesHasFailure checks if any node is in fail state or has migrating/importing slots. func clusterNodesHasFailure(ctx context.Context, namespace, podName string) bool { stdout, _, err := RemoteCommand(ctx, namespace, podName, "redis-cli cluster nodes") if err != nil { return true } - return strings.Contains(stdout, "fail") || strings.Contains(stdout, "->") + return strings.Contains(stdout, "fail") || strings.Contains(stdout, "->") || strings.Contains(stdout, "<-") } // AssertAllSlotsAssigned verifies that all 16384 slots are assigned. 
diff --git a/test/chaos/framework/redis_chaos.go b/test/chaos/framework/redis_chaos.go index 9ef8972..83b1949 100644 --- a/test/chaos/framework/redis_chaos.go +++ b/test/chaos/framework/redis_chaos.go @@ -131,7 +131,18 @@ func scaleRobinDeploymentNative(ctx context.Context, clientset kubernetes.Interf } return false, nil } - return dep.Status.Replicas == 0, nil + + selector, err := deploymentPodSelector(dep) + if err != nil { + return false, err + } + + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{LabelSelector: selector}) + if err != nil { + return false, nil + } + + return dep.Status.Replicas == 0 && dep.Status.ReadyReplicas == 0 && len(pods.Items) == 0, nil }) } From ee99a86002b9f6743479c43bd6b31fbbde6ce1ec Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 11 Mar 2026 17:24:34 +0100 Subject: [PATCH 05/20] test: chaos test resilence --- test/chaos/chaos_suite_test.go | 21 +++-- test/chaos/framework/k6.go | 32 ++++++-- test/chaos/framework/operator_setup.go | 101 +++++++++++++++++++++---- test/chaos/helpers_test.go | 4 +- 4 files changed, 127 insertions(+), 31 deletions(-) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index b363696..c9c874e 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -70,13 +70,19 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { }) AfterEach(func() { - if CurrentSpecReport().Failed() { + namespaceName := "" + if namespace != nil { + namespaceName = namespace.Name + } + + if CurrentSpecReport().Failed() && namespaceName != "" { collectDiagnostics(namespace.Name) } if k6JobName != "" { - _ = framework.DeleteK6Job(ctx, k8sClientset, namespace.Name, k6JobName) + Expect(namespaceName).NotTo(BeEmpty(), "k6 job cleanup requires a namespace") + Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) } - _ = 
framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace) + Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) }) // ================================================================================== @@ -289,10 +295,15 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun }) AfterEach(func() { - if CurrentSpecReport().Failed() { + namespaceName := "" + if namespace != nil { + namespaceName = namespace.Name + } + + if CurrentSpecReport().Failed() && namespaceName != "" { collectDiagnostics(namespace.Name) } - _ = framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace) + Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) }) // ================================================================================== diff --git a/test/chaos/framework/k6.go b/test/chaos/framework/k6.go index 9a59414..4544608 100644 --- a/test/chaos/framework/k6.go +++ b/test/chaos/framework/k6.go @@ -23,12 +23,11 @@ import ( ) const ( - defaultK6Image = "localhost:5001/redkey-k6:dev" - k6JobTimeout = 30 * time.Minute - k6StartupTimeout = 2 * time.Minute - defaultK6VUs = 10 - k6ScriptConfigMap = "k6-scripts" - k6LogTailLines = int64(200) + defaultK6Image = "localhost:5001/redkey-k6:dev" + k6JobTimeout = 30 * time.Minute + k6StartupTimeout = 2 * time.Minute + defaultK6VUs = 10 + k6LogTailLines = int64(200) ) var k6ErrorPattern = regexp.MustCompile(`(?m)\[K6_ERROR\]`) @@ -185,7 +184,20 @@ func DeleteK6Job(ctx context.Context, clientset kubernetes.Interface, namespace, if errors.IsNotFound(err) { return nil } - return err + if err != nil { + return err + } + + return wait.PollUntilContextTimeout(ctx, time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { + _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, 
metav1.GetOptions{}) + if errors.IsNotFound(err) { + return true, nil + } + if err != nil { + return false, err + } + return false, nil + }) } // GetK6JobLogs returns the logs from the k6 job pod. @@ -232,7 +244,7 @@ func GetK6JobLogs(ctx context.Context, clientset kubernetes.Interface, namespace return buf.String(), nil } -// getRedisHosts returns a comma-separated list of redis host:port for k6. +// getRedisHosts returns stable redis host:port endpoints for k6. func getRedisHosts(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) (string, error) { pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: fmt.Sprintf("redkey-cluster-name=%s,redis.redkeycluster.operator/component=redis", clusterName), @@ -247,6 +259,10 @@ func getRedisHosts(ctx context.Context, clientset kubernetes.Interface, namespac var hosts []string for _, pod := range pods.Items { + if pod.Name != "" { + hosts = append(hosts, fmt.Sprintf("%s.%s.%s.svc:6379", pod.Name, clusterName, namespace)) + continue + } if pod.Status.PodIP != "" { hosts = append(hosts, fmt.Sprintf("%s:6379", pod.Status.PodIP)) } diff --git a/test/chaos/framework/operator_setup.go b/test/chaos/framework/operator_setup.go index 9fecfb7..26a8a54 100644 --- a/test/chaos/framework/operator_setup.go +++ b/test/chaos/framework/operator_setup.go @@ -22,40 +22,109 @@ import ( // EnsureOperatorSetup creates the ServiceAccount, RBAC, ConfigMap and Deployment for the operator. 
func EnsureOperatorSetup(ctx context.Context, clientset kubernetes.Interface, namespace string) error { - // Create ServiceAccount - if _, err := clientset.CoreV1().ServiceAccounts(namespace).Create(ctx, newServiceAccount(namespace), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureServiceAccount(ctx, clientset, newServiceAccount(namespace)); err != nil { return fmt.Errorf("ensure ServiceAccount: %w", err) } - - // Create Roles - if _, err := clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "leader-election-role", leaderElectionPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureRole(ctx, clientset, newRole(namespace, "leader-election-role", leaderElectionPolicyRules())); err != nil { return fmt.Errorf("ensure leader-election-role: %w", err) } - if _, err := clientset.RbacV1().Roles(namespace).Create(ctx, newRole(namespace, "redkey-operator-role", operatorPolicyRules()), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureRole(ctx, clientset, newRole(namespace, "redkey-operator-role", operatorPolicyRules())); err != nil { return fmt.Errorf("ensure redkey-operator-role: %w", err) } - - // Create RoleBindings - if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "leader-election-rolebinding", "leader-election-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureRoleBinding(ctx, clientset, newRoleBinding(namespace, "leader-election-rolebinding", "leader-election-role")); err != nil { return fmt.Errorf("ensure leader-election-rolebinding: %w", err) } - if _, err := clientset.RbacV1().RoleBindings(namespace).Create(ctx, newRoleBinding(namespace, "redkey-operator-rolebinding", "redkey-operator-role"), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureRoleBinding(ctx, clientset, 
newRoleBinding(namespace, "redkey-operator-rolebinding", "redkey-operator-role")); err != nil { return fmt.Errorf("ensure redkey-operator-rolebinding: %w", err) } - - // Create ConfigMap - if _, err := clientset.CoreV1().ConfigMaps(namespace).Create(ctx, newConfigMap(namespace), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureConfigMap(ctx, clientset, newConfigMap(namespace)); err != nil { return fmt.Errorf("ensure ConfigMap: %w", err) } - - // Create Deployment - if _, err := clientset.AppsV1().Deployments(namespace).Create(ctx, newOperatorDeployment(namespace), metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + if err := ensureDeployment(ctx, clientset, newOperatorDeployment(namespace)); err != nil { return fmt.Errorf("ensure Deployment: %w", err) } return nil } +func ensureServiceAccount(ctx context.Context, clientset kubernetes.Interface, desired *corev1.ServiceAccount) error { + existing, err := clientset.CoreV1().ServiceAccounts(desired.Namespace).Get(ctx, desired.Name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + _, err = clientset.CoreV1().ServiceAccounts(desired.Namespace).Create(ctx, desired, metav1.CreateOptions{}) + return err + } + return err + } + + desired.ResourceVersion = existing.ResourceVersion + desired.Secrets = existing.Secrets + desired.ImagePullSecrets = existing.ImagePullSecrets + desired.AutomountServiceAccountToken = existing.AutomountServiceAccountToken + _, err = clientset.CoreV1().ServiceAccounts(desired.Namespace).Update(ctx, desired, metav1.UpdateOptions{}) + return err +} + +func ensureRole(ctx context.Context, clientset kubernetes.Interface, desired *rbacv1.Role) error { + existing, err := clientset.RbacV1().Roles(desired.Namespace).Get(ctx, desired.Name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + _, err = clientset.RbacV1().Roles(desired.Namespace).Create(ctx, desired, metav1.CreateOptions{}) + return err 
+ } + return err + } + + desired.ResourceVersion = existing.ResourceVersion + _, err = clientset.RbacV1().Roles(desired.Namespace).Update(ctx, desired, metav1.UpdateOptions{}) + return err +} + +func ensureRoleBinding(ctx context.Context, clientset kubernetes.Interface, desired *rbacv1.RoleBinding) error { + existing, err := clientset.RbacV1().RoleBindings(desired.Namespace).Get(ctx, desired.Name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + _, err = clientset.RbacV1().RoleBindings(desired.Namespace).Create(ctx, desired, metav1.CreateOptions{}) + return err + } + return err + } + + desired.ResourceVersion = existing.ResourceVersion + _, err = clientset.RbacV1().RoleBindings(desired.Namespace).Update(ctx, desired, metav1.UpdateOptions{}) + return err +} + +func ensureConfigMap(ctx context.Context, clientset kubernetes.Interface, desired *corev1.ConfigMap) error { + existing, err := clientset.CoreV1().ConfigMaps(desired.Namespace).Get(ctx, desired.Name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + _, err = clientset.CoreV1().ConfigMaps(desired.Namespace).Create(ctx, desired, metav1.CreateOptions{}) + return err + } + return err + } + + desired.ResourceVersion = existing.ResourceVersion + _, err = clientset.CoreV1().ConfigMaps(desired.Namespace).Update(ctx, desired, metav1.UpdateOptions{}) + return err +} + +func ensureDeployment(ctx context.Context, clientset kubernetes.Interface, desired *appsv1.Deployment) error { + existing, err := clientset.AppsV1().Deployments(desired.Namespace).Get(ctx, desired.Name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + _, err = clientset.AppsV1().Deployments(desired.Namespace).Create(ctx, desired, metav1.CreateOptions{}) + return err + } + return err + } + + desired.ResourceVersion = existing.ResourceVersion + _, err = clientset.AppsV1().Deployments(desired.Namespace).Update(ctx, desired, metav1.UpdateOptions{}) + return err +} + func 
newServiceAccount(ns string) *corev1.ServiceAccount { return &corev1.ServiceAccount{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index c45961e..812c7df 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -21,12 +21,12 @@ func startK6OrFail(namespace, clusterName string, duration time.Duration, vus in return jobName } -// cleanupK6Job safely deletes a k6 job, ignoring errors. +// cleanupK6Job deletes a k6 job and fails the spec if cleanup fails. func cleanupK6Job(namespace, jobName string) { if jobName == "" { return } - _ = framework.DeleteK6Job(ctx, k8sClientset, namespace, jobName) + Expect(framework.DeleteK6Job(ctx, k8sClientset, namespace, jobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", jobName, namespace) } // chaosLoop runs a chaos function repeatedly until the duration expires. From 20b12814f69886de0c824f63bc3ad4ceafee5a78 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Tue, 17 Mar 2026 09:13:28 +0100 Subject: [PATCH 06/20] ci: improving Makefile --- Makefile | 13 +------------ scripts/report-test.py | 9 ++++++--- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index db189cf..4fdde78 100644 --- a/Makefile +++ b/Makefile @@ -619,15 +619,4 @@ test-chaos: process-manifests-crd ginkgo k6-push ## Execute chaos tests --json-report=$(CHAOS_TEST_OUTPUT) \ $(GINKGO_OPTS) \ --procs=1 \ - $(CHAOS_PACKAGES) - -.PHONY: test-chaos-focus -test-chaos-focus: process-manifests-crd ginkgo k6-push ## Run specific chaos test by name - $(info $(M) running focused chaos test: $(FOCUS)) - $(GINKGO_ENV) K6_IMG=$(K6_IMG) CHAOS_DURATION=$(CHAOS_DURATION) \ - $(if $(CHAOS_SEED),CHAOS_SEED=$(CHAOS_SEED),) \ - ginkgo \ - --timeout=$(CHAOS_TIMEOUT) \ - --focus="$(FOCUS)" \ - --procs=1 \ - $(CHAOS_PACKAGES) + $(GINKGO_OPTS) $(CHAOS_PACKAGES) diff --git a/scripts/report-test.py b/scripts/report-test.py index dc8f0c1..1b10063 100755 --- 
a/scripts/report-test.py +++ b/scripts/report-test.py @@ -35,10 +35,13 @@ def generate_md_report(data: dict) -> str: specs = suite["SpecReports"] passed = sum(1 for s in specs if s.get("State") == "passed" and s.get("LeafNodeType") == "It") failed = sum(1 for s in specs if s.get("State") == "failed") + skipped = sum(1 for s in specs if s.get("State") == "skipped" and s.get("LeafNodeType") == "It") total = suite["PreRunStats"]["TotalSpecs"] status = "PASS" if suite["SuiteSucceeded"] else "FAIL" lines.append(f"## {status}: {suite['SuiteDescription']} ({passed}/{total} passed)") + if skipped: + lines.append(f"- Skipped: {skipped}") lines.append("") for spec in specs: @@ -59,11 +62,11 @@ def generate_md_report(data: dict) -> str: # Build test name from hierarchy + text full_name = " > ".join(hierarchy + [text]) if hierarchy else text - status = "PASS" if state in ("passed", "skipped") else "FAIL" - - if status == "PASS": # We do not want PASS tests + if state == "passed": continue + status = "SKIP" if state == "skipped" else "FAIL" + lines.append(f"### {status}: {escape_md(full_name)}") lines.append("") From bdd58a564acdaa91631cd32ee1f50c817a49cc36 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Tue, 17 Mar 2026 09:35:43 +0100 Subject: [PATCH 07/20] ci: improving Makefile --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index 4fdde78..b175ae3 100644 --- a/Makefile +++ b/Makefile @@ -617,6 +617,4 @@ test-chaos: process-manifests-crd ginkgo k6-push ## Execute chaos tests ginkgo \ --timeout=$(CHAOS_TIMEOUT) \ --json-report=$(CHAOS_TEST_OUTPUT) \ - $(GINKGO_OPTS) \ - --procs=1 \ $(GINKGO_OPTS) $(CHAOS_PACKAGES) From 0a424b274426f28ac5aefd826f4933f29a003f3f Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Tue, 17 Mar 2026 14:31:54 +0100 Subject: [PATCH 08/20] test: improving testing --- test/chaos/chaos_suite_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go 
index c9c874e..daf77d4 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -331,6 +331,7 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun By("waiting for cluster to heal") Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) }) // ================================================================================== @@ -388,6 +389,7 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun By("waiting for cluster to heal") Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) }) }) From 70e72bf65461cac06e968bc675cd9b27ab2973c9 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 18 Mar 2026 10:17:37 +0100 Subject: [PATCH 09/20] test: fixing chaos tests --- test/chaos/chaos_suite_test.go | 21 +++++++++++----- test/chaos/framework/k6.go | 15 +++--------- test/chaos/framework/redis_chaos.go | 38 ++++++++++++++++++++++++++--- test/chaos/k6scripts/test-300k.js | 8 +++--- 4 files changed, 58 insertions(+), 24 deletions(-) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index daf77d4..e379de3 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -86,7 +86,8 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { }) // ================================================================================== - // Scenario 1: Continuous Chaos Under 
Load + // Scenario 1: Continuous Scaling Under Load and Chaos (PurgeKeysOnRebalance=true) + // PurgeKeysOnRebalance=true --> the StatefulSet is recreated when scaling // ================================================================================== It("survives continuous scaling and pod deletion while handling traffic", func() { By("starting k6 load job") @@ -106,11 +107,19 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, newSize)).To(Succeed()) - // Poll for StatefulSet to acknowledge the scale instead of fixed sleep - Eventually(func() int32 { - replicas, _ := framework.GetStatefulSetReplicas(ctx, k8sClientset, namespace.Name, clusterName) - return replicas - }, scaleAckTimeout, scalePollInterval).Should(Equal(newSize)) + // Poll for StatefulSet to acknowledge the scale and pods to exist. + // During fast scaling (PurgeKeysOnRebalance=true), the operator deletes and + // recreates the StatefulSet, so we must wait for pods to actually exist + // before attempting to delete them. 
+ Eventually(func() int { + pods, err := k8sClientset.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{ + LabelSelector: framework.RedisPodsSelector(clusterName), + }) + if err != nil { + return 0 + } + return len(pods.Items) + }, scaleAckTimeout, scalePollInterval).Should(BeNumerically(">=", int(newSize))) By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) deleteCount := rng.Intn(int(newSize)/2) + 1 diff --git a/test/chaos/framework/k6.go b/test/chaos/framework/k6.go index 4544608..49bf8a3 100644 --- a/test/chaos/framework/k6.go +++ b/test/chaos/framework/k6.go @@ -9,7 +9,6 @@ import ( "fmt" "io" "os" - "regexp" "strings" "time" @@ -30,8 +29,6 @@ const ( k6LogTailLines = int64(200) ) -var k6ErrorPattern = regexp.MustCompile(`(?m)\[K6_ERROR\]`) - // GetK6Image returns the k6 image from environment or default. func GetK6Image() string { if img := os.Getenv("K6_IMG"); img != "" { @@ -149,13 +146,9 @@ func WaitForK6JobCompletion(ctx context.Context, clientset kubernetes.Interface, for _, condition := range job.Status.Conditions { if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue { - logs, logsErr := GetK6JobLogs(ctx, clientset, namespace, jobName) - if logsErr != nil { - return false, fmt.Errorf("k6 job completed but logs could not be inspected: %w", logsErr) - } - if k6ErrorPattern.MatchString(logs) { - return false, fmt.Errorf("k6 job completed with application errors: %s", summarizeK6Logs(logs)) - } + // k6 exited 0, meaning all thresholds (including checks rate>0.99) passed. + // Transient [K6_ERROR] entries are expected during chaos scenarios + // (pod deletions, scaling) and are already accounted for by the threshold. 
return true, nil } if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { @@ -260,7 +253,7 @@ func getRedisHosts(ctx context.Context, clientset kubernetes.Interface, namespac var hosts []string for _, pod := range pods.Items { if pod.Name != "" { - hosts = append(hosts, fmt.Sprintf("%s.%s.%s.svc:6379", pod.Name, clusterName, namespace)) + hosts = append(hosts, fmt.Sprintf("%s.%s.%s.svc.cluster.local:6379", pod.Name, clusterName, namespace)) continue } if pod.Status.PodIP != "" { diff --git a/test/chaos/framework/redis_chaos.go b/test/chaos/framework/redis_chaos.go index 83b1949..0f090b4 100644 --- a/test/chaos/framework/redis_chaos.go +++ b/test/chaos/framework/redis_chaos.go @@ -8,6 +8,7 @@ import ( "context" "fmt" "math/rand" + "strings" "time" "k8s.io/apimachinery/pkg/api/errors" @@ -20,9 +21,25 @@ import ( ) const ( - robinScaleTimeout = 2 * time.Minute + robinScaleTimeout = 2 * time.Minute + scaleRetryTimeout = 2 * time.Minute + scaleRetryInterval = 3 * time.Second ) +// isNotReadyValidationError returns true when the Kubernetes API rejects a +// mutation because the cluster status is not 'Ready'. This is a transient +// condition during chaos: the cluster will eventually settle to Ready, at +// which point the scale request will be accepted. +func isNotReadyValidationError(err error) bool { + if err == nil { + return false + } + if !errors.IsInvalid(err) { + return false + } + return strings.Contains(err.Error(), "Changing the number of primaries is not allowed unless the cluster is in 'Ready' status") +} + // DeleteRandomRedisPods deletes N random redis pods from the cluster. 
func DeleteRandomRedisPods(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, count int, rng *rand.Rand) ([]string, error) { pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ @@ -89,9 +106,24 @@ func DeleteRobinPods(ctx context.Context, clientset kubernetes.Interface, namesp } // ScaleCluster scales the Redis cluster to the specified number of primaries. +// It retries on conflict errors and on webhook validation errors that reject +// the change because the cluster is not yet in 'Ready' status, which is a +// transient condition during chaos operations. func ScaleCluster(ctx context.Context, dc dynamic.Interface, namespace, clusterName string, primaries int32) error { - return retry.RetryOnConflict(retry.DefaultRetry, func() error { - return ScaleRedkeyCluster(ctx, dc, namespace, clusterName, primaries) + return wait.PollUntilContextTimeout(ctx, scaleRetryInterval, scaleRetryTimeout, true, func(ctx context.Context) (bool, error) { + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + return ScaleRedkeyCluster(ctx, dc, namespace, clusterName, primaries) + }) + if err == nil { + return true, nil + } + // If the webhook rejected the update because the cluster is not Ready, + // keep polling—the operator will eventually reconcile the cluster back + // to Ready status. 
+ if isNotReadyValidationError(err) { + return false, nil + } + return false, err }) } diff --git a/test/chaos/k6scripts/test-300k.js b/test/chaos/k6scripts/test-300k.js index 54751a4..97f318b 100644 --- a/test/chaos/k6scripts/test-300k.js +++ b/test/chaos/k6scripts/test-300k.js @@ -27,12 +27,12 @@ function generateRandomValue(maxBytes) { return randomBytes(size).toString('base64'); } -export default function () { +export default async function () { const uniqueKey = `mykey_${__VU}_${__ITER}`; const value = generateRandomValue(300000); try { - const setResult = client.set(uniqueKey, value, 30); + const setResult = await client.set(uniqueKey, value, 30); const setOk = check(setResult, { 'redis set succeeds': (result) => result === 'OK', }); @@ -44,7 +44,7 @@ export default function () { // Randomly delete approximately 1 in 10 keys if (Math.random() < 0.1) { - const deleted = client.del(uniqueKey); + const deleted = await client.del(uniqueKey); const deleteOk = check(deleted, { 'redis delete succeeds': (result) => result === 1, }); @@ -54,7 +54,7 @@ export default function () { throw new Error(message); } } else { - const retrievedValue = client.get(uniqueKey); + const retrievedValue = await client.get(uniqueKey); const getOk = check(retrievedValue, { 'redis get returns original value': (result) => result === value, }); From f5e7bd3c84db27211b05e7cfa97797e3f994fd51 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 18 Mar 2026 12:28:05 +0100 Subject: [PATCH 10/20] test: fixing chaos tests --- test/chaos/chaos_suite_test.go | 22 +++------------------- test/chaos/framework/cluster.go | 25 +++++++++++++++++++++++++ test/chaos/framework/redis_chaos.go | 21 ++++++++------------- test/chaos/helpers_test.go | 13 ++++--------- 4 files changed, 40 insertions(+), 41 deletions(-) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index e379de3..163c29b 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -107,19 
+107,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, newSize)).To(Succeed()) - // Poll for StatefulSet to acknowledge the scale and pods to exist. - // During fast scaling (PurgeKeysOnRebalance=true), the operator deletes and - // recreates the StatefulSet, so we must wait for pods to actually exist - // before attempting to delete them. - Eventually(func() int { - pods, err := k8sClientset.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{ - LabelSelector: framework.RedisPodsSelector(clusterName), - }) - if err != nil { - return 0 - } - return len(pods.Items) - }, scaleAckTimeout, scalePollInterval).Should(BeNumerically(">=", int(newSize))) + Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace.Name, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed()) By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) deleteCount := rng.Intn(int(newSize)/2) + 1 @@ -202,9 +190,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - deletedRobin, err := framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deletedRobin).NotTo(BeEmpty(), "expected at least one robin pod deletion") + Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) deletedRedis, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) @@ -248,9 +234,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace.Name)).To(Succeed()) case 1: 
By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - deleted, err := framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one robin pod deletion") + Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) case 2: By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) diff --git a/test/chaos/framework/cluster.go b/test/chaos/framework/cluster.go index feacc7f..56cab7b 100644 --- a/test/chaos/framework/cluster.go +++ b/test/chaos/framework/cluster.go @@ -69,6 +69,31 @@ func GetStatefulSetReplicas(ctx context.Context, clientset kubernetes.Interface, return *sts.Spec.Replicas, nil } +// WaitForScaleAck polls until the StatefulSet has the expected replica count +// and at least that many pods exist. During fast scaling +// (PurgeKeysOnRebalance=true), the operator may delete and recreate the +// StatefulSet, so both conditions must be met before the caller can safely +// interact with the pods. +func WaitForScaleAck(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, expectedReplicas int32, timeout, interval time.Duration) error { + return wait.PollUntilContextTimeout(ctx, interval, timeout, true, func(ctx context.Context) (bool, error) { + replicas, err := GetStatefulSetReplicas(ctx, clientset, namespace, clusterName) + if err != nil { + return false, nil + } + if replicas != expectedReplicas { + return false, nil + } + + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: RedisPodsSelector(clusterName), + }) + if err != nil { + return false, nil + } + return int32(len(pods.Items)) >= expectedReplicas, nil + }) +} + // buildRedkeyCluster constructs a RedkeyCluster object with the given parameters. 
func buildRedkeyCluster( key types.NamespacedName, diff --git a/test/chaos/framework/redis_chaos.go b/test/chaos/framework/redis_chaos.go index 0f090b4..8e0e3f2 100644 --- a/test/chaos/framework/redis_chaos.go +++ b/test/chaos/framework/redis_chaos.go @@ -77,32 +77,27 @@ func DeleteRandomRedisPods(ctx context.Context, clientset kubernetes.Interface, return deleted, nil } -// DeleteRobinPods deletes N random robin pods from the cluster. -func DeleteRobinPods(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, count int, rng *rand.Rand) ([]string, error) { +// DeleteRobinPods requests deletion of all robin pods for the cluster. +// It only verifies that the Kubernetes delete calls succeed. +func DeleteRobinPods(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string) error { pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: RobinPodsSelector(clusterName), }) if err != nil { - return nil, fmt.Errorf("failed to list robin pods: %w", err) + return fmt.Errorf("failed to list robin pods: %w", err) } if len(pods.Items) == 0 { - return nil, nil + return nil } - // Shuffle and pick N pods - indices := rng.Perm(len(pods.Items)) - - var deleted []string - for i := 0; i < count && i < len(indices); i++ { - pod := pods.Items[indices[i]] + for _, pod := range pods.Items { if err := clientset.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) { - return deleted, fmt.Errorf("failed to delete robin pod %s: %w", pod.Name, err) + return fmt.Errorf("failed to delete robin pod %s: %w", pod.Name, err) } - deleted = append(deleted, pod.Name) } - return deleted, nil + return nil } // ScaleCluster scales the Redis cluster to the specified number of primaries. 
diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index 812c7df..e16ac3d 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -60,14 +60,9 @@ func verifyK6Completed(namespace, jobName string, timeout time.Duration) { Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace, jobName, timeout)).To(Succeed()) } -// waitForStatefulSetReplicas polls until the StatefulSet has the expected replica count. +// waitForStatefulSetReplicas polls until the StatefulSet has the expected replica +// count and at least that many pods exist. func waitForStatefulSetReplicas(namespace, clusterName string, expectedReplicas int32) { - Eventually(func() int32 { - replicas, err := framework.GetStatefulSetReplicas(ctx, k8sClientset, namespace, clusterName) - if err != nil { - return -1 - } - return replicas - }, scaleAckTimeout, scalePollInterval).Should(Equal(expectedReplicas), - fmt.Sprintf("StatefulSet should have %d replicas", expectedReplicas)) + Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, expectedReplicas, scaleAckTimeout, scalePollInterval)).To(Succeed(), + fmt.Sprintf("StatefulSet should have %d replicas with pods", expectedReplicas)) } From 16b3715b0b659ccf126fd15f7446866277129377 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Wed, 18 Mar 2026 16:10:13 +0100 Subject: [PATCH 11/20] test: add no-purge-keys tests --- test/chaos/chaos_suite_test.go | 240 +++++++++++++------------------- test/chaos/framework/cluster.go | 6 +- test/chaos/helpers_test.go | 168 ++++++++++++++++++++++ test/chaos/suite_test.go | 21 +-- 4 files changed, 278 insertions(+), 157 deletions(-) diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index 163c29b..8e35356 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -38,7 +38,7 @@ const ( defaultVUs = 10 // Number of virtual users for k6 load tests ) -var _ = Describe("Chaos Under Load", Label("chaos", "load"), 
func() { +var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", "load"), func() { var ( namespace *corev1.Namespace k6JobName string @@ -63,7 +63,7 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { }, operatorReadyTimeout, operatorPollInterval).Should(BeTrue()) By("creating Redis cluster with 5 primaries") - Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries)).To(Succeed()) + Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries, true)).To(Succeed()) By("waiting for cluster to be ready") Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) @@ -82,7 +82,11 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { Expect(namespaceName).NotTo(BeEmpty(), "k6 job cleanup requires a namespace") Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) } - Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) + if skipDeleteNamespace && CurrentSpecReport().Failed() { + GinkgoWriter.Printf("CHAOS_SKIP_DELETE_NAMESPACE is set and spec failed — preserving namespace %s for inspection\n", namespaceName) + } else { + Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) + } }) // ================================================================================== @@ -90,174 +94,112 @@ var _ = Describe("Chaos Under Load", Label("chaos", "load"), func() { // PurgeKeysOnRebalance=true --> the StatefulSet is recreated when scaling // ================================================================================== It("survives continuous scaling and pod deletion 
while handling traffic", func() { - By("starting k6 load job") - var err error - k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) - Expect(err).NotTo(HaveOccurred()) - - By("executing chaos loop") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) - - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) - - By(fmt.Sprintf("iteration %d: scaling cluster up", iteration)) - newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) - Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, newSize)).To(Succeed()) - - Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace.Name, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed()) - - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - deleteCount := rng.Intn(int(newSize)/2) + 1 - deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, deleteCount, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") - GinkgoWriter.Printf("Deleted pods: %v\n", deleted) - - By(fmt.Sprintf("iteration %d: waiting for cluster recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - - By(fmt.Sprintf("iteration %d: scaling cluster down", iteration)) - downSize := int32(rng.Intn(3) + minPrimaries) - Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, downSize)).To(Succeed()) - - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - } - - By("verifying final cluster state") - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, 
chaosReadyTimeout)).To(Succeed()) - Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) - Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) - - By("verifying k6 job completed successfully") - Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + k6JobName = runScalingChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 2: Chaos with Operator Deletion // ================================================================================== It("recovers when operator pod is deleted during chaos", func() { - By("starting k6 load job") - var err error - k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) - Expect(err).NotTo(HaveOccurred()) - - By("executing chaos with operator deletion") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) - - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) - - By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) - Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace.Name)).To(Succeed()) - - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") - - By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - - // Rate limit between iterations - time.Sleep(chaosRateLimitDelay) - } - - By("verifying final cluster state") - 
Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + k6JobName = runOperatorDeletionChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 3: Chaos with Robin Deletion // ================================================================================== It("recovers when robin pods are deleted during chaos", func() { - By("starting k6 load job") + k6JobName = runRobinDeletionChaos(rng, namespace.Name, clusterName) + }) + + // ================================================================================== + // Scenario 4: Full Chaos (Operator + Robin + Redis) + // ================================================================================== + It("recovers from full chaos deleting operator, robin, and redis pods", func() { + k6JobName = runFullChaos(rng, namespace.Name, clusterName) + }) +}) + +// ====================================================================================== +// Chaos Under Load (NoPurge) — same scenarios with PurgeKeysOnRebalance=false +// PurgeKeysOnRebalance=false --> the StatefulSet is updated in place when scaling +// ====================================================================================== +var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=false)", Label("chaos", "load", "nopurge"), func() { + var ( + namespace *corev1.Namespace + k6JobName string + rng *rand.Rand + ) + + BeforeEach(func() { var err error - k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) - Expect(err).NotTo(HaveOccurred()) - By("executing chaos with robin deletion") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) + rng = rand.New(rand.NewSource(chaosSeed)) + 
GinkgoWriter.Printf("Using random seed: %d\n", chaosSeed) - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + namespace, err = framework.CreateNamespace(ctx, k8sClientset, fmt.Sprintf("chaos-np-%d", GinkgoParallelProcess())) + Expect(err).NotTo(HaveOccurred(), "failed to create namespace") - By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) + By("deploying operator in namespace") + Expect(framework.EnsureOperatorSetup(ctx, k8sClientset, namespace.Name)).To(Succeed()) - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - deletedRedis, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deletedRedis).NotTo(BeEmpty(), "expected at least one redis pod deletion") + Eventually(func() bool { + dep, err := k8sClientset.AppsV1().Deployments(namespace.Name).Get(ctx, "redkey-operator", metav1.GetOptions{}) + return err == nil && dep.Status.AvailableReplicas >= 1 + }, operatorReadyTimeout, operatorPollInterval).Should(BeTrue()) + + By("creating Redis cluster with 5 primaries (PurgeKeysOnRebalance=false)") + Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries, false)).To(Succeed()) - By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + By("waiting for cluster to be ready") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) + }) - // Rate limit between iterations - time.Sleep(chaosRateLimitDelay) + AfterEach(func() { + namespaceName := "" + if namespace != nil { + namespaceName = namespace.Name } - 
By("verifying final cluster state") - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + if CurrentSpecReport().Failed() && namespaceName != "" { + collectDiagnostics(namespace.Name) + } + if k6JobName != "" { + Expect(namespaceName).NotTo(BeEmpty(), "k6 job cleanup requires a namespace") + Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) + } + if skipDeleteNamespace && CurrentSpecReport().Failed() { + GinkgoWriter.Printf("CHAOS_SKIP_DELETE_NAMESPACE is set and spec failed — preserving namespace %s for inspection\n", namespaceName) + } else { + Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) + } }) // ================================================================================== - // Scenario 4: Full Chaos (Operator + Robin + Redis) + // Scenario 1 (NoPurge): Continuous Scaling Under Load and Chaos // ================================================================================== - It("recovers from full chaos deleting operator, robin, and redis pods", func() { - By("starting k6 load job") - var err error - k6JobName, err = framework.StartK6LoadJob(ctx, k8sClientset, namespace.Name, clusterName, chaosDuration, defaultVUs) - Expect(err).NotTo(HaveOccurred()) - - By("executing full chaos") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) - - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Full chaos iteration %d ===\n", iteration) - - action := rng.Intn(4) - - switch action { - case 0: - By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) - 
Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace.Name)).To(Succeed()) - case 1: - By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) - case 2: - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) - deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace.Name, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") - case 3: - By(fmt.Sprintf("iteration %d: scaling cluster", iteration)) - newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) - Expect(framework.ScaleCluster(ctx, dynamicClient, namespace.Name, clusterName, newSize)).To(Succeed()) - } - - By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - - // Rate limit between chaos actions - time.Sleep(chaosIterationDelay) - } + It("survives continuous scaling and pod deletion while handling traffic without purge", func() { + k6JobName = runScalingChaos(rng, namespace.Name, clusterName) + }) - By("verifying final cluster state") - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) - Expect(framework.AssertAllSlotsAssigned(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) - Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace.Name, clusterName)).To(Succeed()) - Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace.Name, k6JobName, chaosDuration+k6CompletionBuffer)).To(Succeed()) + // ================================================================================== + // Scenario 2 (NoPurge): Chaos with Operator Deletion + // 
================================================================================== + It("recovers when operator pod is deleted during chaos without purge", func() { + k6JobName = runOperatorDeletionChaos(rng, namespace.Name, clusterName) + }) + + // ================================================================================== + // Scenario 3 (NoPurge): Chaos with Robin Deletion + // ================================================================================== + It("recovers when robin pods are deleted during chaos without purge", func() { + k6JobName = runRobinDeletionChaos(rng, namespace.Name, clusterName) + }) + + // ================================================================================== + // Scenario 4 (NoPurge): Full Chaos (Operator + Robin + Redis) + // ================================================================================== + It("recovers from full chaos deleting operator, robin, and redis pods without purge", func() { + k6JobName = runFullChaos(rng, namespace.Name, clusterName) }) }) @@ -281,7 +223,7 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun }, operatorReadyTimeout, operatorPollInterval).Should(BeTrue()) By("creating Redis cluster with 5 primaries") - Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries)).To(Succeed()) + Expect(framework.CreateRedkeyCluster(ctx, dynamicClient, namespace.Name, clusterName, defaultPrimaries, true)).To(Succeed()) By("waiting for cluster to be ready") Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace.Name, clusterName, chaosReadyTimeout)).To(Succeed()) @@ -296,7 +238,11 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun if CurrentSpecReport().Failed() && namespaceName != "" { collectDiagnostics(namespace.Name) } - Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", 
namespaceName) + if skipDeleteNamespace && CurrentSpecReport().Failed() { + GinkgoWriter.Printf("CHAOS_SKIP_DELETE_NAMESPACE is set and spec failed — preserving namespace %s for inspection\n", namespaceName) + } else { + Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) + } }) // ================================================================================== diff --git a/test/chaos/framework/cluster.go b/test/chaos/framework/cluster.go index 56cab7b..5a35931 100644 --- a/test/chaos/framework/cluster.go +++ b/test/chaos/framework/cluster.go @@ -51,9 +51,11 @@ func GetRobinImage() string { } // CreateRedkeyCluster creates a RedkeyCluster CR using dynamic client. -func CreateRedkeyCluster(ctx context.Context, dc dynamic.Interface, namespace, name string, primaries int32) error { +// When purgeKeys is true the operator deletes and recreates the StatefulSet on +// scaling; when false the StatefulSet is updated in place. +func CreateRedkeyCluster(ctx context.Context, dc dynamic.Interface, namespace, name string, primaries int32, purgeKeys bool) error { key := types.NamespacedName{Namespace: namespace, Name: name} - rc := buildRedkeyCluster(key, primaries, 0, "", GetRedisImage(), true, true, redkeyv1.Pdb{}, redkeyv1.RedkeyClusterOverrideSpec{}) + rc := buildRedkeyCluster(key, primaries, 0, "", GetRedisImage(), purgeKeys, true, redkeyv1.Pdb{}, redkeyv1.RedkeyClusterOverrideSpec{}) return EnsureRedkeyCluster(ctx, dc, rc) } diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index e16ac3d..3e0cf4e 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -6,6 +6,7 @@ package chaos import ( "fmt" + "math/rand" "time" . 
"github.com/onsi/ginkgo/v2" @@ -66,3 +67,170 @@ func waitForStatefulSetReplicas(namespace, clusterName string, expectedReplicas Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, expectedReplicas, scaleAckTimeout, scalePollInterval)).To(Succeed(), fmt.Sprintf("StatefulSet should have %d replicas with pods", expectedReplicas)) } + +// --------------------------------------------------------------------------- +// Shared chaos scenario bodies +// --------------------------------------------------------------------------- +// Each function runs the full chaos scenario (start k6, chaos loop, verify) +// and returns the k6 job name so the caller can clean it up. + +// runScalingChaos runs the continuous-scaling-and-pod-deletion scenario. +func runScalingChaos(rng *rand.Rand, namespace, clusterName string) string { + By("starting k6 load job") + k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + + By("executing chaos loop") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + + By(fmt.Sprintf("iteration %d: scaling cluster up", iteration)) + newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed()) + + Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed()) + + By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + deleteCount := rng.Intn(int(newSize)/2) + 1 + deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, deleteCount, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") + GinkgoWriter.Printf("Deleted pods: %v\n", deleted) + + By(fmt.Sprintf("iteration %d: 
waiting for cluster recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + + By(fmt.Sprintf("iteration %d: scaling cluster down", iteration)) + downSize := int32(rng.Intn(3) + minPrimaries) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, downSize)).To(Succeed()) + + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + } + + By("verifying final cluster state") + verifyClusterHealthy(namespace, clusterName) + verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) + + return k6JobName +} + +// runOperatorDeletionChaos runs the operator-pod-deletion scenario. +func runOperatorDeletionChaos(rng *rand.Rand, namespace, clusterName string) string { + By("starting k6 load job") + k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + + By("executing chaos with operator deletion") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + + By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) + Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed()) + + By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") + + By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + + // Rate limit between iterations + time.Sleep(chaosRateLimitDelay) + } + + By("verifying final cluster state") + 
Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) + + return k6JobName +} + +// runRobinDeletionChaos runs the robin-pod-deletion scenario. +func runRobinDeletionChaos(rng *rand.Rand, namespace, clusterName string) string { + By("starting k6 load job") + k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + + By("executing chaos with robin deletion") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + + By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) + Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) + + By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + deletedRedis, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deletedRedis).NotTo(BeEmpty(), "expected at least one redis pod deletion") + + By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + + // Rate limit between iterations + time.Sleep(chaosRateLimitDelay) + } + + By("verifying final cluster state") + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) + + return k6JobName +} + +// runFullChaos runs the full chaos scenario with random operator, robin, redis, +// and scaling actions. 
+func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { + By("starting k6 load job") + k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + + By("executing full chaos") + endTime := time.Now().Add(chaosDuration - chaosReserveTime) + + iteration := 0 + for time.Now().Before(endTime) { + iteration++ + GinkgoWriter.Printf("=== Full chaos iteration %d ===\n", iteration) + + action := rng.Intn(4) + + switch action { + case 0: + By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) + Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed()) + case 1: + By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) + Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) + case 2: + By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) + Expect(err).NotTo(HaveOccurred()) + Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") + case 3: + By(fmt.Sprintf("iteration %d: scaling cluster", iteration)) + newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed()) + } + + By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + + // Rate limit between chaos actions + time.Sleep(chaosIterationDelay) + } + + By("verifying final cluster state") + verifyClusterHealthy(namespace, clusterName) + verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) + + return k6JobName +} diff --git a/test/chaos/suite_test.go b/test/chaos/suite_test.go index db7f297..9ba41ed 100644 --- a/test/chaos/suite_test.go +++ b/test/chaos/suite_test.go @@ -20,13 +20,14 @@ import ( ) var ( - 
k8sClientset kubernetes.Interface - dynamicClient dynamic.Interface - ctx context.Context - cancel context.CancelFunc - chaosDuration time.Duration - chaosSeed int64 - chaosReadyTimeout = 10 * time.Minute + k8sClientset kubernetes.Interface + dynamicClient dynamic.Interface + ctx context.Context + cancel context.CancelFunc + chaosDuration time.Duration + chaosSeed int64 + chaosReadyTimeout = 10 * time.Minute + skipDeleteNamespace bool ) func TestChaos(t *testing.T) { @@ -84,7 +85,11 @@ var _ = SynchronizedBeforeSuite( chaosSeed = GinkgoRandomSeed() } - GinkgoWriter.Printf("Chaos test configuration: duration=%v, seed=%d\n", chaosDuration, chaosSeed) + if os.Getenv("CHAOS_SKIP_DELETE_NAMESPACE") != "" { + skipDeleteNamespace = true + } + + GinkgoWriter.Printf("Chaos test configuration: duration=%v, seed=%d, skipDeleteNamespace=%v\n", chaosDuration, chaosSeed, skipDeleteNamespace) }, ) From cf4734ed3e4064051bc27644ad74c81758e71216 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Sat, 21 Mar 2026 17:51:26 +0100 Subject: [PATCH 12/20] test: improve chaos tests --- Makefile | 2 +- scripts/finalizers-clean.sh | 141 ++++++++++++++++++++++++++++++ scripts/get-robin-status.sh | 19 ++++ scripts/remove-test-namespaces.sh | 13 +++ test/chaos/chaos_suite_test.go | 6 +- test/chaos/suite_test.go | 2 +- 6 files changed, 178 insertions(+), 5 deletions(-) create mode 100755 scripts/finalizers-clean.sh create mode 100755 scripts/get-robin-status.sh create mode 100755 scripts/remove-test-namespaces.sh diff --git a/Makefile b/Makefile index b175ae3..95dd593 100644 --- a/Makefile +++ b/Makefile @@ -597,7 +597,7 @@ CHAOS_SEED ?= CHAOS_TIMEOUT ?= 30m CHAOS_PACKAGES ?= ./test/chaos CHAOS_TEST_OUTPUT = .local/chaos-test.json - +# CHAOS_KEEP_NAMESPACE_ON_FAILED=1 # if != "" skip delete namespace if failed .PHONY: k6-build k6-build: ## Build k6 image with xk6-redis extension $(info $(M) building k6 docker image with redis extension) diff --git a/scripts/finalizers-clean.sh 
b/scripts/finalizers-clean.sh new file mode 100755 index 0000000..f2e7ea4 --- /dev/null +++ b/scripts/finalizers-clean.sh @@ -0,0 +1,141 @@ +#!/usr/bin/env bash +# finalizers-clean.sh +# Find and remove finalizers from all namespaced resources in a namespace. +# Useful when an operator/controller was removed and CRs are stuck terminating. +# +# Requirements: kubectl, jq +# Usage: +# ./finalizers-clean.sh -n <namespace> # list what would be patched (dry-run) +# ./finalizers-clean.sh -n <namespace> -y # actually remove finalizers +# ./finalizers-clean.sh -n <namespace> -k crd1,deployments.apps # limit to specific resource types +# ./finalizers-clean.sh -n <namespace> --context <ctx> # use a specific context + +set -euo pipefail + +NS="" +CTX="" +DRYRUN=1 +KINDS_FILTER="" # comma-separated kubectl resource names, e.g. "foos.example.com,widgets.example.com" +ONLY_LIST=0 + +usage() { + cat <<'USAGE' +finalizers-clean.sh - remove finalizers from namespaced resources + +Options: + -n, --namespace Target namespace (required) + -y, --yes Apply changes (default is dry-run: list only) + -k, --kinds Comma-separated resource names to limit (e.g.
"foos.example.com,deployments.apps") + --context kubectl context to use + -h, --help Show this help + +Examples: + ./finalizers-clean.sh -n my-ns + ./finalizers-clean.sh -n my-ns -y + ./finalizers-clean.sh -n my-ns -k foos.example.com -y +USAGE +} + +# --- args --- +while (( "$#" )); do + case "${1}" in + -n|--namespace) NS="${2:-}"; shift 2;; + --context) CTX="${2:-}"; shift 2;; + -y|--yes) DRYRUN=0; shift;; + -k|--kinds) KINDS_FILTER="${2:-}"; shift 2;; + -h|--help) usage; exit 0;; + *) echo "Unknown arg: $1"; usage; exit 1;; + esac +done + +if [[ -z "${NS}" ]]; then + echo "ERROR: --namespace is required"; usage; exit 1 +fi + +if [[ "${NS}" == "kube-system" || "${NS}" == "kube-public" ]]; then + echo "Refusing to operate on protected namespace: ${NS}" >&2 + exit 1 +fi + +K="kubectl" +if [[ -n "${CTX}" ]]; then + K="${K} --context ${CTX}" +fi + +# Verify tools +command -v jq >/dev/null 2>&1 || { echo "jq is required"; exit 1; } +${K} version >/dev/null 2>&1 || { echo "kubectl not configured?"; exit 1; } + +# Build resource list +if [[ -n "${KINDS_FILTER}" ]]; then + IFS=',' read -r -a RES_LIST <<< "${KINDS_FILTER}" +else + # all namespaced, listable resources (exclude subresources like */status) + mapfile -t RES_LIST < <(${K} api-resources --namespaced=true --verbs=list --output=name | grep -v '/') +fi + +echo "Namespace: ${NS}" +echo "Context: ${CTX:-(default)}" +echo "Mode: $([[ ${DRYRUN} -eq 1 ]] && echo 'DRY-RUN (no changes)' || echo 'APPLY CHANGES')" +echo + +patched_count=0 +error_count=0 + +patch_finalizers() { + local res="$1" name="$2" + # Try several patch strategies; some APIs accept one form but not others. 
+ if ${K} -n "${NS}" patch "${res}" "${name}" --type=merge -p '{"metadata":{"finalizers":[]}}' >/dev/null 2>&1; then + return 0 + fi + if ${K} -n "${NS}" patch "${res}" "${name}" --type=merge -p '{"metadata":{"finalizers":null}}' >/dev/null 2>&1; then + return 0 + fi + if ${K} -n "${NS}" patch "${res}" "${name}" --type=json -p='[{"op":"remove","path":"/metadata/finalizers"}]' >/dev/null 2>&1; then + return 0 + fi + return 1 +} + +if [[ ${DRYRUN} -eq 0 ]]; then kubectl -n "$NS" delete pod --all --grace-period=0 --force; fi + +for res in "${RES_LIST[@]}"; do + # Fetch items with non-empty finalizers + json="$(${K} -n "${NS}" get "${res}" -o json 2>/dev/null || true)" + [[ -z "${json}" || "${json}" == "null" ]] && continue + + mapfile -t ITEMS < <(jq -r ' + .items // [] + | map(select(.metadata.finalizers and ((.metadata.finalizers | type) == "array") and (.metadata.finalizers | length > 0))) + | .[] + | [ .metadata.name, (.metadata.finalizers | join(",")) ] + | @tsv + ' <<<"${json}") + + + [[ ${#ITEMS[@]} -eq 0 ]] && continue + + echo "Resource: ${res}" + for line in "${ITEMS[@]}"; do + name="${line%%$'\t'*}" + fins="${line#*$'\t'}" + echo " - ${res}/${name} finalizers=[${fins}]" + + if [[ ${DRYRUN} -eq 0 ]]; then + if patch_finalizers "${res}" "${name}"; then + echo " ✓ removed finalizers" + patched_count=$((patched_count + 1)) + else + echo " ✗ failed to patch; you may need RBAC permissions or to use cluster-admin" + error_count=$((error_count + 1)) + fi + fi + done + echo +done + +if [[ ${DRYRUN} -eq 1 ]]; then + echo "Dry-run complete. Re-run with -y to remove the listed finalizers." +else + echo "Done. Patched objects: ${patched_count}. Failures: ${error_count}." 
+fi diff --git a/scripts/get-robin-status.sh b/scripts/get-robin-status.sh new file mode 100755 index 0000000..cee0fef --- /dev/null +++ b/scripts/get-robin-status.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Exit on errors +set -o errexit +set -o pipefail +set -o nounset + +set +o errexit +set +o pipefail +while sleep 2; do + ns=$(kubectl get namespace 2> /dev/null | grep redis | cut -f1 -d" " ) + p=$(kubectl get pod -n $ns 2> /dev/null | grep robin | cut -f1 -d" " 2> /dev/null) + echo -n "RKCLUST STATUS: " + kubectl exec $p -n $ns -- curl -s http://localhost:8080/v1/redkeycluster/status 2> /dev/null + echo + echo -n "CLUSTER STATUS: " + kubectl exec $p -n $ns -- curl -s http://localhost:8080/v1/cluster/status 2> /dev/null + echo + kubectl get rkcl -n $ns 2> /dev/null +done diff --git a/scripts/remove-test-namespaces.sh b/scripts/remove-test-namespaces.sh new file mode 100755 index 0000000..c230d9b --- /dev/null +++ b/scripts/remove-test-namespaces.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Exit on errors +set -o errexit +set -o pipefail +set -o nounset + +# scripts directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +for i in $(kubectl get ns -o json | jq -r '.items[]| select(.metadata.name | test("redkey-e2e|chaos")) |.metadata.name'); do + $SCRIPT_DIR/finalizers-clean.sh -y -n $i & + kubectl delete ns/$i & +done diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index 8e35356..58b9933 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -83,7 +83,7 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) } if skipDeleteNamespace && CurrentSpecReport().Failed() { - GinkgoWriter.Printf("CHAOS_SKIP_DELETE_NAMESPACE is set and spec failed — preserving namespace %s for inspection\n", namespaceName) + 
GinkgoWriter.Printf("CHAOS_KEEP_NAMESPACE_ON_FAILED is set and spec failed — preserving namespace %s for inspection\n", namespaceName) } else { Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) } @@ -168,7 +168,7 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=false)", Label("chaos", Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) } if skipDeleteNamespace && CurrentSpecReport().Failed() { - GinkgoWriter.Printf("CHAOS_SKIP_DELETE_NAMESPACE is set and spec failed — preserving namespace %s for inspection\n", namespaceName) + GinkgoWriter.Printf("CHAOS_KEEP_NAMESPACE_ON_FAILED is set and spec failed — preserving namespace %s for inspection\n", namespaceName) } else { Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) } @@ -239,7 +239,7 @@ var _ = Describe("Topology Corruption Recovery", Label("chaos", "topology"), fun collectDiagnostics(namespace.Name) } if skipDeleteNamespace && CurrentSpecReport().Failed() { - GinkgoWriter.Printf("CHAOS_SKIP_DELETE_NAMESPACE is set and spec failed — preserving namespace %s for inspection\n", namespaceName) + GinkgoWriter.Printf("CHAOS_KEEP_NAMESPACE_ON_FAILED is set and spec failed — preserving namespace %s for inspection\n", namespaceName) } else { Expect(framework.DeleteNamespace(ctx, k8sClientset, dynamicClient, namespace)).To(Succeed(), "failed to clean up namespace %s", namespaceName) } diff --git a/test/chaos/suite_test.go b/test/chaos/suite_test.go index 9ba41ed..cdee726 100644 --- a/test/chaos/suite_test.go +++ b/test/chaos/suite_test.go @@ -85,7 +85,7 @@ var _ = SynchronizedBeforeSuite( chaosSeed = GinkgoRandomSeed() } - if os.Getenv("CHAOS_SKIP_DELETE_NAMESPACE") != "" { + if 
os.Getenv("CHAOS_KEEP_NAMESPACE_ON_FAILED") != "" { skipDeleteNamespace = true } From 251b28752999931ceaef6182cc88a59471dac697 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Tue, 24 Mar 2026 14:45:17 +0100 Subject: [PATCH 13/20] refactor: k6.Doc same images as operator --- test/chaos/k6.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/chaos/k6.Dockerfile b/test/chaos/k6.Dockerfile index 8e6923b..6122b8f 100644 --- a/test/chaos/k6.Dockerfile +++ b/test/chaos/k6.Dockerfile @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -FROM golang:1.25-alpine AS builder +FROM golang:1.25.7-trixie AS builder # install git and basic build tools so xk6 can fetch & build extensions RUN apk add --no-cache git build-base ca-certificates @@ -12,7 +12,7 @@ RUN xk6 build \ --with github.com/grafana/xk6-redis \ --output /k6 -FROM alpine:3.23 +FROM debian:trixie-slim AS final COPY --from=builder /k6 /usr/bin/k6 COPY k6scripts/ /scripts/ ENTRYPOINT ["/usr/bin/k6"] From b876dfce445b8916af04d260b40b61c463cf3aac Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Tue, 24 Mar 2026 16:02:15 +0100 Subject: [PATCH 14/20] fix: change go version --- .tool-versions | 2 +- Dockerfile | 2 +- Makefile | 2 +- test/chaos/k6.Dockerfile | 8 ++++++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.tool-versions b/.tool-versions index de62786..7c67a05 100644 --- a/.tool-versions +++ b/.tool-versions @@ -1,3 +1,3 @@ # SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL S.A. (INDITEX S.A.) 
# SPDX-License-Identifier: Apache-2.0 -golang 1.25.7 +golang 1.25.8 diff --git a/Dockerfile b/Dockerfile index 8ec5e5b..00af994 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ ### Build stage # Define the desired Golang version -ARG GOLANG_VERSION=1.25.7 +ARG GOLANG_VERSION=1.25.8 # Use an official Golang image with a specific version based on Debian FROM golang:${GOLANG_VERSION}-trixie AS builder diff --git a/Makefile b/Makefile index 95dd593..0be9079 100644 --- a/Makefile +++ b/Makefile @@ -609,7 +609,7 @@ k6-push: k6-build ## Push k6 image to local registry docker push $(K6_IMG) .PHONY: test-chaos -test-chaos: process-manifests-crd ginkgo k6-push ## Execute chaos tests +test-chaos: process-manifests-crd ginkgo ## Execute chaos tests $(info $(M) running chaos tests...) @mkdir -p $(dir $(CHAOS_TEST_OUTPUT)) $(GINKGO_ENV) K6_IMG=$(K6_IMG) CHAOS_DURATION=$(CHAOS_DURATION) \ diff --git a/test/chaos/k6.Dockerfile b/test/chaos/k6.Dockerfile index 6122b8f..a58f16c 100644 --- a/test/chaos/k6.Dockerfile +++ b/test/chaos/k6.Dockerfile @@ -2,10 +2,14 @@ # # SPDX-License-Identifier: Apache-2.0 -FROM golang:1.25.7-trixie AS builder +# Define the desired Golang version +ARG GOLANG_VERSION=1.25.8 + + +FROM golang:${GOLANG_VERSION}-trixie AS builder # install git and basic build tools so xk6 can fetch & build extensions -RUN apk add --no-cache git build-base ca-certificates +RUN apt update && apt upgrade -y && apt install -y curl procps build-essential ca-certificates RUN go install go.k6.io/xk6/cmd/xk6@latest RUN xk6 build \ From 17263e460aaeef3895e692c5f1d6ecc75734d735 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Thu, 26 Mar 2026 12:05:33 +0100 Subject: [PATCH 15/20] test: improve chaos testing --- Makefile | 10 +- README.md | 2 +- debug.Dockerfile | 2 +- go.mod | 2 +- scripts/get-robin-status.sh | 9 +- scripts/report-test.py | 48 ++------ test/chaos/chaos_suite_test.go | 32 +++-- test/chaos/framework/k6.go | 160 ++++++++----------------- 
test/chaos/helpers_test.go | 207 +++++++++++++++------------------ test/chaos/suite_test.go | 14 +-- 10 files changed, 194 insertions(+), 292 deletions(-) diff --git a/Makefile b/Makefile index 0be9079..921fa8a 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ SHELL := /bin/bash NAME := redkey-operator VERSION := 0.1.0 ROBIN_VERSION := 0.1.0 -GOLANG_VERSION := 1.25.7 +GOLANG_VERSION := 1.25.8 DELVE_VERSION := 1.25 ## Tool Versions @@ -592,16 +592,16 @@ test-e2e-cov: process-manifests-crd ginkgo ## Execute e2e application test with ##@ Chaos Testing K6_IMG ?= localhost:5001/redkey-k6:dev -CHAOS_DURATION ?= 10m +CHAOS_ITERATIONS ?= 3 CHAOS_SEED ?= -CHAOS_TIMEOUT ?= 30m +CHAOS_TIMEOUT ?= 60m CHAOS_PACKAGES ?= ./test/chaos CHAOS_TEST_OUTPUT = .local/chaos-test.json # CHAOS_KEEP_NAMESPACE_ON_FAILED=1 # if != "" skip delete namespace if failed .PHONY: k6-build k6-build: ## Build k6 image with xk6-redis extension $(info $(M) building k6 docker image with redis extension) - docker build -t $(K6_IMG) -f test/chaos/k6.Dockerfile test/chaos + docker build --build-arg GOLANG_VERSION=$(GOLANG_VERSION) -t $(K6_IMG) -f test/chaos/k6.Dockerfile test/chaos .PHONY: k6-push k6-push: k6-build ## Push k6 image to local registry @@ -612,7 +612,7 @@ k6-push: k6-build ## Push k6 image to local registry test-chaos: process-manifests-crd ginkgo ## Execute chaos tests $(info $(M) running chaos tests...) @mkdir -p $(dir $(CHAOS_TEST_OUTPUT)) - $(GINKGO_ENV) K6_IMG=$(K6_IMG) CHAOS_DURATION=$(CHAOS_DURATION) \ + $(GINKGO_ENV) K6_IMG=$(K6_IMG) CHAOS_ITERATIONS=$(CHAOS_ITERATIONS) \ $(if $(CHAOS_SEED),CHAOS_SEED=$(CHAOS_SEED),) \ ginkgo \ --timeout=$(CHAOS_TIMEOUT) \ diff --git a/README.md b/README.md index 938521e..4699a89 100644 --- a/README.md +++ b/README.md @@ -111,7 +111,7 @@ Contributions are welcome! 
Please read our [contributing guidelines](./CONTRIBUT ## Versions -- Go version (https://github.com/golang/go): v1.25.7 +- Go version (https://github.com/golang/go): v1.25.8 - Operator SDK version (https://github.com/operator-framework/operator-sdk): v1.42.0 - Kubernetes Controller Tools version (https://github.com/kubernetes-sigs/controller-tools): v0.18.0 diff --git a/debug.Dockerfile b/debug.Dockerfile index 3a8e1c2..e57b393 100644 --- a/debug.Dockerfile +++ b/debug.Dockerfile @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 # Define the desired Golang version -ARG GOLANG_VERSION=1.25.7 +ARG GOLANG_VERSION=1.25.8 # Use an official Golang image with a specific version based on Debian FROM golang:${GOLANG_VERSION}-trixie diff --git a/go.mod b/go.mod index 8546886..da9129f 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 module github.com/inditextech/redkeyoperator -go 1.25.7 +go 1.25.8 require ( github.com/go-logr/logr v1.4.2 diff --git a/scripts/get-robin-status.sh b/scripts/get-robin-status.sh index cee0fef..f54c87e 100755 --- a/scripts/get-robin-status.sh +++ b/scripts/get-robin-status.sh @@ -6,8 +6,13 @@ set -o nounset set +o errexit set +o pipefail -while sleep 2; do - ns=$(kubectl get namespace 2> /dev/null | grep redis | cut -f1 -d" " ) + +if (( $# > 0 )); then + nss=$1 +else + nss=$(kubectl get namespace 2> /dev/null | grep -E '(redis|chaos)' | cut -f1 -d" " ) +fi +for ns in $nss; do p=$(kubectl get pod -n $ns 2> /dev/null | grep robin | cut -f1 -d" " 2> /dev/null) echo -n "RKCLUST STATUS: " kubectl exec $p -n $ns -- curl -s http://localhost:8080/v1/redkeycluster/status 2> /dev/null diff --git a/scripts/report-test.py b/scripts/report-test.py index 1b10063..a57c6b5 100755 --- a/scripts/report-test.py +++ b/scripts/report-test.py @@ -4,6 +4,7 @@ import json import sys import pathlib +import os def escape_md(text: str) -> str: """Escape special characters for markdown.""" @@ -119,48 +120,23 @@ def 
generate_md_report(data: dict) -> str: return "\n".join(lines) -def unique(path: str) -> str: - path = pathlib.Path(path) - - if not path.exists(): - return path - - stem = path.stem - suffix = path.suffix - parent = path.parent - - i = 1 - while True: - candidate = parent / f"{stem}_{i}{suffix}" - if not candidate.exists(): - return candidate - i += 1 - - - - def main(): - if len(sys.argv) < 2: - input_file = ".local/results.json" - else: - input_file = sys.argv[1] - - output_file = sys.argv[2] if len(sys.argv) > 2 else input_file.replace(".json", ".md") - - output_file = unique(output_file) + input_files = [f for f in [".local/results.json", ".local/chaos-test.json"] if os.path.exists(f)] - print(f"Reading: {input_file}") - with open(input_file, "r") as f: - data = json.load(f) + for input_file in input_files: + output_file = input_file.replace(".json", ".md") - md_content = generate_md_report(data) + print(f"Reading: {input_file}") + with open(input_file, "r") as f: + data = json.load(f) - print(f"Writing: {output_file}") - with open(output_file, "w") as f: - f.write(md_content) + md_content = generate_md_report(data) - print("Done!") + print(f"Writing: {output_file}") + with open(output_file, "w") as f: + f.write(md_content) + print("Done!") if __name__ == "__main__": main() diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index 58b9933..7f935e1 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -24,8 +24,6 @@ const ( // Chaos timing constants chaosIterationDelay = 5 * time.Second // Delay between chaos iterations chaosRateLimitDelay = 10 * time.Second // Delay for rate limiting between heavy operations - chaosReserveTime = 1 * time.Minute // Time reserved at end of chaos for final checks - k6CompletionBuffer = 5 * time.Minute // Buffer time for k6 job completion operatorReadyTimeout = 2 * time.Minute // Timeout for operator to become ready operatorPollInterval = 5 * time.Second // Poll interval for 
operator readiness scaleAckTimeout = 30 * time.Second // Timeout for StatefulSet to acknowledge scale @@ -41,7 +39,7 @@ const ( var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", "load"), func() { var ( namespace *corev1.Namespace - k6JobName string + k6DepName string rng *rand.Rand ) @@ -78,9 +76,8 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", if CurrentSpecReport().Failed() && namespaceName != "" { collectDiagnostics(namespace.Name) } - if k6JobName != "" { - Expect(namespaceName).NotTo(BeEmpty(), "k6 job cleanup requires a namespace") - Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) + if k6DepName != "" && namespaceName != "" { + Expect(framework.StopK6Load(ctx, k8sClientset, namespaceName, k6DepName)).To(Succeed(), "failed to clean up k6 deployment %s in namespace %s", k6DepName, namespaceName) } if skipDeleteNamespace && CurrentSpecReport().Failed() { GinkgoWriter.Printf("CHAOS_KEEP_NAMESPACE_ON_FAILED is set and spec failed — preserving namespace %s for inspection\n", namespaceName) @@ -94,28 +91,28 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", // PurgeKeysOnRebalance=true --> the StatefulSet is recreated when scaling // ================================================================================== It("survives continuous scaling and pod deletion while handling traffic", func() { - k6JobName = runScalingChaos(rng, namespace.Name, clusterName) + k6DepName = runScalingChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 2: Chaos with Operator Deletion // ================================================================================== It("recovers when operator pod is deleted during chaos", func() { - k6JobName = runOperatorDeletionChaos(rng, 
namespace.Name, clusterName) + k6DepName = runOperatorDeletionChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 3: Chaos with Robin Deletion // ================================================================================== It("recovers when robin pods are deleted during chaos", func() { - k6JobName = runRobinDeletionChaos(rng, namespace.Name, clusterName) + k6DepName = runRobinDeletionChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 4: Full Chaos (Operator + Robin + Redis) // ================================================================================== It("recovers from full chaos deleting operator, robin, and redis pods", func() { - k6JobName = runFullChaos(rng, namespace.Name, clusterName) + k6DepName = runFullChaos(rng, namespace.Name, clusterName) }) }) @@ -126,7 +123,7 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=false)", Label("chaos", "load", "nopurge"), func() { var ( namespace *corev1.Namespace - k6JobName string + k6DepName string rng *rand.Rand ) @@ -163,9 +160,8 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=false)", Label("chaos", if CurrentSpecReport().Failed() && namespaceName != "" { collectDiagnostics(namespace.Name) } - if k6JobName != "" { - Expect(namespaceName).NotTo(BeEmpty(), "k6 job cleanup requires a namespace") - Expect(framework.DeleteK6Job(ctx, k8sClientset, namespaceName, k6JobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", k6JobName, namespaceName) + if k6DepName != "" && namespaceName != "" { + Expect(framework.StopK6Load(ctx, k8sClientset, namespaceName, k6DepName)).To(Succeed(), "failed to clean up k6 deployment %s in namespace %s", k6DepName, namespaceName) } if skipDeleteNamespace && CurrentSpecReport().Failed() { 
GinkgoWriter.Printf("CHAOS_KEEP_NAMESPACE_ON_FAILED is set and spec failed — preserving namespace %s for inspection\n", namespaceName) @@ -178,28 +174,28 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=false)", Label("chaos", // Scenario 1 (NoPurge): Continuous Scaling Under Load and Chaos // ================================================================================== It("survives continuous scaling and pod deletion while handling traffic without purge", func() { - k6JobName = runScalingChaos(rng, namespace.Name, clusterName) + k6DepName = runScalingChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 2 (NoPurge): Chaos with Operator Deletion // ================================================================================== It("recovers when operator pod is deleted during chaos without purge", func() { - k6JobName = runOperatorDeletionChaos(rng, namespace.Name, clusterName) + k6DepName = runOperatorDeletionChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 3 (NoPurge): Chaos with Robin Deletion // ================================================================================== It("recovers when robin pods are deleted during chaos without purge", func() { - k6JobName = runRobinDeletionChaos(rng, namespace.Name, clusterName) + k6DepName = runRobinDeletionChaos(rng, namespace.Name, clusterName) }) // ================================================================================== // Scenario 4 (NoPurge): Full Chaos (Operator + Robin + Redis) // ================================================================================== It("recovers from full chaos deleting operator, robin, and redis pods without purge", func() { - k6JobName = runFullChaos(rng, namespace.Name, clusterName) + k6DepName = runFullChaos(rng, namespace.Name, clusterName) }) }) diff --git 
a/test/chaos/framework/k6.go b/test/chaos/framework/k6.go index 49bf8a3..4c15427 100644 --- a/test/chaos/framework/k6.go +++ b/test/chaos/framework/k6.go @@ -12,7 +12,7 @@ import ( "strings" "time" - batchv1 "k8s.io/api/batch/v1" + appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -23,12 +23,20 @@ import ( const ( defaultK6Image = "localhost:5001/redkey-k6:dev" - k6JobTimeout = 30 * time.Minute k6StartupTimeout = 2 * time.Minute + k6StopTimeout = 30 * time.Second defaultK6VUs = 10 k6LogTailLines = int64(200) + // k6 runs with a very long duration so it keeps generating load until + // the test explicitly stops it by deleting the deployment. + k6RunDuration = "24h" ) +// K6LoadSelector returns the label selector for k6 load pods. +func K6LoadSelector() string { + return "app=k6-load" +} + // GetK6Image returns the k6 image from environment or default. func GetK6Image() string { if img := os.Getenv("K6_IMG"); img != "" { @@ -37,9 +45,10 @@ func GetK6Image() string { return defaultK6Image } -// StartK6LoadJob creates and starts a k6 Job for load testing. -// Returns the job name. -func StartK6LoadJob(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, duration time.Duration, vus int) (string, error) { +// StartK6LoadDeployment creates a k6 Deployment that generates continuous +// load against the Redis cluster. The deployment keeps running until +// explicitly stopped via StopK6Load. Returns the deployment name. 
+func StartK6LoadDeployment(ctx context.Context, clientset kubernetes.Interface, namespace, clusterName string, vus int) (string, error) { if vus <= 0 { vus = defaultK6VUs } @@ -50,24 +59,28 @@ func StartK6LoadJob(ctx context.Context, clientset kubernetes.Interface, namespa return "", fmt.Errorf("failed to get redis hosts: %w", err) } - jobName := fmt.Sprintf("k6-load-%s", clusterName) + deployName := fmt.Sprintf("k6-load-%s", clusterName) + labels := map[string]string{ + "app": "k6-load", + "redkey-cluster-name": clusterName, + } - job := &batchv1.Job{ + dep := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ - Name: jobName, + Name: deployName, Namespace: namespace, }, - Spec: batchv1.JobSpec{ - BackoffLimit: ptr.To(int32(0)), + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.To(int32(1)), + Selector: &metav1.LabelSelector{ + MatchLabels: labels, + }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - "app": "k6-load", - "redkey-cluster-name": clusterName, - }, + Labels: labels, }, Spec: corev1.PodSpec{ - RestartPolicy: corev1.RestartPolicyNever, + RestartPolicy: corev1.RestartPolicyAlways, Containers: []corev1.Container{ { Name: "k6", @@ -75,7 +88,7 @@ func StartK6LoadJob(ctx context.Context, clientset kubernetes.Interface, namespa Args: []string{ "run", "/scripts/test-300k.js", - "--duration", formatDuration(duration), + "--duration", k6RunDuration, "--vus", fmt.Sprintf("%d", vus), }, Env: []corev1.EnvVar{ @@ -91,26 +104,26 @@ func StartK6LoadJob(ctx context.Context, clientset kubernetes.Interface, namespa }, } - // Delete existing job if present + // Delete existing deployment if present propagation := metav1.DeletePropagationForeground - _ = clientset.BatchV1().Jobs(namespace).Delete(ctx, jobName, metav1.DeleteOptions{ + _ = clientset.AppsV1().Deployments(namespace).Delete(ctx, deployName, metav1.DeleteOptions{ PropagationPolicy: &propagation, }) // Wait for deletion - _ = wait.PollUntilContextTimeout(ctx, 
time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { - _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{}) + _ = wait.PollUntilContextTimeout(ctx, time.Second, k6StopTimeout, true, func(ctx context.Context) (bool, error) { + _, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deployName, metav1.GetOptions{}) return errors.IsNotFound(err), nil }) - if _, err := clientset.BatchV1().Jobs(namespace).Create(ctx, job, metav1.CreateOptions{}); err != nil { - return "", fmt.Errorf("failed to create k6 job: %w", err) + if _, err := clientset.AppsV1().Deployments(namespace).Create(ctx, dep, metav1.CreateOptions{}); err != nil { + return "", fmt.Errorf("failed to create k6 deployment: %w", err) } - // Wait for job pod to start + // Wait for at least one pod to be running err = wait.PollUntilContextTimeout(ctx, 2*time.Second, k6StartupTimeout, true, func(ctx context.Context) (bool, error) { pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: "app=k6-load", + LabelSelector: K6LoadSelector(), }) if err != nil { return false, nil @@ -123,55 +136,21 @@ func StartK6LoadJob(ctx context.Context, clientset kubernetes.Interface, namespa return false, nil }) if err != nil { - return jobName, fmt.Errorf("k6 job pod did not start: %w", err) - } - - return jobName, nil -} - -// WaitForK6JobCompletion waits for the k6 job to complete successfully. 
-func WaitForK6JobCompletion(ctx context.Context, clientset kubernetes.Interface, namespace, jobName string, timeout time.Duration) error { - if timeout == 0 { - timeout = k6JobTimeout + return deployName, fmt.Errorf("k6 deployment pod did not start: %w", err) } - return wait.PollUntilContextTimeout(ctx, 5*time.Second, timeout, true, func(ctx context.Context) (bool, error) { - job, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{}) - if err != nil { - if errors.IsNotFound(err) { - return false, fmt.Errorf("k6 job not found") - } - return false, nil - } - - for _, condition := range job.Status.Conditions { - if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue { - // k6 exited 0, meaning all thresholds (including checks rate>0.99) passed. - // Transient [K6_ERROR] entries are expected during chaos scenarios - // (pod deletions, scaling) and are already accounted for by the threshold. - return true, nil - } - if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { - logs, logsErr := GetK6JobLogs(ctx, clientset, namespace, jobName) - if logsErr != nil { - return false, fmt.Errorf("k6 job failed: %s (log inspection failed: %v)", condition.Message, logsErr) - } - return false, fmt.Errorf("k6 job failed: %s; logs: %s", condition.Message, summarizeK6Logs(logs)) - } - } - - return false, nil - }) + return deployName, nil } -// DeleteK6Job deletes the k6 job and its pods. -func DeleteK6Job(ctx context.Context, clientset kubernetes.Interface, namespace, jobName string) error { - if jobName == "" { +// StopK6Load deletes the k6 load deployment and waits for its pods to +// terminate. It is safe to call with an empty name. 
+func StopK6Load(ctx context.Context, clientset kubernetes.Interface, namespace, deployName string) error { + if deployName == "" { return nil } propagation := metav1.DeletePropagationForeground - err := clientset.BatchV1().Jobs(namespace).Delete(ctx, jobName, metav1.DeleteOptions{ + err := clientset.AppsV1().Deployments(namespace).Delete(ctx, deployName, metav1.DeleteOptions{ PropagationPolicy: &propagation, }) if errors.IsNotFound(err) { @@ -181,8 +160,8 @@ func DeleteK6Job(ctx context.Context, clientset kubernetes.Interface, namespace, return err } - return wait.PollUntilContextTimeout(ctx, time.Second, 30*time.Second, true, func(ctx context.Context) (bool, error) { - _, err := clientset.BatchV1().Jobs(namespace).Get(ctx, jobName, metav1.GetOptions{}) + return wait.PollUntilContextTimeout(ctx, time.Second, k6StopTimeout, true, func(ctx context.Context) (bool, error) { + _, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deployName, metav1.GetOptions{}) if errors.IsNotFound(err) { return true, nil } @@ -193,25 +172,15 @@ func DeleteK6Job(ctx context.Context, clientset kubernetes.Interface, namespace, }) } -// GetK6JobLogs returns the logs from the k6 job pod. -func GetK6JobLogs(ctx context.Context, clientset kubernetes.Interface, namespace, jobName string) (string, error) { +// GetK6Logs returns the logs from a running k6 load pod. 
+func GetK6Logs(ctx context.Context, clientset kubernetes.Interface, namespace string) (string, error) { pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: fmt.Sprintf("app=k6-load,job-name=%s", jobName), + LabelSelector: K6LoadSelector(), }) if err != nil { return "", err } - if len(pods.Items) == 0 { - // Try alternative label selector - pods, err = clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ - LabelSelector: "app=k6-load", - }) - if err != nil { - return "", err - } - } - if len(pods.Items) == 0 { return "", fmt.Errorf("no k6 pods found") } @@ -225,13 +194,13 @@ func GetK6JobLogs(ctx context.Context, clientset kubernetes.Interface, namespace req := clientset.CoreV1().Pods(namespace).GetLogs(pod.Name, opts) stream, err := req.Stream(ctx) if err != nil { - return fmt.Sprintf("Pod %s completed but failed to get logs: %v", pod.Name, err), nil + return fmt.Sprintf("Pod %s running but failed to get logs: %v", pod.Name, err), nil } defer stream.Close() var buf strings.Builder if _, err := io.Copy(&buf, stream); err != nil { - return fmt.Sprintf("Pod %s completed but failed to read logs: %v", pod.Name, err), nil + return fmt.Sprintf("Pod %s running but failed to read logs: %v", pod.Name, err), nil } return buf.String(), nil @@ -267,30 +236,3 @@ func getRedisHosts(ctx context.Context, clientset kubernetes.Interface, namespac return strings.Join(hosts, ","), nil } - -// formatDuration formats a duration for k6 (e.g., "10m", "1h30m"). 
-func formatDuration(d time.Duration) string { - if d < time.Minute { - return fmt.Sprintf("%ds", int(d.Seconds())) - } - if d < time.Hour { - return fmt.Sprintf("%dm", int(d.Minutes())) - } - hours := int(d.Hours()) - minutes := int(d.Minutes()) % 60 - if minutes == 0 { - return fmt.Sprintf("%dh", hours) - } - return fmt.Sprintf("%dh%dm", hours, minutes) -} - -func summarizeK6Logs(logs string) string { - trimmed := strings.TrimSpace(logs) - if trimmed == "" { - return "" - } - if len(trimmed) > 1500 { - return trimmed[len(trimmed)-1500:] - } - return trimmed -} diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index 3e0cf4e..81daba4 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -15,32 +15,19 @@ import ( "github.com/inditextech/redkeyoperator/test/chaos/framework" ) -// startK6OrFail starts a k6 load job and fails the test if it errors. -func startK6OrFail(namespace, clusterName string, duration time.Duration, vus int) string { - jobName, err := framework.StartK6LoadJob(ctx, k8sClientset, namespace, clusterName, duration, vus) - Expect(err).NotTo(HaveOccurred(), "failed to start k6 job") - return jobName +// startK6OrFail starts a k6 load deployment and fails the test if it errors. +func startK6OrFail(namespace, clusterName string, vus int) string { + depName, err := framework.StartK6LoadDeployment(ctx, k8sClientset, namespace, clusterName, vus) + Expect(err).NotTo(HaveOccurred(), "failed to start k6 load deployment") + return depName } -// cleanupK6Job deletes a k6 job and fails the spec if cleanup fails. -func cleanupK6Job(namespace, jobName string) { - if jobName == "" { +// stopK6Load stops the k6 load deployment and fails the spec if cleanup fails. 
+func stopK6Load(namespace, depName string) { + if depName == "" { return } - Expect(framework.DeleteK6Job(ctx, k8sClientset, namespace, jobName)).To(Succeed(), "failed to clean up k6 job %s in namespace %s", jobName, namespace) -} - -// chaosLoop runs a chaos function repeatedly until the duration expires. -// Reserves time at the end for final checks. -func chaosLoop(duration time.Duration, chaosFn func(iteration int)) { - endTime := time.Now().Add(duration - chaosReserveTime) - iteration := 0 - - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) - chaosFn(iteration) - } + Expect(framework.StopK6Load(ctx, k8sClientset, namespace, depName)).To(Succeed(), "failed to stop k6 deployment %s in namespace %s", depName, namespace) } // verifyClusterHealthy runs all cluster health checks. @@ -55,182 +42,178 @@ func verifyClusterHealthy(namespace, clusterName string) { Expect(framework.AssertNoNodesInFailState(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) } -// verifyK6Completed waits for k6 job to complete successfully. -func verifyK6Completed(namespace, jobName string, timeout time.Duration) { - By("verifying k6 job completed successfully") - Expect(framework.WaitForK6JobCompletion(ctx, k8sClientset, namespace, jobName, timeout)).To(Succeed()) -} - -// waitForStatefulSetReplicas polls until the StatefulSet has the expected replica -// count and at least that many pods exist. 
-func waitForStatefulSetReplicas(namespace, clusterName string, expectedReplicas int32) { - Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, expectedReplicas, scaleAckTimeout, scalePollInterval)).To(Succeed(), - fmt.Sprintf("StatefulSet should have %d replicas with pods", expectedReplicas)) -} - // --------------------------------------------------------------------------- // Shared chaos scenario bodies // --------------------------------------------------------------------------- // Each function runs the full chaos scenario (start k6, chaos loop, verify) -// and returns the k6 job name so the caller can clean it up. +// and returns the k6 deployment name so the caller can clean it up. // runScalingChaos runs the continuous-scaling-and-pod-deletion scenario. func runScalingChaos(rng *rand.Rand, namespace, clusterName string) string { - By("starting k6 load job") - k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + By("starting k6 load deployment") + k6DepName := startK6OrFail(namespace, clusterName, defaultVUs) - By("executing chaos loop") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) + By(fmt.Sprintf("executing chaos loop (%d iterations)", chaosIterations)) - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + for i := 1; i <= chaosIterations; i++ { + GinkgoWriter.Printf("=== Chaos iteration %d/%d ===\n", i, chaosIterations) - By(fmt.Sprintf("iteration %d: scaling cluster up", iteration)) + By(fmt.Sprintf("iteration %d/%d: scaling cluster up", i, chaosIterations)) newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) - Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed()) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to scale cluster up to %d", i, 
chaosIterations, newSize)) - Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed()) + Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: StatefulSet did not acknowledge scale to %d", i, chaosIterations, newSize)) - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + By(fmt.Sprintf("iteration %d/%d: deleting random redis pods", i, chaosIterations)) deleteCount := rng.Intn(int(newSize)/2) + 1 deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, deleteCount, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) + Expect(deleted).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) GinkgoWriter.Printf("Deleted pods: %v\n", deleted) - By(fmt.Sprintf("iteration %d: waiting for cluster recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: waiting for cluster recovery", i, chaosIterations)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: cluster did not recover after pod deletion", i, chaosIterations)) - By(fmt.Sprintf("iteration %d: scaling cluster down", iteration)) + By(fmt.Sprintf("iteration %d/%d: scaling cluster down", i, chaosIterations)) downSize := int32(rng.Intn(3) + minPrimaries) - Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, downSize)).To(Succeed()) + Expect(framework.ScaleCluster(ctx, 
dynamicClient, namespace, clusterName, downSize)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to scale cluster down to %d", i, chaosIterations, downSize)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: cluster did not become ready after scaling down", i, chaosIterations)) } + By("stopping k6 load") + stopK6Load(namespace, k6DepName) + By("verifying final cluster state") verifyClusterHealthy(namespace, clusterName) - verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) - return k6JobName + return k6DepName } // runOperatorDeletionChaos runs the operator-pod-deletion scenario. func runOperatorDeletionChaos(rng *rand.Rand, namespace, clusterName string) string { - By("starting k6 load job") - k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + By("starting k6 load deployment") + k6DepName := startK6OrFail(namespace, clusterName, defaultVUs) - By("executing chaos with operator deletion") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) + By(fmt.Sprintf("executing chaos with operator deletion (%d iterations)", chaosIterations)) - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + for i := 1; i <= chaosIterations; i++ { + GinkgoWriter.Printf("=== Chaos iteration %d/%d ===\n", i, chaosIterations) - By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) - Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: deleting operator pod", i, chaosIterations)) + Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to delete operator pods", i, 
chaosIterations)) - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + By(fmt.Sprintf("iteration %d/%d: deleting random redis pods", i, chaosIterations)) deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) + Expect(deleted).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) - By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: cluster did not recover after operator deletion", i, chaosIterations)) // Rate limit between iterations time.Sleep(chaosRateLimitDelay) } + By("stopping k6 load") + stopK6Load(namespace, k6DepName) + By("verifying final cluster state") Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) - verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) - return k6JobName + return k6DepName } // runRobinDeletionChaos runs the robin-pod-deletion scenario. 
func runRobinDeletionChaos(rng *rand.Rand, namespace, clusterName string) string { - By("starting k6 load job") - k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + By("starting k6 load deployment") + k6DepName := startK6OrFail(namespace, clusterName, defaultVUs) - By("executing chaos with robin deletion") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) + By(fmt.Sprintf("executing chaos with robin deletion (%d iterations)", chaosIterations)) - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Chaos iteration %d ===\n", iteration) + for i := 1; i <= chaosIterations; i++ { + GinkgoWriter.Printf("=== Chaos iteration %d/%d ===\n", i, chaosIterations) - By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: deleting robin pods", i, chaosIterations)) + Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to delete robin pods", i, chaosIterations)) - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + By(fmt.Sprintf("iteration %d/%d: deleting random redis pods", i, chaosIterations)) deletedRedis, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deletedRedis).NotTo(BeEmpty(), "expected at least one redis pod deletion") + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) + Expect(deletedRedis).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) - By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + 
By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: cluster did not recover after robin deletion", i, chaosIterations)) // Rate limit between iterations time.Sleep(chaosRateLimitDelay) } + By("stopping k6 load") + stopK6Load(namespace, k6DepName) + By("verifying final cluster state") Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) - verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) - return k6JobName + return k6DepName } // runFullChaos runs the full chaos scenario with random operator, robin, redis, // and scaling actions. func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { - By("starting k6 load job") - k6JobName := startK6OrFail(namespace, clusterName, chaosDuration, defaultVUs) + By("starting k6 load deployment") + k6DepName := startK6OrFail(namespace, clusterName, defaultVUs) - By("executing full chaos") - endTime := time.Now().Add(chaosDuration - chaosReserveTime) + By(fmt.Sprintf("executing full chaos (%d iterations)", chaosIterations)) - iteration := 0 - for time.Now().Before(endTime) { - iteration++ - GinkgoWriter.Printf("=== Full chaos iteration %d ===\n", iteration) + for i := 1; i <= chaosIterations; i++ { + GinkgoWriter.Printf("=== Full chaos iteration %d/%d ===\n", i, chaosIterations) action := rng.Intn(4) switch action { case 0: - By(fmt.Sprintf("iteration %d: deleting operator pod", iteration)) - Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: deleting operator pod", i, chaosIterations)) + Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to delete operator pods", i, chaosIterations)) case 1: - 
By(fmt.Sprintf("iteration %d: deleting robin pods", iteration)) - Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: deleting robin pods", i, chaosIterations)) + Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to delete robin pods", i, chaosIterations)) case 2: - By(fmt.Sprintf("iteration %d: deleting random redis pods", iteration)) + By(fmt.Sprintf("iteration %d/%d: deleting random redis pods", i, chaosIterations)) deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) - Expect(err).NotTo(HaveOccurred()) - Expect(deleted).NotTo(BeEmpty(), "expected at least one redis pod deletion") + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) + Expect(deleted).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) case 3: - By(fmt.Sprintf("iteration %d: scaling cluster", iteration)) + By(fmt.Sprintf("iteration %d/%d: scaling cluster", i, chaosIterations)) newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) - Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed()) + Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: failed to scale cluster to %d", i, chaosIterations, newSize)) } - By(fmt.Sprintf("iteration %d: waiting for recovery", iteration)) - Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed()) + By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) + Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: cluster 
did not recover after chaos action", i, chaosIterations)) // Rate limit between chaos actions time.Sleep(chaosIterationDelay) } + By("stopping k6 load") + stopK6Load(namespace, k6DepName) + By("verifying final cluster state") verifyClusterHealthy(namespace, clusterName) - verifyK6Completed(namespace, k6JobName, chaosDuration+k6CompletionBuffer) - return k6JobName + return k6DepName } diff --git a/test/chaos/suite_test.go b/test/chaos/suite_test.go index cdee726..75100a9 100644 --- a/test/chaos/suite_test.go +++ b/test/chaos/suite_test.go @@ -24,7 +24,7 @@ var ( dynamicClient dynamic.Interface ctx context.Context cancel context.CancelFunc - chaosDuration time.Duration + chaosIterations int chaosSeed int64 chaosReadyTimeout = 10 * time.Minute skipDeleteNamespace bool @@ -72,7 +72,7 @@ var _ = SynchronizedBeforeSuite( ctx, cancel = context.WithCancel(context.Background()) - chaosDuration = parseDuration(os.Getenv("CHAOS_DURATION"), 10*time.Minute) + chaosIterations = parseInt(os.Getenv("CHAOS_ITERATIONS"), 3) if seedStr := os.Getenv("CHAOS_SEED"); seedStr != "" { seed, err := strconv.ParseInt(seedStr, 10, 64) @@ -89,7 +89,7 @@ var _ = SynchronizedBeforeSuite( skipDeleteNamespace = true } - GinkgoWriter.Printf("Chaos test configuration: duration=%v, seed=%d, skipDeleteNamespace=%v\n", chaosDuration, chaosSeed, skipDeleteNamespace) + GinkgoWriter.Printf("Chaos test configuration: iterations=%d, seed=%d, skipDeleteNamespace=%v\n", chaosIterations, chaosSeed, skipDeleteNamespace) }, ) @@ -106,14 +106,14 @@ var _ = SynchronizedAfterSuite( }, ) -// parseDuration parses a duration string and returns a default if parsing fails. -func parseDuration(s string, defaultVal time.Duration) time.Duration { +// parseInt parses an integer string and returns a default if parsing fails. 
+func parseInt(s string, defaultVal int) int { if s == "" { return defaultVal } - d, err := time.ParseDuration(s) + v, err := strconv.Atoi(s) if err != nil { return defaultVal } - return d + return v } From 2c6ac65ed84609f52c12bde065a7c5df519ca730 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Fri, 27 Mar 2026 12:16:37 +0100 Subject: [PATCH 16/20] test: improve chaos test --- Makefile | 2 +- test/chaos/chaos_suite_test.go | 4 ++-- test/chaos/framework/readiness.go | 9 ++++++++- test/chaos/framework/redis_chaos.go | 2 ++ test/chaos/helpers_test.go | 17 +++++++++++++---- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 921fa8a..9fee15b 100644 --- a/Makefile +++ b/Makefile @@ -592,7 +592,7 @@ test-e2e-cov: process-manifests-crd ginkgo ## Execute e2e application test with ##@ Chaos Testing K6_IMG ?= localhost:5001/redkey-k6:dev -CHAOS_ITERATIONS ?= 3 +CHAOS_ITERATIONS ?= 5 CHAOS_SEED ?= CHAOS_TIMEOUT ?= 60m CHAOS_PACKAGES ?= ./test/chaos diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index 7f935e1..53ee260 100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -91,7 +91,7 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=true)", Label("chaos", // PurgeKeysOnRebalance=true --> the StatefulSet is recreated when scaling // ================================================================================== It("survives continuous scaling and pod deletion while handling traffic", func() { - k6DepName = runScalingChaos(rng, namespace.Name, clusterName) + k6DepName = runScalingChaos(rng, namespace.Name, clusterName, true) }) // ================================================================================== @@ -174,7 +174,7 @@ var _ = Describe("Chaos Under Load (PurgeKeysOnRebalance=false)", Label("chaos", // Scenario 1 (NoPurge): Continuous Scaling Under Load and Chaos // ================================================================================== 
It("survives continuous scaling and pod deletion while handling traffic without purge", func() { - k6DepName = runScalingChaos(rng, namespace.Name, clusterName) + k6DepName = runScalingChaos(rng, namespace.Name, clusterName, false) }) // ================================================================================== diff --git a/test/chaos/framework/readiness.go b/test/chaos/framework/readiness.go index bb11378..89446dc 100644 --- a/test/chaos/framework/readiness.go +++ b/test/chaos/framework/readiness.go @@ -65,7 +65,14 @@ func WaitForChaosReady(ctx context.Context, dc dynamic.Interface, clientset kube return false, nil } - // 3. For each running pod, verify cluster health + // 3. Verify the pod count matches what the spec expects. + // This prevents a false-positive Ready when the operator hasn't yet + // processed a spec.primaries change (race between CR update and reconcile). + if len(pods.Items) != cluster.Spec.NodesNeeded() { + return false, nil + } + + // 4. For each running pod, verify cluster health for _, pod := range pods.Items { if pod.Status.Phase != corev1.PodRunning { return false, nil diff --git a/test/chaos/framework/redis_chaos.go b/test/chaos/framework/redis_chaos.go index 8e0e3f2..5cd35cd 100644 --- a/test/chaos/framework/redis_chaos.go +++ b/test/chaos/framework/redis_chaos.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/onsi/ginkgo/v2" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" @@ -116,6 +117,7 @@ func ScaleCluster(ctx context.Context, dc dynamic.Interface, namespace, clusterN // keep polling—the operator will eventually reconcile the cluster back // to Ready status. 
if isNotReadyValidationError(err) { + ginkgo.GinkgoWriter.Printf("ScaleCluster %s/%s to %d primaries: blocked by validation (not Ready): %v\n", namespace, clusterName, primaries, err) return false, nil } return false, err diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index 81daba4..a16e567 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -49,7 +49,11 @@ func verifyClusterHealthy(namespace, clusterName string) { // and returns the k6 deployment name so the caller can clean it up. // runScalingChaos runs the continuous-scaling-and-pod-deletion scenario. -func runScalingChaos(rng *rand.Rand, namespace, clusterName string) string { +// When purgeKeysOnRebalance is true the operator deletes and recreates the +// StatefulSet on scaling, so we must wait for the new StatefulSet to +// acknowledge the target replica count before proceeding. When false the +// StatefulSet is updated in place and the acknowledgment step is skipped. +func runScalingChaos(rng *rand.Rand, namespace, clusterName string, purgeKeysOnRebalance bool) string { By("starting k6 load deployment") k6DepName := startK6OrFail(namespace, clusterName, defaultVUs) @@ -63,8 +67,13 @@ func runScalingChaos(rng *rand.Rand, namespace, clusterName string) string { Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to scale cluster up to %d", i, chaosIterations, newSize)) - Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed(), - fmt.Sprintf("iteration %d/%d: StatefulSet did not acknowledge scale to %d", i, chaosIterations, newSize)) + // When purge is enabled the operator deletes and recreates the + // StatefulSet, so we must wait for the new one to appear with the + // correct replica count before interacting with pods. 
+ if purgeKeysOnRebalance { + Expect(framework.WaitForScaleAck(ctx, k8sClientset, namespace, clusterName, newSize, scaleAckTimeout, scalePollInterval)).To(Succeed(), + fmt.Sprintf("iteration %d/%d: StatefulSet did not acknowledge scale to %d", i, chaosIterations, newSize)) + } By(fmt.Sprintf("iteration %d/%d: deleting random redis pods", i, chaosIterations)) deleteCount := rng.Intn(int(newSize)/2) + 1 @@ -78,7 +87,7 @@ func runScalingChaos(rng *rand.Rand, namespace, clusterName string) string { fmt.Sprintf("iteration %d/%d: cluster did not recover after pod deletion", i, chaosIterations)) By(fmt.Sprintf("iteration %d/%d: scaling cluster down", i, chaosIterations)) - downSize := int32(rng.Intn(3) + minPrimaries) + downSize := int32(minPrimaries - rng.Intn(3)) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, downSize)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to scale cluster down to %d", i, chaosIterations, downSize)) From 1ac1b298d3125f40fb87209aa51ae27bf7ab971a Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Fri, 27 Mar 2026 12:57:52 +0100 Subject: [PATCH 17/20] test: improve chaos test --- test/chaos/helpers_test.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index a16e567..93854aa 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -176,8 +176,10 @@ func runRobinDeletionChaos(rng *rand.Rand, namespace, clusterName string) string return k6DepName } -// runFullChaos runs the full chaos scenario with random operator, robin, redis, -// and scaling actions. +// runFullChaos runs the full chaos scenario firing multiple actions per +// iteration — operator deletion, robin deletion, redis pod deletion, and +// scaling — without waiting for recovery between them. This tests the +// operator's ability to heal from accumulated, overlapping failures. 
func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { By("starting k6 load deployment") k6DepName := startK6OrFail(namespace, clusterName, defaultVUs) @@ -187,23 +189,29 @@ func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { for i := 1; i <= chaosIterations; i++ { GinkgoWriter.Printf("=== Full chaos iteration %d/%d ===\n", i, chaosIterations) - action := rng.Intn(4) + // Each action is independently chosen so multiple (or all) can fire + // in the same iteration, accumulating failures before recovery. - switch action { - case 0: + if rng.Intn(2) == 0 { By(fmt.Sprintf("iteration %d/%d: deleting operator pod", i, chaosIterations)) Expect(framework.DeleteOperatorPods(ctx, k8sClientset, namespace)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to delete operator pods", i, chaosIterations)) - case 1: + } + + if rng.Intn(2) == 0 { By(fmt.Sprintf("iteration %d/%d: deleting robin pods", i, chaosIterations)) Expect(framework.DeleteRobinPods(ctx, k8sClientset, namespace, clusterName)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to delete robin pods", i, chaosIterations)) - case 2: + } + + if rng.Intn(2) == 0 { By(fmt.Sprintf("iteration %d/%d: deleting random redis pods", i, chaosIterations)) deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) Expect(deleted).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) - case 3: + } + + if rng.Intn(2) == 0 { By(fmt.Sprintf("iteration %d/%d: scaling cluster", i, chaosIterations)) newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed(), @@ -212,9 +220,9 @@ func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { 
By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), - fmt.Sprintf("iteration %d/%d: cluster did not recover after chaos action", i, chaosIterations)) + fmt.Sprintf("iteration %d/%d: cluster did not recover after chaos actions", i, chaosIterations)) - // Rate limit between chaos actions + // Rate limit between chaos iterations time.Sleep(chaosIterationDelay) } From 8ff9d12bc46fd3c45be08d4e791cf751abe31c24 Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Fri, 27 Mar 2026 14:13:10 +0100 Subject: [PATCH 18/20] test: improve chaos test --- test/chaos/helpers_test.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index 93854aa..c5d9827 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -122,6 +122,7 @@ func runOperatorDeletionChaos(rng *rand.Rand, namespace, clusterName string) str deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) Expect(deleted).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) + GinkgoWriter.Printf("Deleted pods: %v\n", deleted) By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), @@ -158,6 +159,7 @@ func runRobinDeletionChaos(rng *rand.Rand, namespace, clusterName string) string deletedRedis, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) Expect(deletedRedis).NotTo(BeEmpty(), 
fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) + GinkgoWriter.Printf("Deleted pods: %v\n", deletedRedis) By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), @@ -209,6 +211,7 @@ func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { deleted, err := framework.DeleteRandomRedisPods(ctx, k8sClientset, namespace, clusterName, 2, rng) Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to delete random redis pods", i, chaosIterations)) Expect(deleted).NotTo(BeEmpty(), fmt.Sprintf("iteration %d/%d: expected at least one redis pod deletion", i, chaosIterations)) + GinkgoWriter.Printf("Deleted pods: %v\n", deleted) } if rng.Intn(2) == 0 { From c4b7d73df773e42d5562163d9b8bf6b2f5173f9a Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Sat, 28 Mar 2026 17:02:37 +0100 Subject: [PATCH 19/20] test: improve chaos test --- Makefile | 5 ++-- test/chaos/chaos_suite_test.go | 9 +++++++ test/chaos/framework/readiness.go | 45 +++++++++++++++++++++++-------- test/chaos/helpers_test.go | 36 ++++++++++++++++++++++++- 4 files changed, 81 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 9fee15b..9e24d27 100644 --- a/Makefile +++ b/Makefile @@ -592,9 +592,10 @@ test-e2e-cov: process-manifests-crd ginkgo ## Execute e2e application test with ##@ Chaos Testing K6_IMG ?= localhost:5001/redkey-k6:dev -CHAOS_ITERATIONS ?= 5 +CHAOS_ITERATIONS ?= 10 CHAOS_SEED ?= -CHAOS_TIMEOUT ?= 60m +# With 10 iterations 60 m is not enough with TEST_PARALLEL_PROCESS=8 export GOMAXPROCS=8 +CHAOS_TIMEOUT ?= 100m CHAOS_PACKAGES ?= ./test/chaos CHAOS_TEST_OUTPUT = .local/chaos-test.json # CHAOS_KEEP_NAMESPACE_ON_FAILED=1 # if != "" skip delete namespace if failed diff --git a/test/chaos/chaos_suite_test.go b/test/chaos/chaos_suite_test.go index 53ee260..08f403d 
100644 --- a/test/chaos/chaos_suite_test.go +++ b/test/chaos/chaos_suite_test.go @@ -366,5 +366,14 @@ func collectDiagnostics(namespace string) { GinkgoWriter.Printf("Failed to get redis logs: %v\n", err) } + // Capture robin pod logs + GinkgoWriter.Printf("\n--- Robin Pod Logs (last %d lines) ---\n", diagnosticsLogTail) + robinLogs, err := framework.GetPodLogs(ctx, k8sClientset, namespace, framework.RobinPodsSelector(clusterName), diagnosticsLogTail) + if err == nil { + GinkgoWriter.Printf("%s\n", robinLogs) + } else { + GinkgoWriter.Printf("Failed to get robin logs: %v\n", err) + } + GinkgoWriter.Printf("=== END DIAGNOSTICS ===\n\n") } diff --git a/test/chaos/framework/readiness.go b/test/chaos/framework/readiness.go index 89446dc..757a4a4 100644 --- a/test/chaos/framework/readiness.go +++ b/test/chaos/framework/readiness.go @@ -13,7 +13,6 @@ import ( redkeyv1 "github.com/inditextech/redkeyoperator/api/v1" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/dynamic" @@ -23,7 +22,6 @@ import ( const ( defaultChaosReadyTimeout = 10 * time.Minute pollInterval = 2 * time.Second - maxConsecutiveErrors = 10 ) // WaitForChaosReady waits for the Redis cluster to be fully healthy. @@ -33,23 +31,27 @@ func WaitForChaosReady(ctx context.Context, dc dynamic.Interface, clientset kube timeout = defaultChaosReadyTimeout } - var consecutiveErrors int - var lastErr error + var lastReason string + err := wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) { + // When the context is cancelled or expired, avoid overwriting + // lastReason so we preserve the real diagnostic from the + // previous poll tick. All API calls below would fail with a + // context error, which is not useful for debugging. 
+ if ctx.Err() != nil { + return false, ctx.Err() + } - return wait.PollUntilContextTimeout(ctx, pollInterval, timeout, true, func(ctx context.Context) (bool, error) { // 1. Check CR status cluster, err := GetRedkeyCluster(ctx, dc, namespace, clusterName) if err != nil { - consecutiveErrors++ - lastErr = err - if !errors.IsNotFound(err) && consecutiveErrors > maxConsecutiveErrors { - return false, fmt.Errorf("persistent error getting cluster (after %d attempts): %w", consecutiveErrors, lastErr) + if ctx.Err() == nil { + lastReason = fmt.Sprintf("error getting cluster: %v", err) } return false, nil } - consecutiveErrors = 0 if cluster.Status.Status != redkeyv1.StatusReady { + lastReason = fmt.Sprintf("CR status is %q (want Ready)", cluster.Status.Status) return false, nil } @@ -58,36 +60,57 @@ func WaitForChaosReady(ctx context.Context, dc dynamic.Interface, clientset kube LabelSelector: RedisPodsSelector(clusterName), }) if err != nil { + if ctx.Err() == nil { + lastReason = fmt.Sprintf("error listing pods: %v", err) + } return false, nil } if len(pods.Items) == 0 { + lastReason = "pod count is 0" return false, nil } // 3. Verify the pod count matches what the spec expects. // This prevents a false-positive Ready when the operator hasn't yet // processed a spec.primaries change (race between CR update and reconcile). - if len(pods.Items) != cluster.Spec.NodesNeeded() { + expected := cluster.Spec.NodesNeeded() + if len(pods.Items) != expected { + lastReason = fmt.Sprintf("pod count %d != expected %d (spec.primaries=%d)", len(pods.Items), expected, cluster.Spec.Primaries) return false, nil } // 4. 
For each running pod, verify cluster health for _, pod := range pods.Items { if pod.Status.Phase != corev1.PodRunning { + lastReason = fmt.Sprintf("pod %s phase is %s (want Running)", pod.Name, pod.Status.Phase) + return false, nil + } + + if ctx.Err() != nil { return false, nil } if !clusterCheckPasses(ctx, namespace, pod.Name) { + if ctx.Err() == nil { + lastReason = fmt.Sprintf("redis-cli --cluster check failed on pod %s", pod.Name) + } return false, nil } if clusterNodesHasFailure(ctx, namespace, pod.Name) { + if ctx.Err() == nil { + lastReason = fmt.Sprintf("cluster nodes failure detected on pod %s", pod.Name) + } return false, nil } } return true, nil }) + if err != nil && lastReason != "" { + return fmt.Errorf("WaitForChaosReady(%s/%s): last check: %s: %w", namespace, clusterName, lastReason, err) + } + return err } // clusterCheckPasses runs redis-cli --cluster check and returns true if it succeeds. diff --git a/test/chaos/helpers_test.go b/test/chaos/helpers_test.go index c5d9827..5d2778c 100644 --- a/test/chaos/helpers_test.go +++ b/test/chaos/helpers_test.go @@ -59,11 +59,14 @@ func runScalingChaos(rng *rand.Rand, namespace, clusterName string, purgeKeysOnR By(fmt.Sprintf("executing chaos loop (%d iterations)", chaosIterations)) + currentPrimaries := int32(defaultPrimaries) + for i := 1; i <= chaosIterations; i++ { GinkgoWriter.Printf("=== Chaos iteration %d/%d ===\n", i, chaosIterations) By(fmt.Sprintf("iteration %d/%d: scaling cluster up", i, chaosIterations)) newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + GinkgoWriter.Printf("Scaling up: %d -> %d primaries\n", currentPrimaries, newSize) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to scale cluster up to %d", i, chaosIterations, newSize)) @@ -86,13 +89,28 @@ func runScalingChaos(rng *rand.Rand, namespace, clusterName string, purgeKeysOnR Expect(framework.WaitForChaosReady(ctx, 
dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), fmt.Sprintf("iteration %d/%d: cluster did not recover after pod deletion", i, chaosIterations)) + // Verify the cluster spec matches what we scaled to. + cluster, err := framework.GetRedkeyCluster(ctx, dynamicClient, namespace, clusterName) + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to get cluster after scale-up recovery", i, chaosIterations)) + Expect(cluster.Spec.Primaries).To(Equal(newSize), + fmt.Sprintf("iteration %d/%d: expected spec.primaries=%d after scale-up, got %d", i, chaosIterations, newSize, cluster.Spec.Primaries)) + currentPrimaries = newSize + By(fmt.Sprintf("iteration %d/%d: scaling cluster down", i, chaosIterations)) downSize := int32(minPrimaries - rng.Intn(3)) + GinkgoWriter.Printf("Scaling down: %d -> %d primaries\n", currentPrimaries, downSize) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, downSize)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to scale cluster down to %d", i, chaosIterations, downSize)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), fmt.Sprintf("iteration %d/%d: cluster did not become ready after scaling down", i, chaosIterations)) + + // Verify the cluster spec matches what we scaled to. 
+ cluster, err = framework.GetRedkeyCluster(ctx, dynamicClient, namespace, clusterName) + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to get cluster after scale-down recovery", i, chaosIterations)) + Expect(cluster.Spec.Primaries).To(Equal(downSize), + fmt.Sprintf("iteration %d/%d: expected spec.primaries=%d after scale-down, got %d", i, chaosIterations, downSize, cluster.Spec.Primaries)) + currentPrimaries = downSize } By("stopping k6 load") @@ -188,8 +206,12 @@ func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { By(fmt.Sprintf("executing full chaos (%d iterations)", chaosIterations)) + currentPrimaries := int32(defaultPrimaries) + var scaled bool + for i := 1; i <= chaosIterations; i++ { GinkgoWriter.Printf("=== Full chaos iteration %d/%d ===\n", i, chaosIterations) + scaled = false // Each action is independently chosen so multiple (or all) can fire // in the same iteration, accumulating failures before recovery. @@ -214,17 +236,29 @@ func runFullChaos(rng *rand.Rand, namespace, clusterName string) string { GinkgoWriter.Printf("Deleted pods: %v\n", deleted) } + var newSize int32 if rng.Intn(2) == 0 { By(fmt.Sprintf("iteration %d/%d: scaling cluster", i, chaosIterations)) - newSize := int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + newSize = int32(rng.Intn(maxPrimaries-minPrimaries+1) + minPrimaries) + GinkgoWriter.Printf("Scaling: %d -> %d primaries\n", currentPrimaries, newSize) Expect(framework.ScaleCluster(ctx, dynamicClient, namespace, clusterName, newSize)).To(Succeed(), fmt.Sprintf("iteration %d/%d: failed to scale cluster to %d", i, chaosIterations, newSize)) + scaled = true } By(fmt.Sprintf("iteration %d/%d: waiting for recovery", i, chaosIterations)) Expect(framework.WaitForChaosReady(ctx, dynamicClient, k8sClientset, namespace, clusterName, chaosReadyTimeout)).To(Succeed(), fmt.Sprintf("iteration %d/%d: cluster did not recover after chaos actions", i, chaosIterations)) + // Verify the 
cluster spec matches what we expect after recovery. + if scaled { + cluster, err := framework.GetRedkeyCluster(ctx, dynamicClient, namespace, clusterName) + Expect(err).NotTo(HaveOccurred(), fmt.Sprintf("iteration %d/%d: failed to get cluster after recovery", i, chaosIterations)) + Expect(cluster.Spec.Primaries).To(Equal(newSize), + fmt.Sprintf("iteration %d/%d: expected spec.primaries=%d after scaling, got %d", i, chaosIterations, newSize, cluster.Spec.Primaries)) + currentPrimaries = newSize + } + // Rate limit between chaos iterations time.Sleep(chaosIterationDelay) } From 6197c878bd2e7c157822d76e48b2fca2c41fb59b Mon Sep 17 00:00:00 2001 From: Daniel Dorado Date: Sat, 28 Mar 2026 22:50:02 +0100 Subject: [PATCH 20/20] test: improve chaos test --- Makefile | 4 +- .../redkey.inditex.dev_redkeyclusters.yaml | 2 + docs/developer-guide/development-guide.md | 59 ++++++++++++++++++ docs/proposals/e2e-chaos-test-design.md | 62 ++++++++++++++----- scripts/finalizers-clean.sh | 4 ++ scripts/get-robin-status.sh | 4 ++ scripts/remove-test-namespaces.sh | 4 ++ scripts/report-test.py | 4 ++ test/chaos/k6scripts/test-300k.js | 4 ++ test/chaos/suite_test.go | 2 +- 10 files changed, 130 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index 9e24d27..aeff2ac 100644 --- a/Makefile +++ b/Makefile @@ -559,8 +559,8 @@ ginkgo: $(GO) install github.com/onsi/ginkgo/v2/ginkgo -TEST_PARALLEL_PROCESS ?= 4 -GOMAXPROCS ?= 4 +TEST_PARALLEL_PROCESS ?= 8 +GOMAXPROCS ?= 8 REDIS_IMAGE ?= redis:8.4.0 CHANGED_REDIS_IMAGE ?= redis:8.2.3 diff --git a/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml b/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml index 36d269a..151706d 100644 --- a/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml +++ b/config/crd/bases/redkey.inditex.dev_redkeyclusters.yaml @@ -1,3 +1,5 @@ +# SPDX-FileCopyrightText: 2025 INDUSTRIA DE DISEÑO TEXTIL S.A. (INDITEX S.A.) 
+# SPDX-License-Identifier: Apache-2.0 --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition diff --git a/docs/developer-guide/development-guide.md b/docs/developer-guide/development-guide.md index d4bf214..fc01793 100644 --- a/docs/developer-guide/development-guide.md +++ b/docs/developer-guide/development-guide.md @@ -160,6 +160,65 @@ Or run only an E2E test: make test-e2e GINKGO_EXTRA_OPTS='--focus="sets and clears custom labels"' ``` +### Chaos tests + +Chaos tests validate operator resilience under disruptive conditions: random pod +deletions, scaling, operator restarts, and topology corruption — all while the +cluster is under k6 write/read load. They live in `test/chaos/` and run via: + +```shell +make test-chaos +``` + +Run a single scenario: + +```shell +make test-chaos GINKGO_EXTRA_OPTS='--focus="survives continuous scaling"' +``` + +#### Chaos test environment variables + +The chaos suite runs multiple Ginkgo processes in parallel, each creating its +own isolated namespace with an operator, Robin, and a Redis cluster. The +following variables control test behavior and should be set in your shell or +`.envrc` before running `make test-chaos`: + +```shell +export TEST_PARALLEL_PROCESS=8 +export GOMAXPROCS=8 +export CHAOS_KEEP_NAMESPACE_ON_FAILED=true +export IMG_ROBIN=localhost:5001/redkey-robin:0.1.0 +``` + +| Variable | Default | Description | +|---------------------------------|----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `TEST_PARALLEL_PROCESS` | `8` | Number of parallel Ginkgo processes (`-procs`). Each process runs a separate test spec in its own namespace. Higher values run more specs concurrently but require more cluster resources. 
This also applies to E2E tests. | +| `GOMAXPROCS` | `8` | Go runtime parallelism. Should match `TEST_PARALLEL_PROCESS` so each Ginkgo process has a dedicated OS thread. Setting this lower than `TEST_PARALLEL_PROCESS` causes goroutine contention; setting it higher wastes CPU without benefit. | +| `CHAOS_ITERATIONS` | `10` | Number of chaos loop iterations per test spec. Each iteration performs disruptive actions (scale, delete pods, etc.) and waits for recovery. More iterations increase coverage but extend the total run time proportionally. | +| `CHAOS_TIMEOUT` | `100m` | Maximum wall-clock time Ginkgo allows for the entire chaos suite (`--timeout`). Must be large enough to accommodate `CHAOS_ITERATIONS` x recovery time x number of specs / `TEST_PARALLEL_PROCESS`. With 10 iterations and 8 parallel processes, 100 minutes is typically sufficient. | +| `CHAOS_SEED` | *(auto: Ginkgo random seed)* | Fixed random seed for reproducibility. When a chaos run fails, the seed is printed in the output so you can replay the exact sequence of random actions. | +| `CHAOS_KEEP_NAMESPACE_ON_FAILED`| *(unset)* | When set to any non-empty value, failed test namespaces are preserved instead of deleted. This allows post-mortem inspection of pods, logs, and cluster state with `kubectl`. Remember to clean up namespaces manually afterwards. | +| `IMG_ROBIN` | `ghcr.io/inditextech/redkey-robin:$(ROBIN_VERSION)` | Robin sidecar image. For local development, point this to your local registry (e.g. `localhost:5001/redkey-robin:0.1.0`). Passed to tests as `ROBIN_IMAGE` via `GINKGO_ENV`. | +| `K6_IMG` | `localhost:5001/redkey-k6:dev` | k6 load generator image (built with xk6-redis extension). Build it with `make k6-build` before running chaos tests. | + +#### Relationship between parallelism and timeouts + +`TEST_PARALLEL_PROCESS` controls how many test specs run concurrently. 
The chaos +suite has 8 specs (4 scenarios x 2 `purgeKeysOnRebalance` modes), so with +`TEST_PARALLEL_PROCESS=8` all specs run in parallel and the total wall-clock +time equals roughly the duration of the slowest single spec. + +`CHAOS_TIMEOUT` must account for the worst case: if one spec takes longer than +expected (e.g. slow recovery after a scale-down), the timeout must be generous +enough to avoid killing a spec mid-recovery. As a rule of thumb: + +- With `CHAOS_ITERATIONS=10` and `TEST_PARALLEL_PROCESS=8`: `CHAOS_TIMEOUT=100m` +- With `CHAOS_ITERATIONS=5` and `TEST_PARALLEL_PROCESS=4`: `CHAOS_TIMEOUT=60m` + +`GOMAXPROCS` should always match `TEST_PARALLEL_PROCESS`. Each Ginkgo process +creates its own Kubernetes clients with independent rate limiters (QPS=5, +Burst=10), so they don't contend on API access — but they do share CPU. + ## How to test the operator with CRC and operator-sdk locally (OLM deployment) These commands allow us to deploy with OLM the Redkey Operator in a OC cluster in local environment diff --git a/docs/proposals/e2e-chaos-test-design.md b/docs/proposals/e2e-chaos-test-design.md index b865498..5512009 100644 --- a/docs/proposals/e2e-chaos-test-design.md +++ b/docs/proposals/e2e-chaos-test-design.md @@ -1,3 +1,9 @@ + + # E2E Chaos Test Design ## Overview @@ -263,13 +269,13 @@ func WaitForChaosReady(ctx context.Context, client client.Client, namespace, clu ## 6. 
Usage Examples -| Command | Purpose | -|--------------------------------------------------|----------------------------------| -| `make test-chaos` | Run all chaos tests (sequential) | -| `make test-chaos-focus FOCUS="Continuous Chaos"` | Run single scenario | -| `make test-chaos CHAOS_DURATION=5m` | Short chaos duration | -| `make test-chaos CHAOS_SEED=12345` | Reproducible random seed | -| `make k6-build` | Build k6 image only | +| Command | Purpose | +|-------------------------------------------------------------------------------|-----------------------------| +| `make test-chaos` | Run all chaos tests | +| `make test-chaos GINKGO_EXTRA_OPTS='--focus="survives continuous scaling"'` | Run single scenario | +| `make test-chaos CHAOS_ITERATIONS=3` | Fewer iterations per spec | +| `make test-chaos CHAOS_SEED=12345` | Reproducible random seed | +| `make k6-build` | Build k6 image only | --- @@ -475,15 +481,39 @@ func verifyK6Completed(c client.Client, namespace, jobName string, timeout time. ## 12. Environment Variables -| Variable | Default | Purpose | -|------------------|--------------------------------|----------------------------------------| -| `K6_IMG` | `localhost:5001/redkey-k6:dev` | k6 container image | -| `CHAOS_TIMEOUT` | `30m` | Maximum Ginkgo test timeout | -| `CHAOS_DURATION` | `10m` | k6 load duration / chaos loop duration | -| `CHAOS_SEED` | (auto) | Random seed for reproducibility | -| `CHAOS_VUS` | `10` | k6 virtual users | -| `OPERATOR_IMAGE` | From main Makefile | Operator image for tests | -| `ROBIN_IMAGE` | From main Makefile | Robin image for tests | +### Makefile variables (set via `make` or exported in shell / `.envrc`) + +| Variable | Default | Purpose | +|--------------------------|------------------------------------------------------|----------------------------------------------------------------------------------------------------------------| +| `TEST_PARALLEL_PROCESS` | `8` | Number of parallel Ginkgo processes (`-procs`). 
Applies to both E2E and chaos tests. | +| `GOMAXPROCS` | `8` | Go runtime parallelism. Should match `TEST_PARALLEL_PROCESS`. | +| `K6_IMG` | `localhost:5001/redkey-k6:dev` | k6 container image with xk6-redis extension. | +| `CHAOS_ITERATIONS` | `10` | Number of chaos loop iterations per test spec. | +| `CHAOS_SEED` | *(auto: Ginkgo random seed)* | Fixed random seed for reproducibility. | +| `CHAOS_TIMEOUT` | `100m` | Maximum Ginkgo suite timeout (`--timeout`). Must accommodate iterations x recovery time / parallelism. | +| `CHAOS_PACKAGES` | `./test/chaos` | Go test packages for the chaos suite. | +| `IMG` | `localhost:5001/redkey-operator:$(VERSION)` | Operator image. Passed to tests as `OPERATOR_IMAGE`. | +| `IMG_ROBIN` | `ghcr.io/inditextech/redkey-robin:$(ROBIN_VERSION)` | Robin image. Passed to tests as `ROBIN_IMAGE`. | +| `REDIS_IMAGE` | `redis:8.4.0` | Redis image used for cluster pods. | +| `GINKGO_EXTRA_OPTS` | *(empty)* | Extra Ginkgo flags (e.g. `--focus="..."`, `--label-filter=...`). | + +### Test-only environment variables (read by Go code, not in Makefile defaults) + +| Variable | Default | Purpose | +|-----------------------------------|---------------------------|------------------------------------------------------------------------------------------| +| `CHAOS_KEEP_NAMESPACE_ON_FAILED` | *(unset)* | When non-empty, preserves failed test namespaces for post-mortem inspection. | +| `KUBECONFIG` | `~/.kube/config` | Path to kubeconfig file. | +| `OPERATOR_IMAGE` | `localhost:5001/redkey-operator:dev` | Operator image (set automatically by `GINKGO_ENV` from `IMG`). | +| `ROBIN_IMAGE` | `localhost:5001/redkey-robin:dev` | Robin image (set automatically by `GINKGO_ENV` from `IMG_ROBIN`). 
| + +### Recommended `.envrc` for local development + +```shell +export TEST_PARALLEL_PROCESS=8 +export GOMAXPROCS=8 +export CHAOS_KEEP_NAMESPACE_ON_FAILED=true +export IMG_ROBIN=localhost:5001/redkey-robin:0.1.0 +``` --- diff --git a/scripts/finalizers-clean.sh b/scripts/finalizers-clean.sh index f2e7ea4..f7dcd40 100755 --- a/scripts/finalizers-clean.sh +++ b/scripts/finalizers-clean.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 INDUSTRIA DE DISENO TEXTIL S.A. (INDITEX S.A.) +# +# SPDX-License-Identifier: Apache-2.0 + # clean-finalizers.sh # Find and remove finalizers from all namespaced resources in a namespace. # Useful when an operator/controller was removed and CRs are stuck terminating. diff --git a/scripts/get-robin-status.sh b/scripts/get-robin-status.sh index f54c87e..008a229 100755 --- a/scripts/get-robin-status.sh +++ b/scripts/get-robin-status.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 INDUSTRIA DE DISENO TEXTIL S.A. (INDITEX S.A.) +# +# SPDX-License-Identifier: Apache-2.0 + # Exit on errors set -o errexit set -o pipefail diff --git a/scripts/remove-test-namespaces.sh b/scripts/remove-test-namespaces.sh index c230d9b..ed7547a 100755 --- a/scripts/remove-test-namespaces.sh +++ b/scripts/remove-test-namespaces.sh @@ -1,4 +1,8 @@ #!/usr/bin/env bash +# SPDX-FileCopyrightText: 2026 INDUSTRIA DE DISENO TEXTIL S.A. (INDITEX S.A.) +# +# SPDX-License-Identifier: Apache-2.0 + # Exit on errors set -o errexit set -o pipefail diff --git a/scripts/report-test.py b/scripts/report-test.py index a57c6b5..dabfa48 100755 --- a/scripts/report-test.py +++ b/scripts/report-test.py @@ -1,4 +1,8 @@ #!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2026 INDUSTRIA DE DISENO TEXTIL S.A. (INDITEX S.A.) 
+# +# SPDX-License-Identifier: Apache-2.0 + """Convert Ginkgo JSON test results to Markdown format.""" import json diff --git a/test/chaos/k6scripts/test-300k.js b/test/chaos/k6scripts/test-300k.js index 97f318b..5d3d09d 100644 --- a/test/chaos/k6scripts/test-300k.js +++ b/test/chaos/k6scripts/test-300k.js @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: 2026 INDUSTRIA DE DISENO TEXTIL S.A. (INDITEX S.A.) +// +// SPDX-License-Identifier: Apache-2.0 + /* Test that imitates a redis-cluster that was causing rebalancing errors: - Value Size: 1Byte - 300KBytes diff --git a/test/chaos/suite_test.go b/test/chaos/suite_test.go index 75100a9..4d98703 100644 --- a/test/chaos/suite_test.go +++ b/test/chaos/suite_test.go @@ -26,7 +26,7 @@ var ( cancel context.CancelFunc chaosIterations int chaosSeed int64 - chaosReadyTimeout = 10 * time.Minute + chaosReadyTimeout = 15 * time.Minute // if it is scaling 1 to 7, 3 pods deleted, k6 load, it was bigger than 10 skipDeleteNamespace bool )