redis · ndyakov · Dec 1, 2025 · Dec 2, 2025 · Dec 4, 2025 · Dec 4, 2025
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches: [master, v9, 'v9.*']
   pull_request:
-    branches: [master, v9, v9.7, v9.8, 'ndyakov/*', 'ofekshenawa/*', 'htemelski-redis/*', 'ce/*']
+    branches: [master, v9, v9.7, v9.8, 'ndyakov/**', 'ofekshenawa/**', 'ce/**']
 
 permissions:
   contents: read

diff --git a/.github/workflows/test-e2e.yml b/.github/workflows/test-e2e.yml
@@ -0,0 +1,63 @@
+name: E2E Tests
+
+on:
+  push:
+    branches: [master, v9, 'v9.*']
+  pull_request:
+    branches: [master, v9, v9.7, v9.8, 'ndyakov/**', 'ofekshenawa/**', 'ce/**']
+
+permissions:
+  contents: read
+
+jobs:
+  test-e2e-mock:
+    name: E2E Tests (Mock Proxy)
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        go-version:
+          - "1.23.x"
+          - stable
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v6
+
+      - name: Set up Go ${{ matrix.go-version }}
+        uses: actions/setup-go@v6
+        with:
+          go-version: ${{ matrix.go-version }}
+
+      - name: Start Docker services for E2E tests
+        run: make docker.e2e.start
+
+      - name: Wait for services to be ready
+        run: |
+          echo "Waiting for Redis to be ready..."
+          timeout 30 bash -c 'until docker exec redis-standalone redis-cli ping 2>/dev/null; do sleep 1; done'
+          echo "Waiting for cae-resp-proxy to be ready..."
+          timeout 30 bash -c 'until curl -s http://localhost:18100/stats > /dev/null; do sleep 1; done'
+          echo "All services are ready!"
+
+      - name: Run E2E tests with mock proxy
+        env:
+          E2E_SCENARIO_TESTS: "true"
+        run: |
+          go test -v ./maintnotifications/e2e/ -timeout 30m -race
+        continue-on-error: false
+
+      - name: Show Docker logs on failure
+        if: failure()  # must run before teardown: `docker compose down` removes the containers that `docker logs` reads from
+        run: |
+          echo "=== Redis logs ==="
+          docker logs redis-standalone 2>&1 | tail -100
+          echo "=== cae-resp-proxy logs ==="
+          docker logs cae-resp-proxy 2>&1 | tail -100
+          echo "=== proxy-fault-injector logs ==="
+          docker logs proxy-fault-injector 2>&1 | tail -100
+
+      - name: Stop Docker services
+        if: always()
+        run: make docker.e2e.stop
+
diff --git a/Makefile b/Makefile
@@ -14,6 +14,17 @@ docker.start:
 docker.stop:
 	docker compose --profile all down
 
+# docker.e2e.start: bring up the compose services in the "e2e" profile (detached, quiet pulls),
+# then pause for a fixed 3 seconds. No health check is performed by this target.
+docker.e2e.start:
+	@echo "Starting Redis and cae-resp-proxy for E2E tests..."
+	docker compose --profile e2e up -d --quiet-pull
+	@echo "Waiting for services to be ready..." && sleep 3 && echo "Services ready!"
+
+docker.e2e.stop:  # tears down the compose services in the "e2e" profile
+	@echo "Stopping E2E services..."
+	docker compose --profile e2e down
+
 test:
 	$(MAKE) docker.start
 	@if [ -z "$(REDIS_VERSION)" ]; then \
@@ -66,7 +77,31 @@ bench:
 	export REDIS_VERSION=$(REDIS_VERSION) && \
 	go test ./... -test.run=NONE -test.bench=. -test.benchmem -skip Example
 
-.PHONY: all test test.ci test.ci.skip-vectorsets bench fmt
+test.e2e:  # start the e2e services, run the ./maintnotifications/e2e/ package, stop the services (also when tests fail)
+	@echo "Running E2E tests with auto-start proxy..."
+	$(MAKE) docker.e2e.start
+	@echo "Running tests..."
+	@if ! E2E_SCENARIO_TESTS=true go test -v ./maintnotifications/e2e/ -timeout 30m; then $(MAKE) docker.e2e.stop && exit 1; fi
+	$(MAKE) docker.e2e.stop
+	@echo "E2E tests completed!"
+
+test.e2e.docker:  # start the e2e services, run only the tests matched by -run (10m timeout), stop the services (also on failure)
+	@echo "Running Docker-compatible E2E tests..."
+	$(MAKE) docker.e2e.start
+	@echo "Running unified injector tests..."
+	@if ! E2E_SCENARIO_TESTS=true go test -v -run "TestUnifiedInjector|TestCreateTestFaultInjectorLogic|TestFaultInjectorClientCreation" ./maintnotifications/e2e/ -timeout 10m; then $(MAKE) docker.e2e.stop && exit 1; fi
+	$(MAKE) docker.e2e.stop
+	@echo "Docker E2E tests completed!"
+
+# test.e2e.logic: run only the two logic tests selected by -run. This target starts no Docker
+# services; the endpoints-config path and fault-injector URL are set to fixed local values.
+test.e2e.logic:
+	@echo "Running E2E logic tests (no proxy required)..."
+	@export E2E_SCENARIO_TESTS=true REDIS_ENDPOINTS_CONFIG_PATH=/tmp/test_endpoints_verify.json FAULT_INJECTION_API_URL=http://localhost:8080 && \
+		go test -v -run "TestCreateTestFaultInjectorLogic|TestFaultInjectorClientCreation" ./maintnotifications/e2e/
+	@echo "Logic tests completed!"
+
+.PHONY: all test test.ci test.ci.skip-vectorsets bench fmt test.e2e test.e2e.docker test.e2e.logic docker.e2e.start docker.e2e.stop # command targets, never files
 
 build:
 	export RE_CLUSTER=$(RE_CLUSTER) && \

diff --git a/commands_test.go b/commands_test.go
@@ -8922,27 +8922,37 @@ var _ = Describe("Commands", func() {
 			const key = "latency-monitor-threshold"
 
 			old := client.ConfigGet(ctx, key).Val()
-			client.ConfigSet(ctx, key, "1")
+			// Use a higher threshold (100ms) to avoid capturing normal operations
+			// that could cause flakiness due to timing variations
+			client.ConfigSet(ctx, key, "100")
 			defer client.ConfigSet(ctx, key, old[key])
 
 			result, err := client.Latency(ctx).Result()
 			Expect(err).NotTo(HaveOccurred())
 			Expect(len(result)).Should(Equal(0))
 
-			err = client.Do(ctx, "DEBUG", "SLEEP", 0.01).Err()
+			// Use a longer sleep (150ms) to ensure it exceeds the 100ms threshold
+			err = client.Do(ctx, "DEBUG", "SLEEP", 0.15).Err()
 			Expect(err).NotTo(HaveOccurred())
 
 			result, err = client.Latency(ctx).Result()
 			Expect(err).NotTo(HaveOccurred())
-			Expect(len(result)).Should(Equal(1))
+			Expect(len(result)).Should(BeNumerically(">=", 1))
 
 			// reset latency by event name
-			err = client.LatencyReset(ctx, result[0].Name).Err()
+			eventName := result[0].Name
+			err = client.LatencyReset(ctx, eventName).Err()
 			Expect(err).NotTo(HaveOccurred())
 
+			// Verify the specific event was reset (not that all events are gone)
+			// This avoids flakiness from other operations triggering latency events
 			result, err = client.Latency(ctx).Result()
 			Expect(err).NotTo(HaveOccurred())
-			Expect(len(result)).Should(Equal(0))
+			for _, event := range result {
+				if event.Name == eventName {
+					Fail("Event " + eventName + " should have been reset")
+				}
+			}
 		})
 	})
 })

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -21,6 +21,7 @@ services:
       - sentinel
       - all-stack
       - all
+      - e2e
 
   osscluster:
     image: ${CLIENT_LIBS_TEST_IMAGE:-redislabs/client-libs-test:8.4.0}
@@ -39,6 +40,43 @@ services:
       - all-stack
       - all
 
+  cae-resp-proxy:  # RESP proxy container for the e2e profile; configured below to target the `redis` service on 6379
+    image: redislabs/client-resp-proxy:latest  # NOTE(review): floating :latest tag — consider pinning a version so CI runs stay reproducible
+    container_name: cae-resp-proxy
+    environment:
+      - TARGET_HOST=redis  # presumably the upstream the proxy forwards to — confirm against the image's docs
+      - TARGET_PORT=6379
+      - LISTEN_PORT=17000,17001,17002,17003  # 4 proxy nodes: initially show 3, swap in 4th during SMIGRATED
+      - LISTEN_HOST=0.0.0.0
+      - API_PORT=3000  # container-side HTTP API port; published on host port 18100 below
+      - DEFAULT_INTERCEPTORS=cluster,hitless
+    ports:
+      - "17000:17000"  # Proxy node 1 (host:container)
+      - "17001:17001"  # Proxy node 2 (host:container)
+      - "17002:17002"  # Proxy node 3 (host:container)
+      - "17003:17003"  # Proxy node 4 (host:container) - hidden initially, swapped in during SMIGRATED
+      - "18100:3000"  # HTTP API port (host:container)
+    depends_on:  # NOTE(review): short-form depends_on orders container start only; it does not wait for redis to accept connections
+      - redis
+    profiles:
+      - e2e
+      - all
+
+  proxy-fault-injector:  # fault-injector server image built from this repository (see dockerfile path below)
+    build:
+      context: .  # repository root is the build context
+      dockerfile: maintnotifications/e2e/cmd/proxy-fi-server/Dockerfile
+    container_name: proxy-fault-injector
+    ports:
+      - "15000:5000"  # Fault injector API port (host:container)
+    depends_on:  # NOTE(review): start-order only; nothing here waits for the proxy's HTTP API to be up
+      - cae-resp-proxy
+    environment:
+      - PROXY_API_URL=http://cae-resp-proxy:3000  # same port as API_PORT set on the cae-resp-proxy service
+    profiles:
+      - e2e
+      - all
+
   sentinel-cluster:
     image: ${CLIENT_LIBS_TEST_IMAGE:-redislabs/client-libs-test:8.4.0}
     platform: linux/amd64

diff --git a/internal/maintnotifications/logs/log_messages.go b/internal/maintnotifications/logs/log_messages.go
@@ -121,6 +121,11 @@ const (
 	UnrelaxedTimeoutMessage                       = "clearing relaxed timeout"
 	ManagerNotInitializedMessage                  = "manager not initialized"
 	FailedToMarkForHandoffMessage                 = "failed to mark connection for handoff"
+	InvalidSeqIDInSMigratingNotificationMessage   = "invalid SeqID in SMIGRATING notification"
+	InvalidSeqIDInSMigratedNotificationMessage    = "invalid SeqID in SMIGRATED notification"
+	InvalidHostPortInSMigratedNotificationMessage = "invalid host:port in SMIGRATED notification"
+	SlotMigratingMessage                          = "slots migrating, applying relaxed timeout"
+	SlotMigratedMessage                           = "slots migrated, triggering cluster state reload"
 
 	// ========================================
 	// used in pool/conn
@@ -633,3 +638,43 @@ func ExtractDataFromLogMessage(logMessage string) map[string]interface{} {
 	// If JSON parsing fails, return empty map
 	return result
 }
+
+// Cluster notification functions
+
+// InvalidSeqIDInSMigratingNotification formats "<InvalidSeqIDInSMigratingNotificationMessage>: <seqID>" and returns it via appendJSONIfDebug.
+func InvalidSeqIDInSMigratingNotification(seqID interface{}) string {
+	text := fmt.Sprintf("%s: %v", InvalidSeqIDInSMigratingNotificationMessage, seqID)
+	return appendJSONIfDebug(text, map[string]interface{}{"seqID": fmt.Sprintf("%v", seqID)})
+}
+
+// InvalidSeqIDInSMigratedNotification formats "<InvalidSeqIDInSMigratedNotificationMessage>: <seqID>" and returns it via appendJSONIfDebug.
+func InvalidSeqIDInSMigratedNotification(seqID interface{}) string {
+	text := fmt.Sprintf("%s: %v", InvalidSeqIDInSMigratedNotificationMessage, seqID)
+	fields := map[string]interface{}{"seqID": fmt.Sprintf("%v", seqID)}
+	return appendJSONIfDebug(text, fields)
+}
+
+// InvalidHostPortInSMigratedNotification formats "<InvalidHostPortInSMigratedNotificationMessage>: <hostPort>" and returns it via appendJSONIfDebug.
+func InvalidHostPortInSMigratedNotification(hostPort interface{}) string {
+	text := fmt.Sprintf("%s: %v", InvalidHostPortInSMigratedNotificationMessage, hostPort)
+	fields := map[string]interface{}{"hostPort": fmt.Sprintf("%v", hostPort)}
+	return appendJSONIfDebug(text, fields)
+}
+
+// SlotMigrating formats "conn[<connID>] <SlotMigratingMessage> seqID=<seqID> slots=<slotRanges>"
+// and passes it to appendJSONIfDebug with connID, seqID and slotRanges as structured fields.
+func SlotMigrating(connID uint64, seqID int64, slotRanges []string) string {
+	const layout = "conn[%d] %s seqID=%d slots=%v"
+	text := fmt.Sprintf(layout, connID, SlotMigratingMessage, seqID, slotRanges)
+	fields := map[string]interface{}{"connID": connID, "seqID": seqID, "slotRanges": slotRanges}
+	return appendJSONIfDebug(text, fields)
+}
+
+// SlotMigrated formats "<SlotMigratedMessage> seqID=<seqID> host:port=<hostPort> slots=<slotRanges>"
+// and passes it to appendJSONIfDebug with seqID, hostPort and slotRanges as structured fields.
+func SlotMigrated(seqID int64, hostPort string, slotRanges []string) string {
+	const layout = "%s seqID=%d host:port=%s slots=%v"
+	text := fmt.Sprintf(layout, SlotMigratedMessage, seqID, hostPort, slotRanges)
+	fields := map[string]interface{}{"seqID": seqID, "hostPort": hostPort, "slotRanges": slotRanges}
+	return appendJSONIfDebug(text, fields)
+}
diff --git a/maintnotifications/README.md b/maintnotifications/README.md
@@ -2,8 +2,14 @@
 
 Seamless Redis connection handoffs during cluster maintenance operations without dropping connections.
 
-## ⚠️ **Important Note**
-**Maintenance notifications are currently supported only in standalone Redis clients.** Cluster clients (ClusterClient, FailoverClient, etc.) do not yet support this functionality.
+## Cluster Support
+
+**Cluster notifications are now supported for ClusterClient!**
+
+- **SMIGRATING**: `["SMIGRATING", SeqID, slot/range, ...]` - Relaxes timeouts when slots are being migrated
+- **SMIGRATED**: `["SMIGRATED", SeqID, host:port, slot/range, ...]` - Reloads cluster state when slot migration completes
+
+**Note:** Other maintenance notifications (MOVING, MIGRATING, MIGRATED, FAILING_OVER, FAILED_OVER) are supported only in standalone Redis clients. Cluster clients support SMIGRATING and SMIGRATED for cluster-specific slot migration handling.
 
 ## Quick Start
 

diff --git a/maintnotifications/e2e/README_SCENARIOS.md b/maintnotifications/e2e/README_SCENARIOS.md
@@ -7,17 +7,36 @@ This directory contains comprehensive end-to-end test scenarios for Redis push n
 
 ## Introduction
 
-To run those tests you would need a fault injector service, please review the client and feel free to implement your
-fault injector of choice. Those tests are tailored for Redis Enterprise, but can be adapted to other Redis distributions where
-a fault injector is available.
+These tests support two modes:
 
-Once you have fault injector service up and running, you can execute the tests by running the `run-e2e-tests.sh` script.
-there are three environment variables that need to be set before running the tests:
+### 1. Mock Proxy Mode (Default)
+Uses a local Docker-based proxy ([cae-resp-proxy](https://github.com/redis-developer/cae-resp-proxy)) to simulate Redis Enterprise behavior. This mode:
+- Runs entirely locally without external dependencies
+- Provides fast feedback for development
+- Simulates cluster topology changes
+- Supports SMIGRATING and SMIGRATED notifications
 
+To run in mock proxy mode:
+```bash
+make test.e2e
+```
+
+### 2. Real Fault Injector Mode
+Uses a real Redis Enterprise fault injector service for comprehensive testing. This mode:
+- Tests against actual Redis Enterprise clusters
+- Validates real-world scenarios
+- Requires external fault injector setup
+
+To run with a real fault injector, set these environment variables:
 - `REDIS_ENDPOINTS_CONFIG_PATH`: Path to Redis endpoints configuration
 - `FAULT_INJECTION_API_URL`: URL of the fault injector server
 - `E2E_SCENARIO_TESTS`: Set to `true` to enable scenario tests
 
+Then run:
+```bash
+./scripts/run-e2e-tests.sh
+```
+
 ## Test Scenarios Overview
 
 ### 1. Basic Push Notifications (`scenario_push_notifications_test.go`)
@@ -44,7 +63,28 @@ there are three environment variables that need to be set before running the tes
   - Notification delivery consistency
   - Handoff behavior per endpoint type
 
-### 3. Database Management Scenario (`scenario_database_management_test.go`)
+### 3. Unified Injector Scenarios (`scenario_unified_injector_test.go`)
+**Mock proxy-based notification testing**
+- **Purpose**: Test SMIGRATING and SMIGRATED notifications with simulated cluster topology changes
+- **Features Tested**:
+  - SMIGRATING notifications (slot migration in progress)
+  - SMIGRATED notifications (slot migration completed)
+  - Cluster topology changes (node swap simulation)
+  - Complex multi-step migration scenarios
+- **Configuration**: Uses local Docker proxy (cae-resp-proxy) with 4 nodes
+- **Duration**: ~10 seconds
+- **Key Validations**:
+  - Notification delivery and parsing
+  - Cluster state reload callbacks
+  - Client resilience during migrations
+  - Topology change handling
+- **Topology Simulation**:
+  - Starts with 4 proxy nodes (17000-17003)
+  - Initially exposes 3 nodes in CLUSTER SLOTS (17000, 17001, 17002)
+  - On SMIGRATED, swaps the third exposed node (17002) for the initially hidden fourth node (17003), simulating a node replacement
+  - Verifies client continues to function after topology change
+
+### 4. Database Management Scenario (`scenario_database_management_test.go`)
 **Dynamic database creation and deletion**
 - **Purpose**: Test database lifecycle management via fault injector
 - **Features Tested**: CREATE_DATABASE, DELETE_DATABASE endpoints