From 9c757412a32ba05abd85e1f5f827304ecefe0eff Mon Sep 17 00:00:00 2001 From: Aru Raghuwanshi Date: Wed, 22 Apr 2026 23:11:40 -0700 Subject: [PATCH 1/2] fix: deterministic rollout order for multiple nodes per NodeType Sort node specs by map key within each NodeType in getNodeSpecsByOrder so rollingDeploy does not flap on Go map iteration. Add unit tests (prove non-determinism pre-fix) and an E2E check for two historical tiers (historicalstier1/2). --- controllers/druid/ordering.go | 18 ++- controllers/druid/ordering_test.go | 142 +++++++++++++++-- controllers/druid/testdata/ordering.yaml | 13 +- e2e/configs/druid-rolling-deploy-cr.yaml | 188 +++++++++++++++++++++++ e2e/e2e.sh | 1 + e2e/test-rolling-deploy-ordering.sh | 160 +++++++++++++++++++ 6 files changed, 507 insertions(+), 15 deletions(-) create mode 100644 e2e/configs/druid-rolling-deploy-cr.yaml create mode 100755 e2e/test-rolling-deploy-ordering.sh diff --git a/controllers/druid/ordering.go b/controllers/druid/ordering.go index b2881245..2fae966b 100644 --- a/controllers/druid/ordering.go +++ b/controllers/druid/ordering.go @@ -18,7 +18,11 @@ under the License. */ package druid -import "github.com/apache/druid-operator/apis/druid/v1alpha1" +import ( + "sort" + + "github.com/apache/druid-operator/apis/druid/v1alpha1" +) var ( druidServicesOrder = []string{historical, overlord, middleManager, indexer, broker, coordinator, router} @@ -29,8 +33,10 @@ type ServiceGroup struct { spec v1alpha1.DruidNodeSpec } -// getNodeSpecsByOrder returns all NodeSpecs f a given Druid object. -// Recommended order is described at http://druid.io/docs/latest/operations/rolling-updates.html +// getNodeSpecsByOrder returns all NodeSpecs of a given Druid object in the +// recommended rolling-update order (see http://druid.io/docs/latest/operations/rolling-updates.html). +// Specs sharing a NodeType are sorted by map key so rollingDeploy stays +// deterministic across reconciles instead of flapping with map iteration. func getNodeSpecsByOrder(m *v1alpha1.Druid) []*ServiceGroup { scaledServiceSpecsByNodeType := map[string][]*ServiceGroup{} @@ -46,7 +52,11 @@ func getNodeSpecsByOrder(m *v1alpha1.Druid) []*ServiceGroup { allScaledServiceSpecs := make([]*ServiceGroup, 0, len(m.Spec.Nodes)) for _, t := range druidServicesOrder { - allScaledServiceSpecs = append(allScaledServiceSpecs, scaledServiceSpecsByNodeType[t]...) + specs := scaledServiceSpecsByNodeType[t] + sort.Slice(specs, func(i, j int) bool { + return specs[i].key < specs[j].key + }) + allScaledServiceSpecs = append(allScaledServiceSpecs, specs...) } return allScaledServiceSpecs diff --git a/controllers/druid/ordering_test.go b/controllers/druid/ordering_test.go index d48212b7..9332b865 100644 --- a/controllers/druid/ordering_test.go +++ b/controllers/druid/ordering_test.go @@ -19,6 +19,9 @@ under the License. 
package druid import ( + "reflect" + "sort" + "testing" "time" druidv1alpha1 "github.com/apache/druid-operator/apis/druid/v1alpha1" @@ -56,16 +59,137 @@ var _ = Describe("Test ordering logic", func() { return err == nil }, timeout, interval).Should(BeTrue()) }) - It("Should return an ordered list of nodes", func() { + It("Should return a deterministic, lexicographically ordered list of nodes within each NodeType", func() { orderedServiceGroups := getNodeSpecsByOrder(druid) - Expect(orderedServiceGroups[0].key).Should(MatchRegexp("historicals")) - Expect(orderedServiceGroups[1].key).Should(MatchRegexp("historicals")) - Expect(orderedServiceGroups[2].key).Should(Equal("overlords")) - Expect(orderedServiceGroups[3].key).Should(Equal("middle-managers")) - Expect(orderedServiceGroups[4].key).Should(Equal("indexers")) - Expect(orderedServiceGroups[5].key).Should(Equal("brokers")) - Expect(orderedServiceGroups[6].key).Should(Equal("coordinators")) - Expect(orderedServiceGroups[7].key).Should(Equal("routers")) + // Three historical tiers (historicalstier1–3) all share NodeType + // "historical" and must come back sorted by key so rollingDeploy + // can never roll two of them out at the same time. + Expect(orderedServiceGroups[0].key).Should(Equal("historicalstier1")) + Expect(orderedServiceGroups[1].key).Should(Equal("historicalstier2")) + Expect(orderedServiceGroups[2].key).Should(Equal("historicalstier3")) + Expect(orderedServiceGroups[3].key).Should(Equal("overlords")) + Expect(orderedServiceGroups[4].key).Should(Equal("middle-managers")) + Expect(orderedServiceGroups[5].key).Should(Equal("indexers")) + Expect(orderedServiceGroups[6].key).Should(Equal("brokers")) + Expect(orderedServiceGroups[7].key).Should(Equal("coordinators")) + Expect(orderedServiceGroups[8].key).Should(Equal("routers")) }) }) }) + +// determinismCallCount is the number of times getNodeSpecsByOrder is invoked +// per test to surface map-iteration non-determinism. Without the sort step in +// getNodeSpecsByOrder, randomized map iteration over m.Spec.Nodes makes the +// intra-NodeType order flap. With many specs sharing one NodeType and many +// repeated calls, the probability of observing at least one differing order +// approaches 1, which is exactly what we want for a regression test. +const determinismCallCount = 200 + +// makeMultiHistoricalDruid returns a Druid CR with several node specs sharing +// the same "historical" NodeType, plus one spec per other NodeType. This is +// the shape that triggered the bug fixed here: multiple StatefulSets/ +// Deployments belonging to a single NodeType. +func makeMultiHistoricalDruid() *druidv1alpha1.Druid { + return &druidv1alpha1.Druid{ + Spec: druidv1alpha1.DruidSpec{ + Nodes: map[string]druidv1alpha1.DruidNodeSpec{ + "historicalstier1": {NodeType: historical}, + "historicalstier2": {NodeType: historical}, + "historicalstier3": {NodeType: historical}, + "historicalstier4": {NodeType: historical}, + "brokers": {NodeType: broker}, + "coordinators": {NodeType: coordinator}, + "overlords": {NodeType: overlord}, + "middle-managers": {NodeType: middleManager}, + "indexers": {NodeType: indexer}, + "routers": {NodeType: router}, + }, + }, + } +} + +// keysOfServiceGroups extracts the ordered list of keys from a slice of +// *ServiceGroup so test assertions can compare orderings as plain strings. 
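+// For example (with illustrative keys), it maps
+// []*ServiceGroup{{key: "a"}, {key: "b"}} to []string{"a", "b"}.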
+func keysOfServiceGroups(specs []*ServiceGroup) []string { + out := make([]string, len(specs)) + for i, s := range specs { + out[i] = s.key + } + return out +} + +// TestGetNodeSpecsByOrder_DeterministicAcrossCalls invokes getNodeSpecsByOrder +// many times on the same Druid CR and asserts every call returns the exact +// same ordering. Before the fix, Go's randomized map iteration over +// m.Spec.Nodes causes the order of specs sharing a NodeType (e.g. the four +// "historical" entries) to flap between calls, so this test fails. With the +// sort.Slice fix, all calls return the same ordering and the test passes. +func TestGetNodeSpecsByOrder_DeterministicAcrossCalls(t *testing.T) { + druid := makeMultiHistoricalDruid() + + first := keysOfServiceGroups(getNodeSpecsByOrder(druid)) + + for i := 1; i < determinismCallCount; i++ { + got := keysOfServiceGroups(getNodeSpecsByOrder(druid)) + if !reflect.DeepEqual(first, got) { + t.Fatalf( + "getNodeSpecsByOrder is non-deterministic: call 0 returned %v, call %d returned %v", + first, i, got, + ) + } + } +} + +// TestGetNodeSpecsByOrder_LexicographicWithinNodeType asserts the contractual +// intra-NodeType ordering: ascending by spec key. This pins down the exact +// behavior the operator relies on for sequential rolling deploy. +func TestGetNodeSpecsByOrder_LexicographicWithinNodeType(t *testing.T) { + druid := makeMultiHistoricalDruid() + + got := keysOfServiceGroups(getNodeSpecsByOrder(druid)) + + wantHistoricals := []string{"historicalstier1", "historicalstier2", "historicalstier3", "historicalstier4"} + gotHistoricals := got[:len(wantHistoricals)] + + if !sort.StringsAreSorted(gotHistoricals) { + t.Errorf("historical specs must be sorted ascending by key, got %v", gotHistoricals) + } + if !reflect.DeepEqual(gotHistoricals, wantHistoricals) { + t.Errorf("historical block ordering mismatch: want %v, got %v", wantHistoricals, gotHistoricals) + } + + want := []string{ + "historicalstier1", "historicalstier2", "historicalstier3", "historicalstier4", + "overlords", + "middle-managers", + "indexers", + "brokers", + "coordinators", + "routers", + } + if !reflect.DeepEqual(got, want) { + t.Errorf("full ordering mismatch:\nwant %v\n got %v", want, got) + } +} + +// TestGetNodeSpecsByOrder_NodeTypeOrderPreserved guards against a regression +// in the cross-NodeType ordering defined by druidServicesOrder. 
+func TestGetNodeSpecsByOrder_NodeTypeOrderPreserved(t *testing.T) { + druid := &druidv1alpha1.Druid{ + Spec: druidv1alpha1.DruidSpec{ + Nodes: map[string]druidv1alpha1.DruidNodeSpec{ + "routers": {NodeType: router}, + "coordinators": {NodeType: coordinator}, + "brokers": {NodeType: broker}, + "historicals": {NodeType: historical}, + "overlords": {NodeType: overlord}, + }, + }, + } + + got := keysOfServiceGroups(getNodeSpecsByOrder(druid)) + want := []string{"historicals", "overlords", "brokers", "coordinators", "routers"} + if !reflect.DeepEqual(got, want) { + t.Errorf("NodeType ordering broken: want %v, got %v", want, got) + } +} diff --git a/controllers/druid/testdata/ordering.yaml b/controllers/druid/testdata/ordering.yaml index 00ce07d6..64614142 100644 --- a/controllers/druid/testdata/ordering.yaml +++ b/controllers/druid/testdata/ordering.yaml @@ -75,7 +75,16 @@ spec: - /bin/sh echo hello containerName: node-level image: hello-world - historicals2: + historicalstier2: + nodeType: "historical" + druid.port: 8080 + nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical" + replicas: 1 + runtime.properties: |- + druid.service=druid/historical + druid.segmentCache.locations=[{\"path\":\"/druid/data/segments\",\"maxSize\":10737418240}] + druid.server.maxSize=10737418240 + historicalstier3: nodeType: "historical" druid.port: 8080 nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical" @@ -119,7 +128,7 @@ spec: druid.realtime.cache.populateCache=true druid.indexer.runner.javaOptsArray=["-server","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager","--add-exports=java.base/jdk.internal.ref=ALL-UNNAMED","--add-exports=java.base/jdk.internal.misc=ALL-UNNAMED","--add-opens=java.base/java.lang=ALL-UNNAMED","--add-opens=java.base/java.io=ALL-UNNAMED","--add-opens=java.base/java.nio=ALL-UNNAMED","--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED","--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"] druid.indexer.task.restoreTasksOnRestart=true - historicals: + historicalstier1: nodeType: "historical" druid.port: 8080 nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical" diff --git a/e2e/configs/druid-rolling-deploy-cr.yaml b/e2e/configs/druid-rolling-deploy-cr.yaml new file mode 100644 index 00000000..07c96f99 --- /dev/null +++ b/e2e/configs/druid-rolling-deploy-cr.yaml @@ -0,0 +1,188 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# E2E fixture for the deterministic rolling deploy ordering test. +# Defines two StatefulSets that share the "historical" NodeType +# (historicalstier1 and historicalstier2) so the test can verify that +# the operator rolls them out one-at-a-time rather than concurrently. 
+apiVersion: "druid.apache.org/v1alpha1" +kind: "Druid" +metadata: + name: rolling-deploy-cluster +spec: + image: apache/druid:25.0.0 + startScript: /druid.sh + rollingDeploy: true + podLabels: + environment: stage + podAnnotations: + dummy: k8s_extn_needs_atleast_one_annotation + readinessProbe: + httpGet: + path: /status/health + port: 8088 + securityContext: + fsGroup: 0 + runAsUser: 0 + runAsGroup: 0 + containerSecurityContext: + privileged: true + services: + - spec: + type: ClusterIP + clusterIP: None + commonConfigMountPath: "/opt/druid/conf/druid/cluster/_common" + jvm.options: |- + -server + -XX:MaxDirectMemorySize=10240g + -Duser.timezone=UTC + -Dfile.encoding=UTF-8 + -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager + common.runtime.properties: | + druid.zk.service.enabled=false + druid.discovery.type=k8s + druid.discovery.k8s.clusterIdentifier=druid-it + druid.serverview.type=http + druid.coordinator.loadqueuepeon.type=http + druid.indexer.runner.type=httpRemote + druid.metadata.storage.type=derby + druid.metadata.storage.connector.connectURI=jdbc:derby://localhost:1527/var/druid/metadata.db;create=true + druid.metadata.storage.connector.host=localhost + druid.metadata.storage.connector.port=1527 + druid.metadata.storage.connector.createTables=true + druid.storage.type=local + druid.storage.storageDirectory=/druid/deepstorage + druid.extensions.loadList=["druid-kubernetes-extensions"] + druid.selectors.indexing.serviceName=druid/overlord + druid.selectors.coordinator.serviceName=druid/coordinator + druid.lookup.enableLookupSyncOnStartup=false + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + nodes: + coordinators: + nodeType: "coordinator" + druid.port: 8088 + services: + - spec: + type: ClusterIP + clusterIP: None + nodeConfigMountPath: "/opt/druid/conf/druid/cluster/master/coordinator-overlord" + replicas: 1 + runtime.properties: | + druid.service=druid/coordinator + druid.coordinator.startDelay=PT30S + druid.coordinator.period=PT30S + druid.coordinator.asOverlord.enabled=true + druid.coordinator.asOverlord.overlordService=druid/overlord + druid.indexer.queue.startDelay=PT30S + extra.jvm.options: |- + -Xmx512m + -Xms512m + + brokers: + nodeType: "broker" + druid.port: 8088 + services: + - spec: + type: ClusterIP + clusterIP: None + nodeConfigMountPath: "/opt/druid/conf/druid/cluster/query/broker" + replicas: 1 + runtime.properties: | + druid.service=druid/broker + druid.broker.http.numConnections=5 + druid.server.http.numThreads=40 + druid.processing.buffer.sizeBytes=25000000 + druid.sql.enable=true + extra.jvm.options: |- + -Xmx512m + -Xms512m + + # Two historical specs sharing the SAME NodeType ("historical"). + # This is the case where the previous map-iteration ordering would + # flap between reconciles. With the deterministic ordering fix, + # historicalstier1 rolls out fully before historicalstier2. 
+ historicalstier1: + nodeType: "historical" + druid.port: 8088 + services: + - spec: + type: ClusterIP + clusterIP: None + nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical" + replicas: 1 + runtime.properties: | + druid.service=druid/historical + druid.processing.buffer.sizeBytes=25000000 + druid.processing.numThreads=2 + druid.segmentCache.locations=[{"path":"/druid/data/segments","maxSize":10737418240}] + druid.server.maxSize=10737418240 + extra.jvm.options: |- + -Xmx512m + -Xms512m + + historicalstier2: + nodeType: "historical" + druid.port: 8088 + services: + - spec: + type: ClusterIP + clusterIP: None + nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical" + replicas: 1 + runtime.properties: | + druid.service=druid/historical + druid.processing.buffer.sizeBytes=25000000 + druid.processing.numThreads=2 + druid.segmentCache.locations=[{"path":"/druid/data/segments","maxSize":10737418240}] + druid.server.maxSize=10737418240 + extra.jvm.options: |- + -Xmx512m + -Xms512m +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: druid-cluster-rolling +rules: +- apiGroups: + - "" + resources: + - pods + - configmaps + verbs: + - '*' +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: druid-cluster-rolling +subjects: +- kind: ServiceAccount + name: default +roleRef: + kind: Role + name: druid-cluster-rolling + apiGroup: rbac.authorization.k8s.io diff --git a/e2e/e2e.sh b/e2e/e2e.sh index 4696696c..2a39cd0c 100755 --- a/e2e/e2e.sh +++ b/e2e/e2e.sh @@ -118,5 +118,6 @@ done # Start testing use-cases bash e2e/test-extra-common-config.sh +bash e2e/test-rolling-deploy-ordering.sh kind delete cluster diff --git a/e2e/test-rolling-deploy-ordering.sh b/e2e/test-rolling-deploy-ordering.sh new file mode 100755 index 00000000..ca607a52 --- /dev/null +++ b/e2e/test-rolling-deploy-ordering.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# E2E test: Deterministic Rolling Deploy Ordering +# +# Verifies the fix that ensures, when rollingDeploy is enabled and multiple +# StatefulSets share the same NodeType, the operator rolls them out one at a +# time in a deterministic, lexicographic-by-key order. Without the fix, the +# two historical StatefulSets in the test fixture could update concurrently +# because Go map iteration ordering would flap across reconciles. + +set -o errexit +set -o pipefail +set -x + +# NAMESPACE is exported by e2e.sh; default if running standalone. 
+NAMESPACE=${NAMESPACE:-druid} +CR_NAME=rolling-deploy-cluster +HISTORICAL_TIER1_STS="druid-${CR_NAME}-historicalstier1" +HISTORICAL_TIER2_STS="druid-${CR_NAME}-historicalstier2" + +# concurrent_observations_threshold is the maximum number of polling +# windows in which both historical StatefulSets are allowed to be +# updating at the same time during the rollout. With the fix this +# must remain 0 throughout the rollout. +concurrent_observations_threshold=0 + +# poll_interval_seconds and poll_timeout_seconds bound the rollout +# observation loop so the test fails fast on a stuck cluster. +poll_interval_seconds=5 +poll_timeout_seconds=900 + +echo "Test: RollingDeployOrdering => START" + +# Apply the test fixture and wait for both historical StatefulSets to +# become Ready before mutating anything. RollingDeploy gating only +# applies to updates after Generation > 1. +kubectl apply -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}" +sleep 10 + +for sts in "${HISTORICAL_TIER1_STS}" "${HISTORICAL_TIER2_STS}"; do + kubectl rollout status sts "${sts}" -n "${NAMESPACE}" --timeout=10m +done + +echo "Test: RollingDeployOrdering => initial cluster ready" + +# Capture the revisions both historicals are running at before the +# mutation so we can detect when each one begins and finishes its update. +t1_revision_before=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') +t2_revision_before=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') +echo "Pre-update revisions: tier1=${t1_revision_before} tier2=${t2_revision_before}" + +# Trigger a rolling update by bumping a workload annotation on both +# historical specs. A workloadAnnotation change forces the operator to +# update both StatefulSets, which is exactly the scenario where shared +# NodeType ordering must remain deterministic. +update_marker="rolling-deploy-test-$(date +%s)" +kubectl patch druid "${CR_NAME}" -n "${NAMESPACE}" --type=merge -p "{ + \"spec\": { + \"nodes\": { + \"historicalstier1\": {\"workloadAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}}, + \"historicalstier2\": {\"workloadAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}} + } + } +}" + +echo "Test: RollingDeployOrdering => triggered update with marker=${update_marker}" + +# Poll the rollout and observe two invariants: +# (1) At any single moment, AT MOST ONE historical StatefulSet should +# have CurrentRevision != UpdateRevision (i.e. be mid-rollout). +# (2) historicalstier1 (lex-smaller key) must finish its update before +# historicalstier2 starts its update. 
+t1_finished_at="" +t2_started_at="" +concurrent_observations=0 +deadline=$(( $(date +%s) + poll_timeout_seconds )) + +while [ "$(date +%s)" -lt "${deadline}" ]; do + t1_current=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.currentRevision}') + t1_update=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') + t2_current=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.currentRevision}') + t2_update=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') + + t1_updating="false" + t2_updating="false" + if [ "${t1_current}" != "${t1_update}" ]; then t1_updating="true"; fi + if [ "${t2_current}" != "${t2_update}" ]; then t2_updating="true"; fi + + echo "poll: t1(updating=${t1_updating} cur=${t1_current} upd=${t1_update}) t2(updating=${t2_updating} cur=${t2_current} upd=${t2_update})" + + if [ "${t1_updating}" = "true" ] && [ "${t2_updating}" = "true" ]; then + concurrent_observations=$(( concurrent_observations + 1 )) + echo "WARN: both historicals are updating in the same poll (count=${concurrent_observations})" + fi + + if [ -z "${t1_finished_at}" ] && [ "${t1_update}" != "${t1_revision_before}" ] && [ "${t1_updating}" = "false" ]; then + t1_finished_at=$(date +%s) + echo "t1 finished updating at ${t1_finished_at}" + fi + + if [ -z "${t2_started_at}" ] && [ "${t2_update}" != "${t2_revision_before}" ] && [ "${t2_updating}" = "true" ]; then + t2_started_at=$(date +%s) + echo "t2 started updating at ${t2_started_at}" + fi + + if [ "${t1_updating}" = "false" ] && [ "${t2_updating}" = "false" ] \ + && [ "${t1_update}" != "${t1_revision_before}" ] \ + && [ "${t2_update}" != "${t2_revision_before}" ]; then + echo "Both historicals finished rolling update." + break + fi + + sleep "${poll_interval_seconds}" +done + +if [ "$(date +%s)" -ge "${deadline}" ]; then + echo "Test: RollingDeployOrdering => FAILED (rollout did not complete within ${poll_timeout_seconds}s)" + exit 1 +fi + +# Invariant 1: deterministic ordering => at most threshold concurrent windows. +if [ "${concurrent_observations}" -gt "${concurrent_observations_threshold}" ]; then + echo "Test: RollingDeployOrdering => FAILED (observed ${concurrent_observations} polls where both historicals were updating concurrently; threshold=${concurrent_observations_threshold})" + exit 1 +fi + +# Invariant 2: t1 (lex-smaller key) must complete before t2 starts. +if [ -z "${t1_finished_at}" ] || [ -z "${t2_started_at}" ]; then + echo "Note: did not observe both transitions (t1_finished_at='${t1_finished_at}', t2_started_at='${t2_started_at}'). Rollout may have been faster than poll interval; relying on concurrency invariant alone." +elif [ "${t2_started_at}" -lt "${t1_finished_at}" ]; then + echo "Test: RollingDeployOrdering => FAILED (t2 started updating at ${t2_started_at} BEFORE t1 finished at ${t1_finished_at})" + exit 1 +else + echo "Order verified: t1 finished at ${t1_finished_at}, t2 started at ${t2_started_at} (t1-before-t2)." +fi + +echo "Cleaning up rolling-deploy test resources." 
+kubectl delete -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}" --ignore-not-found
+for d in $(kubectl get pods -n "${NAMESPACE}" -l app=druid -l "druid_cr=${CR_NAME}" -o name); do
+  kubectl wait -n "${NAMESPACE}" "$d" --for=delete --timeout=5m || true
+done
+
+echo "Test: RollingDeployOrdering => SUCCESS"

From ed1451a09ef97c3473540356fe07066185d3a488 Mon Sep 17 00:00:00 2001
From: Aru Raghuwanshi
Date: Fri, 24 Apr 2026 16:31:25 -0700
Subject: [PATCH 2/2] fix(e2e): patch podAnnotations to trigger real StatefulSet rollout

workloadAnnotations only touch StatefulSet object metadata, not the pod
template, so updateRevision never changes and the test times out at 900s.
Switch to podAnnotations (which flow into PodTemplateSpec), add trap-based
cleanup, and fail fast if tier1 never picks up a new revision.
---
 e2e/test-rolling-deploy-ordering.sh | 91 ++++++++++++++++-------------
 1 file changed, 49 insertions(+), 42 deletions(-)

diff --git a/e2e/test-rolling-deploy-ordering.sh b/e2e/test-rolling-deploy-ordering.sh
index ca607a52..3203aa29 100755
--- a/e2e/test-rolling-deploy-ordering.sh
+++ b/e2e/test-rolling-deploy-ordering.sh
@@ -29,28 +29,27 @@ set -o errexit
 set -o pipefail
 set -x
 
-# NAMESPACE is exported by e2e.sh; default if running standalone.
 # Typical standalone invocation (illustrative; e2e/e2e.sh normally runs
 # this after installing the operator into the kind cluster):
 #
 #   NAMESPACE=druid bash e2e/test-rolling-deploy-ordering.sh
 NAMESPACE=${NAMESPACE:-druid}
 CR_NAME=rolling-deploy-cluster
 HISTORICAL_TIER1_STS="druid-${CR_NAME}-historicalstier1"
 HISTORICAL_TIER2_STS="druid-${CR_NAME}-historicalstier2"
 
-# concurrent_observations_threshold is the maximum number of polling
-# windows in which both historical StatefulSets are allowed to be
-# updating at the same time during the rollout. With the fix this
-# must remain 0 throughout the rollout.
-concurrent_observations_threshold=0
+CONCURRENT_THRESHOLD=0
+POLL_INTERVAL=5
+POLL_TIMEOUT=900
+REVISION_PICKUP_TIMEOUT=120
 
-# poll_interval_seconds and poll_timeout_seconds bound the rollout
-# observation loop so the test fails fast on a stuck cluster.
-poll_interval_seconds=5
-poll_timeout_seconds=900
+cleanup() {
+  echo "Cleaning up rolling-deploy test resources."
+  kubectl delete -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}" --ignore-not-found
+  for d in $(kubectl get pods -n "${NAMESPACE}" -l app=druid -l "druid_cr=${CR_NAME}" -o name 2>/dev/null); do
+    kubectl wait -n "${NAMESPACE}" "$d" --for=delete --timeout=5m || true
+  done
+}
+trap cleanup EXIT
 
 echo "Test: RollingDeployOrdering => START"
 
-# Apply the test fixture and wait for both historical StatefulSets to
-# become Ready before mutating anything. RollingDeploy gating only
-# applies to updates after Generation > 1.
 kubectl apply -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}"
 sleep 10
 
@@ -60,37 +59,52 @@ done
 
 echo "Test: RollingDeployOrdering => initial cluster ready"
 
-# Capture the revisions both historicals are running at before the
-# mutation so we can detect when each one begins and finishes its update.
 t1_revision_before=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}')
 t2_revision_before=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}')
 echo "Pre-update revisions: tier1=${t1_revision_before} tier2=${t2_revision_before}"
 
-# Trigger a rolling update by bumping a workload annotation on both
-# historical specs. A workloadAnnotation change forces the operator to
-# update both StatefulSets, which is exactly the scenario where shared
-# NodeType ordering must remain deterministic.
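+# Background for the change below: workloadAnnotations land on the
+# StatefulSet's .metadata.annotations, while podAnnotations land in the
+# pod template at .spec.template.metadata.annotations; only the template
+# feeds the controller-revision hash behind .status.updateRevision.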
+# Trigger a rolling update by bumping a pod annotation on both historical +# specs. podAnnotations flow into the PodTemplateSpec, so the StatefulSet +# controller creates a new updateRevision and actually rolls the pods. +# (workloadAnnotations only touch the StatefulSet object metadata and do +# NOT change the pod template, so they never produce a new revision.) update_marker="rolling-deploy-test-$(date +%s)" kubectl patch druid "${CR_NAME}" -n "${NAMESPACE}" --type=merge -p "{ \"spec\": { \"nodes\": { - \"historicalstier1\": {\"workloadAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}}, - \"historicalstier2\": {\"workloadAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}} + \"historicalstier1\": {\"podAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}}, + \"historicalstier2\": {\"podAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}} } } }" echo "Test: RollingDeployOrdering => triggered update with marker=${update_marker}" +# Fail fast: wait up to REVISION_PICKUP_TIMEOUT for tier1 to pick up a +# new updateRevision. If it never does, the patch didn't produce a pod +# template change and the rest of the test is pointless. +pickup_deadline=$(( $(date +%s) + REVISION_PICKUP_TIMEOUT )) +while [ "$(date +%s)" -lt "${pickup_deadline}" ]; do + t1_update=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') + if [ "${t1_update}" != "${t1_revision_before}" ]; then + echo "tier1 picked up new revision: ${t1_update}" + break + fi + sleep "${POLL_INTERVAL}" +done +if [ "${t1_update}" = "${t1_revision_before}" ]; then + echo "Test: RollingDeployOrdering => FAILED (tier1 never received a new updateRevision within ${REVISION_PICKUP_TIMEOUT}s; patch may not affect the pod template)" + exit 1 +fi + # Poll the rollout and observe two invariants: -# (1) At any single moment, AT MOST ONE historical StatefulSet should -# have CurrentRevision != UpdateRevision (i.e. be mid-rollout). -# (2) historicalstier1 (lex-smaller key) must finish its update before -# historicalstier2 starts its update. +# (1) At most one historical StatefulSet is mid-rollout +# (currentRevision != updateRevision) at any poll. +# (2) tier1 (lex-smaller key) finishes before tier2 starts. t1_finished_at="" t2_started_at="" concurrent_observations=0 -deadline=$(( $(date +%s) + poll_timeout_seconds )) +deadline=$(( $(date +%s) + POLL_TIMEOUT )) while [ "$(date +%s)" -lt "${deadline}" ]; do t1_current=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.currentRevision}') @@ -112,14 +126,15 @@ while [ "$(date +%s)" -lt "${deadline}" ]; do if [ -z "${t1_finished_at}" ] && [ "${t1_update}" != "${t1_revision_before}" ] && [ "${t1_updating}" = "false" ]; then t1_finished_at=$(date +%s) - echo "t1 finished updating at ${t1_finished_at}" + echo "tier1 finished updating at ${t1_finished_at}" fi if [ -z "${t2_started_at}" ] && [ "${t2_update}" != "${t2_revision_before}" ] && [ "${t2_updating}" = "true" ]; then t2_started_at=$(date +%s) - echo "t2 started updating at ${t2_started_at}" + echo "tier2 started updating at ${t2_started_at}" fi + # Both tiers have new revisions and neither is mid-rollout. 
if [ "${t1_updating}" = "false" ] && [ "${t2_updating}" = "false" ] \ && [ "${t1_update}" != "${t1_revision_before}" ] \ && [ "${t2_update}" != "${t2_revision_before}" ]; then @@ -127,34 +142,26 @@ while [ "$(date +%s)" -lt "${deadline}" ]; do break fi - sleep "${poll_interval_seconds}" + sleep "${POLL_INTERVAL}" done if [ "$(date +%s)" -ge "${deadline}" ]; then - echo "Test: RollingDeployOrdering => FAILED (rollout did not complete within ${poll_timeout_seconds}s)" + echo "Test: RollingDeployOrdering => FAILED (rollout did not complete within ${POLL_TIMEOUT}s)" exit 1 fi -# Invariant 1: deterministic ordering => at most threshold concurrent windows. -if [ "${concurrent_observations}" -gt "${concurrent_observations_threshold}" ]; then - echo "Test: RollingDeployOrdering => FAILED (observed ${concurrent_observations} polls where both historicals were updating concurrently; threshold=${concurrent_observations_threshold})" +if [ "${concurrent_observations}" -gt "${CONCURRENT_THRESHOLD}" ]; then + echo "Test: RollingDeployOrdering => FAILED (observed ${concurrent_observations} polls where both historicals were updating concurrently; threshold=${CONCURRENT_THRESHOLD})" exit 1 fi -# Invariant 2: t1 (lex-smaller key) must complete before t2 starts. if [ -z "${t1_finished_at}" ] || [ -z "${t2_started_at}" ]; then echo "Note: did not observe both transitions (t1_finished_at='${t1_finished_at}', t2_started_at='${t2_started_at}'). Rollout may have been faster than poll interval; relying on concurrency invariant alone." elif [ "${t2_started_at}" -lt "${t1_finished_at}" ]; then - echo "Test: RollingDeployOrdering => FAILED (t2 started updating at ${t2_started_at} BEFORE t1 finished at ${t1_finished_at})" + echo "Test: RollingDeployOrdering => FAILED (tier2 started at ${t2_started_at} BEFORE tier1 finished at ${t1_finished_at})" exit 1 else - echo "Order verified: t1 finished at ${t1_finished_at}, t2 started at ${t2_started_at} (t1-before-t2)." + echo "Order verified: tier1 finished at ${t1_finished_at}, tier2 started at ${t2_started_at}." fi -echo "Cleaning up rolling-deploy test resources." -kubectl delete -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}" --ignore-not-found -for d in $(kubectl get pods -n "${NAMESPACE}" -l app=druid -l "druid_cr=${CR_NAME}" -o name); do - kubectl wait -n "${NAMESPACE}" "$d" --for=delete --timeout=5m || true -done - echo "Test: RollingDeployOrdering => SUCCESS"