diff --git a/controllers/druid/ordering.go b/controllers/druid/ordering.go
index b2881245..2fae966b 100644
--- a/controllers/druid/ordering.go
+++ b/controllers/druid/ordering.go
@@ -18,7 +18,11 @@ under the License.
 */
 package druid

-import "github.com/apache/druid-operator/apis/druid/v1alpha1"
+import (
+	"sort"
+
+	"github.com/apache/druid-operator/apis/druid/v1alpha1"
+)

 var (
 	druidServicesOrder = []string{historical, overlord, middleManager, indexer, broker, coordinator, router}
@@ -29,8 +33,10 @@ type ServiceGroup struct {
 	spec v1alpha1.DruidNodeSpec
 }

-// getNodeSpecsByOrder returns all NodeSpecs f a given Druid object.
-// Recommended order is described at http://druid.io/docs/latest/operations/rolling-updates.html
+// getNodeSpecsByOrder returns all NodeSpecs of a given Druid object in the
+// recommended rolling-update order (see http://druid.io/docs/latest/operations/rolling-updates.html).
+// Specs sharing a NodeType are sorted by map key so rollingDeploy stays
+// deterministic across reconciles instead of flapping with map iteration.
 func getNodeSpecsByOrder(m *v1alpha1.Druid) []*ServiceGroup {

 	scaledServiceSpecsByNodeType := map[string][]*ServiceGroup{}
@@ -46,7 +52,11 @@ func getNodeSpecsByOrder(m *v1alpha1.Druid) []*ServiceGroup {
 	allScaledServiceSpecs := make([]*ServiceGroup, 0, len(m.Spec.Nodes))

 	for _, t := range druidServicesOrder {
-		allScaledServiceSpecs = append(allScaledServiceSpecs, scaledServiceSpecsByNodeType[t]...)
+		specs := scaledServiceSpecsByNodeType[t]
+		sort.Slice(specs, func(i, j int) bool {
+			return specs[i].key < specs[j].key
+		})
+		allScaledServiceSpecs = append(allScaledServiceSpecs, specs...)
 	}

 	return allScaledServiceSpecs
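
For reviewers who want to see the failure mode in isolation, the standalone sketch below (simplified `ServiceGroup` carrying only the key field, not the operator's actual types) collects entries by ranging over a node-spec map the way the pre-fix code did, then applies the same sort-by-key stabilization this patch adds. Run it a few times: the first listing can change between runs, the second cannot.

```go
package main

import (
	"fmt"
	"sort"
)

// ServiceGroup is a simplified stand-in for the operator's type: only the
// map key matters for ordering, so the spec payload is omitted.
type ServiceGroup struct{ key string }

func main() {
	// Go intentionally randomizes map iteration order, so collecting specs
	// by ranging over the map yields an unstable intra-NodeType order.
	nodes := map[string]string{
		"historicalstier1": "historical",
		"historicalstier2": "historical",
		"historicalstier3": "historical",
	}

	groups := make([]*ServiceGroup, 0, len(nodes))
	for key := range nodes {
		groups = append(groups, &ServiceGroup{key: key})
	}
	fmt.Println("map iteration order (may differ between runs):")
	for _, g := range groups {
		fmt.Println("  " + g.key)
	}

	// The fix: sort by key so every reconcile sees the same sequence.
	sort.Slice(groups, func(i, j int) bool {
		return groups[i].key < groups[j].key
	})
	fmt.Println("sorted order (stable across runs):")
	for _, g := range groups {
		fmt.Println("  " + g.key)
	}
}
```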
diff --git a/controllers/druid/ordering_test.go b/controllers/druid/ordering_test.go
index d48212b7..9332b865 100644
--- a/controllers/druid/ordering_test.go
+++ b/controllers/druid/ordering_test.go
@@ -19,6 +19,9 @@ under the License.
 package druid

 import (
+	"reflect"
+	"sort"
+	"testing"
 	"time"

 	druidv1alpha1 "github.com/apache/druid-operator/apis/druid/v1alpha1"
@@ -56,16 +59,137 @@ var _ = Describe("Test ordering logic", func() {
 				return err == nil
 			}, timeout, interval).Should(BeTrue())
 		})
-		It("Should return an ordered list of nodes", func() {
+		It("Should return a deterministic, lexicographically ordered list of nodes within each NodeType", func() {
 			orderedServiceGroups := getNodeSpecsByOrder(druid)
-			Expect(orderedServiceGroups[0].key).Should(MatchRegexp("historicals"))
-			Expect(orderedServiceGroups[1].key).Should(MatchRegexp("historicals"))
-			Expect(orderedServiceGroups[2].key).Should(Equal("overlords"))
-			Expect(orderedServiceGroups[3].key).Should(Equal("middle-managers"))
-			Expect(orderedServiceGroups[4].key).Should(Equal("indexers"))
-			Expect(orderedServiceGroups[5].key).Should(Equal("brokers"))
-			Expect(orderedServiceGroups[6].key).Should(Equal("coordinators"))
-			Expect(orderedServiceGroups[7].key).Should(Equal("routers"))
+			// Three historical tiers (historicalstier1–3) all share NodeType
+			// "historical" and must come back sorted by key so rollingDeploy
+			// can never roll two of them out at the same time.
+			Expect(orderedServiceGroups[0].key).Should(Equal("historicalstier1"))
+			Expect(orderedServiceGroups[1].key).Should(Equal("historicalstier2"))
+			Expect(orderedServiceGroups[2].key).Should(Equal("historicalstier3"))
+			Expect(orderedServiceGroups[3].key).Should(Equal("overlords"))
+			Expect(orderedServiceGroups[4].key).Should(Equal("middle-managers"))
+			Expect(orderedServiceGroups[5].key).Should(Equal("indexers"))
+			Expect(orderedServiceGroups[6].key).Should(Equal("brokers"))
+			Expect(orderedServiceGroups[7].key).Should(Equal("coordinators"))
+			Expect(orderedServiceGroups[8].key).Should(Equal("routers"))
 		})
 	})
 })
+
+// determinismCallCount is the number of times getNodeSpecsByOrder is invoked
+// per test to surface map-iteration non-determinism. Without the sort step in
+// getNodeSpecsByOrder, randomized map iteration over m.Spec.Nodes makes the
+// intra-NodeType order flap. With many specs sharing one NodeType and many
+// repeated calls, the probability of observing at least one differing order
+// approaches 1, which is exactly what we want for a regression test.
+const determinismCallCount = 200
+
+// makeMultiHistoricalDruid returns a Druid CR with several node specs sharing
+// the same "historical" NodeType, plus one spec per other NodeType. This is
+// the shape that triggered the bug fixed here: multiple StatefulSets/
+// Deployments belonging to a single NodeType.
+func makeMultiHistoricalDruid() *druidv1alpha1.Druid {
+	return &druidv1alpha1.Druid{
+		Spec: druidv1alpha1.DruidSpec{
+			Nodes: map[string]druidv1alpha1.DruidNodeSpec{
+				"historicalstier1": {NodeType: historical},
+				"historicalstier2": {NodeType: historical},
+				"historicalstier3": {NodeType: historical},
+				"historicalstier4": {NodeType: historical},
+				"brokers":          {NodeType: broker},
+				"coordinators":     {NodeType: coordinator},
+				"overlords":        {NodeType: overlord},
+				"middle-managers":  {NodeType: middleManager},
+				"indexers":         {NodeType: indexer},
+				"routers":          {NodeType: router},
+			},
+		},
+	}
+}
+
+// keysOfServiceGroups extracts the ordered list of keys from a slice of
+// *ServiceGroup so test assertions can compare orderings as plain strings.
+func keysOfServiceGroups(specs []*ServiceGroup) []string {
+	out := make([]string, len(specs))
+	for i, s := range specs {
+		out[i] = s.key
+	}
+	return out
+}
+
+// TestGetNodeSpecsByOrder_DeterministicAcrossCalls invokes getNodeSpecsByOrder
+// many times on the same Druid CR and asserts every call returns the exact
+// same ordering. Before the fix, Go's randomized map iteration over
+// m.Spec.Nodes causes the order of specs sharing a NodeType (e.g. the four
+// "historical" entries) to flap between calls, so this test fails. With the
+// sort.Slice fix, all calls return the same ordering and the test passes.
+func TestGetNodeSpecsByOrder_DeterministicAcrossCalls(t *testing.T) {
+	druid := makeMultiHistoricalDruid()
+
+	first := keysOfServiceGroups(getNodeSpecsByOrder(druid))
+
+	for i := 1; i < determinismCallCount; i++ {
+		got := keysOfServiceGroups(getNodeSpecsByOrder(druid))
+		if !reflect.DeepEqual(first, got) {
+			t.Fatalf(
+				"getNodeSpecsByOrder is non-deterministic: call 0 returned %v, call %d returned %v",
+				first, i, got,
+			)
+		}
+	}
+}
+
+// TestGetNodeSpecsByOrder_LexicographicWithinNodeType asserts the contractual
+// intra-NodeType ordering: ascending by spec key. This pins down the exact
+// behavior the operator relies on for sequential rolling deploy.
+func TestGetNodeSpecsByOrder_LexicographicWithinNodeType(t *testing.T) {
+	druid := makeMultiHistoricalDruid()
+
+	got := keysOfServiceGroups(getNodeSpecsByOrder(druid))
+
+	wantHistoricals := []string{"historicalstier1", "historicalstier2", "historicalstier3", "historicalstier4"}
+	gotHistoricals := got[:len(wantHistoricals)]
+
+	if !sort.StringsAreSorted(gotHistoricals) {
+		t.Errorf("historical specs must be sorted ascending by key, got %v", gotHistoricals)
+	}
+	if !reflect.DeepEqual(gotHistoricals, wantHistoricals) {
+		t.Errorf("historical block ordering mismatch: want %v, got %v", wantHistoricals, gotHistoricals)
+	}
+
+	want := []string{
+		"historicalstier1", "historicalstier2", "historicalstier3", "historicalstier4",
+		"overlords",
+		"middle-managers",
+		"indexers",
+		"brokers",
+		"coordinators",
+		"routers",
+	}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("full ordering mismatch:\nwant %v\n got %v", want, got)
+	}
+}
+
+// TestGetNodeSpecsByOrder_NodeTypeOrderPreserved guards against a regression
+// in the cross-NodeType ordering defined by druidServicesOrder.
+func TestGetNodeSpecsByOrder_NodeTypeOrderPreserved(t *testing.T) {
+	druid := &druidv1alpha1.Druid{
+		Spec: druidv1alpha1.DruidSpec{
+			Nodes: map[string]druidv1alpha1.DruidNodeSpec{
+				"routers":      {NodeType: router},
+				"coordinators": {NodeType: coordinator},
+				"brokers":      {NodeType: broker},
+				"historicals":  {NodeType: historical},
+				"overlords":    {NodeType: overlord},
+			},
+		},
+	}
+
+	got := keysOfServiceGroups(getNodeSpecsByOrder(druid))
+	want := []string{"historicals", "overlords", "brokers", "coordinators", "routers"}
+	if !reflect.DeepEqual(got, want) {
+		t.Errorf("NodeType ordering broken: want %v, got %v", want, got)
+	}
+}
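
The "approaches 1" claim in the determinismCallCount comment can be made concrete. Treating each call's intra-NodeType order as a uniform random permutation of the four historical specs (an idealization: Go randomizes map iteration but does not promise uniform permutations), a false pass requires 199 consecutive repeats of the first call's order:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// 4 specs sharing one NodeType => 4! = 24 possible orders per call.
	// All 200 calls agree only if calls 2..200 each reproduce call 1's order.
	const perms = 24.0  // 4!
	const calls = 200.0 // determinismCallCount
	logP := (calls - 1) * math.Log10(1/perms)
	fmt.Printf("log10 P(false pass) ~ %.1f\n", logP) // ~ -274.7
}
```

Even if the real iteration order is far from uniform, any nonzero per-call flap probability compounds the same way, which is why 200 iterations is comfortably paranoid.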
diff --git a/controllers/druid/testdata/ordering.yaml b/controllers/druid/testdata/ordering.yaml
index 00ce07d6..64614142 100644
--- a/controllers/druid/testdata/ordering.yaml
+++ b/controllers/druid/testdata/ordering.yaml
@@ -75,7 +75,16 @@ spec:
         - /bin/sh
         - echo hello
       containerName: node-level
       image: hello-world
-  historicals2:
+  historicalstier2:
+    nodeType: "historical"
+    druid.port: 8080
+    nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical"
+    replicas: 1
+    runtime.properties: |-
+      druid.service=druid/historical
+      druid.segmentCache.locations=[{\"path\":\"/druid/data/segments\",\"maxSize\":10737418240}]
+      druid.server.maxSize=10737418240
+  historicalstier3:
     nodeType: "historical"
     druid.port: 8080
     nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical"
@@ -119,7 +128,7 @@ spec:
       druid.realtime.cache.populateCache=true
       druid.indexer.runner.javaOptsArray=["-server","-Duser.timezone=UTC","-Dfile.encoding=UTF-8","-XX:+ExitOnOutOfMemoryError","-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager","--add-exports=java.base/jdk.internal.ref=ALL-UNNAMED","--add-exports=java.base/jdk.internal.misc=ALL-UNNAMED","--add-opens=java.base/java.lang=ALL-UNNAMED","--add-opens=java.base/java.io=ALL-UNNAMED","--add-opens=java.base/java.nio=ALL-UNNAMED","--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED","--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"]
       druid.indexer.task.restoreTasksOnRestart=true
-  historicals:
+  historicalstier1:
     nodeType: "historical"
     druid.port: 8080
     nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical"
diff --git a/e2e/configs/druid-rolling-deploy-cr.yaml b/e2e/configs/druid-rolling-deploy-cr.yaml
new file mode 100644
index 00000000..07c96f99
--- /dev/null
+++ b/e2e/configs/druid-rolling-deploy-cr.yaml
@@ -0,0 +1,188 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# E2E fixture for the deterministic rolling deploy ordering test.
+# Defines two StatefulSets that share the "historical" NodeType
+# (historicalstier1 and historicalstier2) so the test can verify that
+# the operator rolls them out one-at-a-time rather than concurrently.
+apiVersion: "druid.apache.org/v1alpha1"
+kind: "Druid"
+metadata:
+  name: rolling-deploy-cluster
+spec:
+  image: apache/druid:25.0.0
+  startScript: /druid.sh
+  rollingDeploy: true
+  podLabels:
+    environment: stage
+  podAnnotations:
+    dummy: k8s_extn_needs_atleast_one_annotation
+  readinessProbe:
+    httpGet:
+      path: /status/health
+      port: 8088
+  securityContext:
+    fsGroup: 0
+    runAsUser: 0
+    runAsGroup: 0
+  containerSecurityContext:
+    privileged: true
+  services:
+    - spec:
+        type: ClusterIP
+        clusterIP: None
+  commonConfigMountPath: "/opt/druid/conf/druid/cluster/_common"
+  jvm.options: |-
+    -server
+    -XX:MaxDirectMemorySize=10240g
+    -Duser.timezone=UTC
+    -Dfile.encoding=UTF-8
+    -Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager
+  common.runtime.properties: |
+    druid.zk.service.enabled=false
+    druid.discovery.type=k8s
+    druid.discovery.k8s.clusterIdentifier=druid-it
+    druid.serverview.type=http
+    druid.coordinator.loadqueuepeon.type=http
+    druid.indexer.runner.type=httpRemote
+    druid.metadata.storage.type=derby
+    druid.metadata.storage.connector.connectURI=jdbc:derby://localhost:1527/var/druid/metadata.db;create=true
+    druid.metadata.storage.connector.host=localhost
+    druid.metadata.storage.connector.port=1527
+    druid.metadata.storage.connector.createTables=true
+    druid.storage.type=local
+    druid.storage.storageDirectory=/druid/deepstorage
+    druid.extensions.loadList=["druid-kubernetes-extensions"]
+    druid.selectors.indexing.serviceName=druid/overlord
+    druid.selectors.coordinator.serviceName=druid/coordinator
+    druid.lookup.enableLookupSyncOnStartup=false
+  env:
+    - name: POD_NAME
+      valueFrom:
+        fieldRef:
+          fieldPath: metadata.name
+    - name: POD_NAMESPACE
+      valueFrom:
+        fieldRef:
+          fieldPath: metadata.namespace
+  nodes:
+    coordinators:
+      nodeType: "coordinator"
+      druid.port: 8088
+      services:
+        - spec:
+            type: ClusterIP
+            clusterIP: None
+      nodeConfigMountPath: "/opt/druid/conf/druid/cluster/master/coordinator-overlord"
+      replicas: 1
+      runtime.properties: |
+        druid.service=druid/coordinator
+        druid.coordinator.startDelay=PT30S
+        druid.coordinator.period=PT30S
+        druid.coordinator.asOverlord.enabled=true
+        druid.coordinator.asOverlord.overlordService=druid/overlord
+        druid.indexer.queue.startDelay=PT30S
+      extra.jvm.options: |-
+        -Xmx512m
+        -Xms512m
+
+    brokers:
+      nodeType: "broker"
+      druid.port: 8088
+      services:
+        - spec:
+            type: ClusterIP
+            clusterIP: None
+      nodeConfigMountPath: "/opt/druid/conf/druid/cluster/query/broker"
+      replicas: 1
+      runtime.properties: |
+        druid.service=druid/broker
+        druid.broker.http.numConnections=5
+        druid.server.http.numThreads=40
+        druid.processing.buffer.sizeBytes=25000000
+        druid.sql.enable=true
+      extra.jvm.options: |-
+        -Xmx512m
+        -Xms512m
+
+    # Two historical specs sharing the SAME NodeType ("historical").
+    # This is the case where the previous map-iteration ordering would
+    # flap between reconciles. With the deterministic ordering fix,
+    # historicalstier1 rolls out fully before historicalstier2.
+    historicalstier1:
+      nodeType: "historical"
+      druid.port: 8088
+      services:
+        - spec:
+            type: ClusterIP
+            clusterIP: None
+      nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical"
+      replicas: 1
+      runtime.properties: |
+        druid.service=druid/historical
+        druid.processing.buffer.sizeBytes=25000000
+        druid.processing.numThreads=2
+        druid.segmentCache.locations=[{"path":"/druid/data/segments","maxSize":10737418240}]
+        druid.server.maxSize=10737418240
+      extra.jvm.options: |-
+        -Xmx512m
+        -Xms512m
+
+    historicalstier2:
+      nodeType: "historical"
+      druid.port: 8088
+      services:
+        - spec:
+            type: ClusterIP
+            clusterIP: None
+      nodeConfigMountPath: "/opt/druid/conf/druid/cluster/data/historical"
+      replicas: 1
+      runtime.properties: |
+        druid.service=druid/historical
+        druid.processing.buffer.sizeBytes=25000000
+        druid.processing.numThreads=2
+        druid.segmentCache.locations=[{"path":"/druid/data/segments","maxSize":10737418240}]
+        druid.server.maxSize=10737418240
+      extra.jvm.options: |-
+        -Xmx512m
+        -Xms512m
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: druid-cluster-rolling
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  - configmaps
+  verbs:
+  - '*'
+---
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: druid-cluster-rolling
+subjects:
+- kind: ServiceAccount
+  name: default
+roleRef:
+  kind: Role
+  name: druid-cluster-rolling
+  apiGroup: rbac.authorization.k8s.io
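
The fixture above only pays off because rollingDeploy consumes the ordered groups sequentially. As a reference for how deterministic ordering translates into one-at-a-time rollouts, here is a simplified consumption sketch (hypothetical `fullyRolledOut`/`rollNext` helpers, not the operator's actual reconcile code): because the order is stable across reconciles, the same group stays in flight until it converges, so two groups sharing a NodeType never roll concurrently.

```go
package main

import "fmt"

// ServiceGroup is a simplified stand-in pairing a node-spec key with its spec.
type ServiceGroup struct{ key string }

// fullyRolledOut is a hypothetical stub; a real controller would compare the
// StatefulSet's currentRevision/updateRevision and its ready replica count.
func fullyRolledOut(g *ServiceGroup) bool { return false }

// rollNext advances at most one pending group per reconcile pass, walking the
// deterministic order produced by getNodeSpecsByOrder.
func rollNext(ordered []*ServiceGroup) {
	for _, g := range ordered {
		if !fullyRolledOut(g) {
			fmt.Printf("updating %s; later groups wait for it to converge\n", g.key)
			return
		}
	}
	fmt.Println("all groups converged")
}

func main() {
	rollNext([]*ServiceGroup{{key: "historicalstier1"}, {key: "historicalstier2"}})
}
```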
diff --git a/e2e/e2e.sh b/e2e/e2e.sh
index 4696696c..2a39cd0c 100755
--- a/e2e/e2e.sh
+++ b/e2e/e2e.sh
@@ -118,5 +118,6 @@ done

 # Start testing use-cases
 bash e2e/test-extra-common-config.sh
+bash e2e/test-rolling-deploy-ordering.sh

 kind delete cluster
diff --git a/e2e/test-rolling-deploy-ordering.sh b/e2e/test-rolling-deploy-ordering.sh
new file mode 100755
index 00000000..3203aa29
--- /dev/null
+++ b/e2e/test-rolling-deploy-ordering.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# E2E test: Deterministic Rolling Deploy Ordering
+#
+# Verifies the fix that ensures, when rollingDeploy is enabled and multiple
+# StatefulSets share the same NodeType, the operator rolls them out one at a
+# time in a deterministic, lexicographic-by-key order. Without the fix, the
+# two historical StatefulSets in the test fixture could update concurrently
+# because Go map iteration ordering would flap across reconciles.

+set -o errexit
+set -o pipefail
+set -x

+NAMESPACE=${NAMESPACE:-druid}
+CR_NAME=rolling-deploy-cluster
+HISTORICAL_TIER1_STS="druid-${CR_NAME}-historicalstier1"
+HISTORICAL_TIER2_STS="druid-${CR_NAME}-historicalstier2"

+CONCURRENT_THRESHOLD=0
+POLL_INTERVAL=5
+POLL_TIMEOUT=900
+REVISION_PICKUP_TIMEOUT=120

+cleanup() {
+  echo "Cleaning up rolling-deploy test resources."
+  kubectl delete -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}" --ignore-not-found
+  # Note: kubectl keeps only the last -l flag, so both labels must go into a
+  # single comma-joined selector.
+  for pod in $(kubectl get pods -n "${NAMESPACE}" -l "app=druid,druid_cr=${CR_NAME}" -o name 2>/dev/null); do
+    kubectl wait -n "${NAMESPACE}" "$pod" --for=delete --timeout=5m || true
+  done
+}
+trap cleanup EXIT

+echo "Test: RollingDeployOrdering => START"

+kubectl apply -f e2e/configs/druid-rolling-deploy-cr.yaml -n "${NAMESPACE}"
+sleep 10

+for sts in "${HISTORICAL_TIER1_STS}" "${HISTORICAL_TIER2_STS}"; do
+  kubectl rollout status sts "${sts}" -n "${NAMESPACE}" --timeout=10m
+done

+echo "Test: RollingDeployOrdering => initial cluster ready"

+t1_revision_before=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}')
+t2_revision_before=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}')
+echo "Pre-update revisions: tier1=${t1_revision_before} tier2=${t2_revision_before}"

+# Trigger a rolling update by bumping a pod annotation on both historical
+# specs. podAnnotations flow into the PodTemplateSpec, so the StatefulSet
+# controller creates a new updateRevision and actually rolls the pods.
+# (workloadAnnotations only touch the StatefulSet object metadata and do
+# NOT change the pod template, so they never produce a new revision.)
+update_marker="rolling-deploy-test-$(date +%s)"
+kubectl patch druid "${CR_NAME}" -n "${NAMESPACE}" --type=merge -p "{
+  \"spec\": {
+    \"nodes\": {
+      \"historicalstier1\": {\"podAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}},
+      \"historicalstier2\": {\"podAnnotations\": {\"rolling-deploy-test\": \"${update_marker}\"}}
+    }
+  }
+}"

+echo "Test: RollingDeployOrdering => triggered update with marker=${update_marker}"

+# Fail fast: wait up to REVISION_PICKUP_TIMEOUT for tier1 to pick up a
+# new updateRevision. If it never does, the patch didn't produce a pod
+# template change and the rest of the test is pointless. Initialize
+# t1_update so the post-loop check is well-defined even if the loop body
+# never runs.
+t1_update="${t1_revision_before}"
+pickup_deadline=$(( $(date +%s) + REVISION_PICKUP_TIMEOUT ))
+while [ "$(date +%s)" -lt "${pickup_deadline}" ]; do
+  t1_update=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}')
+  if [ "${t1_update}" != "${t1_revision_before}" ]; then
+    echo "tier1 picked up new revision: ${t1_update}"
+    break
+  fi
+  sleep "${POLL_INTERVAL}"
+done
+if [ "${t1_update}" = "${t1_revision_before}" ]; then
+  echo "Test: RollingDeployOrdering => FAILED (tier1 never received a new updateRevision within ${REVISION_PICKUP_TIMEOUT}s; patch may not affect the pod template)"
+  exit 1
+fi

+# Poll the rollout and observe two invariants:
+#   (1) At most one historical StatefulSet is mid-rollout
+#       (currentRevision != updateRevision) at any poll.
+#   (2) tier1 (lex-smaller key) finishes before tier2 starts.
+t1_finished_at="" +t2_started_at="" +concurrent_observations=0 +deadline=$(( $(date +%s) + POLL_TIMEOUT )) + +while [ "$(date +%s)" -lt "${deadline}" ]; do + t1_current=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.currentRevision}') + t1_update=$(kubectl get sts "${HISTORICAL_TIER1_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') + t2_current=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.currentRevision}') + t2_update=$(kubectl get sts "${HISTORICAL_TIER2_STS}" -n "${NAMESPACE}" -o 'jsonpath={.status.updateRevision}') + + t1_updating="false" + t2_updating="false" + if [ "${t1_current}" != "${t1_update}" ]; then t1_updating="true"; fi + if [ "${t2_current}" != "${t2_update}" ]; then t2_updating="true"; fi + + echo "poll: t1(updating=${t1_updating} cur=${t1_current} upd=${t1_update}) t2(updating=${t2_updating} cur=${t2_current} upd=${t2_update})" + + if [ "${t1_updating}" = "true" ] && [ "${t2_updating}" = "true" ]; then + concurrent_observations=$(( concurrent_observations + 1 )) + echo "WARN: both historicals are updating in the same poll (count=${concurrent_observations})" + fi + + if [ -z "${t1_finished_at}" ] && [ "${t1_update}" != "${t1_revision_before}" ] && [ "${t1_updating}" = "false" ]; then + t1_finished_at=$(date +%s) + echo "tier1 finished updating at ${t1_finished_at}" + fi + + if [ -z "${t2_started_at}" ] && [ "${t2_update}" != "${t2_revision_before}" ] && [ "${t2_updating}" = "true" ]; then + t2_started_at=$(date +%s) + echo "tier2 started updating at ${t2_started_at}" + fi + + # Both tiers have new revisions and neither is mid-rollout. + if [ "${t1_updating}" = "false" ] && [ "${t2_updating}" = "false" ] \ + && [ "${t1_update}" != "${t1_revision_before}" ] \ + && [ "${t2_update}" != "${t2_revision_before}" ]; then + echo "Both historicals finished rolling update." + break + fi + + sleep "${POLL_INTERVAL}" +done + +if [ "$(date +%s)" -ge "${deadline}" ]; then + echo "Test: RollingDeployOrdering => FAILED (rollout did not complete within ${POLL_TIMEOUT}s)" + exit 1 +fi + +if [ "${concurrent_observations}" -gt "${CONCURRENT_THRESHOLD}" ]; then + echo "Test: RollingDeployOrdering => FAILED (observed ${concurrent_observations} polls where both historicals were updating concurrently; threshold=${CONCURRENT_THRESHOLD})" + exit 1 +fi + +if [ -z "${t1_finished_at}" ] || [ -z "${t2_started_at}" ]; then + echo "Note: did not observe both transitions (t1_finished_at='${t1_finished_at}', t2_started_at='${t2_started_at}'). Rollout may have been faster than poll interval; relying on concurrency invariant alone." +elif [ "${t2_started_at}" -lt "${t1_finished_at}" ]; then + echo "Test: RollingDeployOrdering => FAILED (tier2 started at ${t2_started_at} BEFORE tier1 finished at ${t1_finished_at})" + exit 1 +else + echo "Order verified: tier1 finished at ${t1_finished_at}, tier2 started at ${t2_started_at}." +fi + +echo "Test: RollingDeployOrdering => SUCCESS"