From 401819da7bda0cb1d484f499207dca310c2f46b5 Mon Sep 17 00:00:00 2001 From: Kai Hudalla Date: Tue, 10 Mar 2026 09:20:15 +0100 Subject: [PATCH 1/2] [#312] Do not require HTTP server to start The Qpid Dispatch Router tries to create an HTTP server during startup. This HTTP server has been used in k8s probes to determine the router's health status. The creation of the HTTP server may fail if the router is not running with an abundant amount of resources. The right thing to do would probably be to fix the underlying issue that causes the HTTP server to fail, which seems to be caused by a bug in libwebsockets that fails to correctly determine the available resources when running in a kind-managed k8s environment. This seems to have been done already in the latest version of libwebsockets, but there is no Dispatch Router container image (publicly) available that uses a version of libwebsockets including the fix. Therefore, this commit changes the k8s probes to check the availability of the internal AMQP listener instead of the healthz resource exposed via the HTTP server. This way, the router might still fail to create the HTTP server, but because it is no longer needed, the router will be considered healthy as long as the internal AMQP listener is available and will thus reach the Ready state. Fixes #312 --- charts/hono/Chart.yaml | 2 +- charts/hono/config/router/qdrouterd.json | 15 ++++----- .../dispatch-router-deployment.yaml | 33 ++++++++++++++++--- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/charts/hono/Chart.yaml b/charts/hono/Chart.yaml index 67b9b4e0..1c6268d6 100755 --- a/charts/hono/Chart.yaml +++ b/charts/hono/Chart.yaml @@ -15,7 +15,7 @@ name: hono description: | Eclipse Hono™ provides remote service interfaces for connecting large numbers of IoT devices to a back end and interacting with them in a uniform way regardless of the device communication protocol. 
-version: 2.7.0 +version: 2.7.1 # Version of Hono being deployed by the chart appVersion: 2.7.0 keywords: diff --git a/charts/hono/config/router/qdrouterd.json b/charts/hono/config/router/qdrouterd.json index d60ba6fb..1749278f 100644 --- a/charts/hono/config/router/qdrouterd.json +++ b/charts/hono/config/router/qdrouterd.json @@ -33,7 +33,8 @@ "linkCapacity": 100, "authenticatePeer": true, "saslMechanisms": "PLAIN", - "saslPlugin": "Hono Auth" + "saslPlugin": "Hono Auth", + "http": false }], {{- if .Values.amqpMessagingNetworkExample.insecurePortEnabled }} @@ -43,7 +44,8 @@ "linkCapacity": 100, "authenticatePeer": true, "saslMechanisms": "PLAIN", - "saslPlugin": "Hono Auth" + "saslPlugin": "Hono Auth", + "http": false }], {{- end }} @@ -63,13 +65,8 @@ "port": 5673, "linkCapacity": 100, "authenticatePeer": true, - "saslMechanisms": "EXTERNAL" - }], - - ["listener", { - "host": "0.0.0.0", - "port": 8088, - "http": true + "saslMechanisms": "EXTERNAL", + "http": false }], ["sslProfile", { diff --git a/charts/hono/templates/dispatch-router/dispatch-router-deployment.yaml b/charts/hono/templates/dispatch-router/dispatch-router-deployment.yaml index aafa3b7b..460dabf8 100644 --- a/charts/hono/templates/dispatch-router/dispatch-router-deployment.yaml +++ b/charts/hono/templates/dispatch-router/dispatch-router-deployment.yaml @@ -1,7 +1,7 @@ {{- $amqpEnabled := and ( has "amqp" .Values.messagingNetworkTypes ) .Values.amqpMessagingNetworkExample.enabled -}} {{- if $amqpEnabled }} # -# Copyright (c) 2019, 2022 Contributors to the Eclipse Foundation +# Copyright (c) 2019 Contributors to the Eclipse Foundation # # See the NOTICE file(s) distributed with this work for additional # information regarding copyright ownership. @@ -57,14 +57,37 @@ spec: - name: internal containerPort: 5673 protocol: TCP - - name: health - containerPort: {{ include "hono.healthCheckPort" . }} - protocol: TCP {{- with .Values.amqpMessagingNetworkExample.dispatchRouter.resources }} resources: {{- . 
| toYaml | nindent 10 }} {{- end }} - {{- include "hono.component.healthChecks" $args | nindent 8 }} + # In general, we would rather use the /healthz resource provided by qdrouterd's HTTP server. + # However, the libwebsockets library used by older dispatch router images will try to allocate + # memory for a lot of file descriptors and fail with an OOM during startup, effectively preventing + # the HTTP server from starting and in turn preventing the pod from ever reaching the ready state. + # see https://github.com/eclipse-packages/packages/issues/312 + # + # We therefore use a simple TCP socket check on the internal AMQP port for the liveness and readiness probes, + # which is sufficient for our use case. + {{- $probes := mergeOverwrite ( .Values.probes | deepCopy ) ( $args.componentConfig.probes | default dict | deepCopy ) }} + {{- $deprecatedLivenessProbeInitialDelaySeconds := default .Values.livenessProbeInitialDelaySeconds $args.componentConfig.livenessProbeInitialDelaySeconds }} + {{- $deprecatedReadinessProbeInitialDelaySeconds := default .Values.readinessProbeInitialDelaySeconds $args.componentConfig.readinessProbeInitialDelaySeconds }} + livenessProbe: + tcpSocket: + port: 5673 + periodSeconds: {{ $probes.livenessProbe.periodSeconds }} + failureThreshold: {{ $probes.livenessProbe.failureThreshold }} + initialDelaySeconds: {{ default $probes.livenessProbe.initialDelaySeconds $deprecatedLivenessProbeInitialDelaySeconds }} + successThreshold: {{ $probes.livenessProbe.successThreshold }} + timeoutSeconds: {{ $probes.livenessProbe.timeoutSeconds }} + readinessProbe: + tcpSocket: + port: 5673 + periodSeconds: {{ $probes.readinessProbe.periodSeconds }} + failureThreshold: {{ $probes.readinessProbe.failureThreshold }} + initialDelaySeconds: {{ default $probes.readinessProbe.initialDelaySeconds $deprecatedReadinessProbeInitialDelaySeconds }} + successThreshold: {{ $probes.readinessProbe.successThreshold }} + timeoutSeconds: {{ $probes.readinessProbe.timeoutSeconds }} 
securityContext: privileged: false volumeMounts: From 7e2dc448a78d5d86f9a0e454c4baa7733bb88c30 Mon Sep 17 00:00:00 2001 From: Kai Hudalla Date: Tue, 10 Mar 2026 09:32:39 +0100 Subject: [PATCH 2/2] Update to more recent k8s version Also update the tools and actions used by the CI job to their most recent versions. --- .github/ct.yaml | 7 ++-- .github/workflows/ci.yaml | 68 ++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/.github/ct.yaml b/.github/ct.yaml index 87d72502..ce5ece37 100644 --- a/.github/ct.yaml +++ b/.github/ct.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2019, 2022 Contributors to the Eclipse Foundation +# Copyright (c) 2019 Contributors to the Eclipse Foundation # # See the NOTICE file(s) distributed with this work for additional # information regarding copyright ownership. @@ -9,8 +9,8 @@ # # SPDX-License-Identifier: EPL-2.0 -# consider helm install to be failed after 30 minutes -helm-extra-args: --timeout 30m +# consider helm install to be failed after 10 minutes +helm-extra-args: --timeout 10m check-version-increment: true debug: true chart-dirs: @@ -25,3 +25,4 @@ chart-repos: excluded-charts: - telemetry-e2e - cloud2edge +target-branch: master diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 5031a399..609273bb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2019, 2023 Contributors to the Eclipse Foundation +# Copyright (c) 2019 Contributors to the Eclipse Foundation # # See the NOTICE file(s) distributed with this work for additional # information regarding copyright ownership. 
@@ -13,8 +13,8 @@ name: ci env: CONFIG_OPTION_CHART_TESTING: "--config .github/ct.yaml" - VERSION_CHART_TESTING: "v3.5.1" - VERSION_HELM: "v3.9.2" + VERSION_CHART_TESTING: "3.14.0" + VERSION_HELM: "3.19.0" VERSION_PYTHON: "3.8" on: pull_request: @@ -27,7 +27,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Lint Bash scripts uses: docker://koalaman/shellcheck-alpine:v0.9.0 with: @@ -35,21 +35,23 @@ jobs: lint-chart: runs-on: ubuntu-latest - needs: lint-bash-scripts steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Fetch history for chart testing + # allow skipping this step when running the workflow locally in act, as act does not support shallow clones + # and thus the fetch would fail + if: ${{ !env.ACT }} run: git fetch --prune --unshallow - name: Set up Helm - uses: azure/setup-helm@v3.5 + uses: azure/setup-helm@v4 with: version: ${{ env.VERSION_HELM }} - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v6 with: python-version: ${{ env.VERSION_PYTHON }} - name: Set up chart-testing - uses: helm/chart-testing-action@v2.2.1 + uses: helm/chart-testing-action@v2 with: version: ${{ env.VERSION_CHART_TESTING }} - name: Run chart-testing (lint) @@ -64,17 +66,18 @@ jobs: # which a folder exists at # https://github.com/yannh/kubernetes-json-schema/ k8s: - - v1.25.13 - - v1.26.8 - - v1.27.5 - - v1.28.1 + - v1.32.9 + - v1.33.8 + - v1.34.4 + - v1.35.1 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Fetch history for chart testing + if: ${{ !env.ACT }} run: git fetch --prune --unshallow - name: Set up Helm - uses: azure/setup-helm@v3.5 + uses: azure/setup-helm@v4 with: version: ${{ env.VERSION_HELM }} - name: Run kubeval @@ -85,33 +88,32 @@ jobs: install-chart: name: install-chart runs-on: ubuntu-latest - needs: - - lint-chart - - kubeval-chart + needs: kubeval-chart strategy: matrix: k8s: # the versions 
supported by chart-testing are the tags # available for the docker.io/kindest/node image # https://hub.docker.com/r/kindest/node/tags - - v1.25.11 - - v1.26.6 - - v1.27.3 - - v1.28.0 + - v1.32.11 + - v1.33.7 + - v1.34.3 + - v1.35.1 steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Fetch history for chart testing + if: ${{ !env.ACT }} run: git fetch --prune --unshallow - name: Set up Helm - uses: azure/setup-helm@v3.5 + uses: azure/setup-helm@v4 with: version: ${{ env.VERSION_HELM }} - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v6 with: python-version: ${{ env.VERSION_PYTHON }} - name: Set up chart-testing - uses: helm/chart-testing-action@v2.2.1 + uses: helm/chart-testing-action@v2 with: version: ${{ env.VERSION_CHART_TESTING }} - name: Check for changed charts @@ -122,17 +124,16 @@ jobs: echo "changed=true" >> $GITHUB_OUTPUT fi - name: Create kind ${{ matrix.k8s }} cluster - uses: helm/kind-action@v1.8.0 + uses: helm/kind-action@v1.14.0 with: - version: v0.20.0 + version: v0.31.0 config: .github/kind-config.yaml node_image: kindest/node:${{ matrix.k8s }} if: ${{ steps.list-changed.outputs.changed == 'true' }} - name: Deploy ingress controller if: ${{ steps.list-changed.outputs.changed == 'true' }} run: | - # upgrade after: https://github.com/kubernetes/ingress-nginx/issues/6245 - kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.0.3/deploy/static/provider/kind/deploy.yaml + kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.14.3/deploy/static/provider/kind/deploy.yaml - name: Wait for the ingress controller to become ready if: ${{ steps.list-changed.outputs.changed == 'true' }} run: | @@ -140,12 +141,6 @@ jobs: --for=condition=ready pod \ --selector=app.kubernetes.io/component=controller \ --timeout=180s - kubectl -n ingress-nginx set resources deployment ingress-nginx-controller --requests=cpu=0 - sleep 120 - kubectl wait 
--namespace ingress-nginx \ - --for=condition=ready pod \ - --selector=app.kubernetes.io/component=controller \ - --timeout=180s - if: ${{ failure() }} run: | kubectl -n ingress-nginx describe pods @@ -157,6 +152,7 @@ jobs: if: ${{ steps.list-changed.outputs.changed == 'true' }} run: .github/chart-ci-init.sh - name: Run chart-testing (install) + if: ${{ steps.list-changed.outputs.changed == 'true' }} run: ct install ${{ env.CONFIG_OPTION_CHART_TESTING }} - name: Record logs