From 856875a38f0f8fbb8fef7cada5f47d9648ecc980 Mon Sep 17 00:00:00 2001
From: Raghav Aggarwal
Date: Fri, 23 Jan 2026 02:14:34 +0530
Subject: [PATCH] TEZ-4682: [Cloud] Tez AM docker image

---
 tez-dist/pom.xml                      |  32 +++++
 tez-dist/src/docker/Dockerfile        |  85 +++++++++++++
 tez-dist/src/docker/build-docker.sh   | 128 ++++++++++++++++++++
 tez-dist/src/docker/conf/tez-site.xml |  52 ++++++++
 tez-dist/src/docker/entrypoint.sh     | 164 ++++++++++++++++++++++++++
 5 files changed, 461 insertions(+)
 create mode 100644 tez-dist/src/docker/Dockerfile
 create mode 100755 tez-dist/src/docker/build-docker.sh
 create mode 100644 tez-dist/src/docker/conf/tez-site.xml
 create mode 100644 tez-dist/src/docker/entrypoint.sh

diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml
index 9777d0c0b9..31dae3a28e 100644
--- a/tez-dist/pom.xml
+++ b/tez-dist/pom.xml
@@ -118,6 +118,38 @@
+
+ docker
+
+
+
+ org.codehaus.mojo
+ exec-maven-plugin
+
+
+ build-docker-image
+ package
+
+ exec
+
+
+ /bin/bash
+
+ ${project.basedir}/src/docker/build-docker.sh
+ -hadoop
+ ${hadoop.version}
+ -tez
+ ${project.version}
+ -repo
+ apache
+
+
+
+
+
+
+
+
diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/Dockerfile
new file mode 100644
index 0000000000..b3b8821544
--- /dev/null
+++ b/tez-dist/src/docker/Dockerfile
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ARG BUILD_ENV=unarchive
+
+FROM ubuntu:24.04 AS unarchive
+ONBUILD COPY hadoop-*.tar.gz /opt
+# Matches the locally built Tez tarball, e.g. "tez-1.0.0-SNAPSHOT.tar.gz"
+ONBUILD COPY tez-*.tar.gz /opt
+
+FROM ${BUILD_ENV} AS env
+ARG HADOOP_VERSION
+ARG TEZ_VERSION
+
+RUN mkdir -p /opt/hadoop \
+    && tar -xz \
+        --exclude="hadoop-$HADOOP_VERSION/share/doc" \
+        --exclude="*/jdiff" \
+        --exclude="*/sources" \
+        --exclude="*tests.jar" \
+        --exclude="*/webapps" \
+        -f "/opt/hadoop-$HADOOP_VERSION.tar.gz" \
+        -C /opt/hadoop --strip-components 1 \
+    && mkdir -p /opt/tez \
+    && tar -xz \
+        -f "/opt/tez-$TEZ_VERSION.tar.gz" \
+        -C /opt/tez \
+    && rm -rf "/opt/hadoop-$HADOOP_VERSION.tar.gz" "/opt/tez-$TEZ_VERSION.tar.gz"
+
+FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run
+
+ARG UID=1000
+ARG HADOOP_VERSION
+ARG TEZ_VERSION
+
+# Install the tools entrypoint.sh relies on (find, envsubst, ps) and create the runtime user
+RUN set -ex; \
+    microdnf update -y; \
+    microdnf -y install procps gettext findutils; \
+    microdnf clean all; \
+    useradd --no-create-home -s /sbin/nologin -c "" --uid "$UID" tez
+
+# Set necessary environment variables
+ENV HADOOP_HOME=/opt/hadoop \
+    TEZ_HOME=/opt/tez \
+    TEZ_CONF_DIR=/opt/tez/conf \
+    HADOOP_CONF_DIR=/opt/tez/conf
+
+ENV PATH=$TEZ_HOME/bin:$HADOOP_HOME/bin:$PATH
+
+COPY --from=env --chown=tez:tez /opt/hadoop $HADOOP_HOME
+# /opt/tez is the version-independent directory populated in the 'env' stage
+COPY --from=env --chown=tez:tez /opt/tez $TEZ_HOME
+
+# Create the config directory and the plugin extension point in a single layer, both owned by tez
+RUN mkdir -p "$TEZ_CONF_DIR" /opt/tez/plugins \
+    && chown tez:tez "$TEZ_CONF_DIR" /opt/tez/plugins \
+    && chmod 755 /opt/tez/plugins
+
+COPY --chown=tez:tez entrypoint.sh /
+COPY --chown=tez:tez conf $TEZ_CONF_DIR
+
+RUN chmod +x /entrypoint.sh
+
+USER tez
+WORKDIR $TEZ_HOME
+
+# AM client RPC port range (10001-10003), mocked NodeManager HTTP port (8042), ZooKeeper client port (2181)
+EXPOSE 10001 10002 10003 8042 2181
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/tez-dist/src/docker/build-docker.sh
b/tez-dist/src/docker/build-docker.sh
new file mode 100755
index 0000000000..fabe94ed77
--- /dev/null
+++ b/tez-dist/src/docker/build-docker.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euxo pipefail
+
+HADOOP_VERSION=
+TEZ_VERSION=
+REPO=
+
+usage() {
+  cat <<EOF 1>&2
+Usage: $0 [-h] [-hadoop <Hadoop version>] [-tez <Tez version>] [-repo <Docker repo>]
+Build the Apache Tez AM Docker image
+-help                Display help
+-hadoop              Build image with the specified Hadoop version
+-tez                 Build image with the specified Tez version
+-repo                Docker repository
+EOF
+}
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    -h | -help)
+      usage
+      exit 0
+      ;;
+    -hadoop)
+      shift
+      HADOOP_VERSION=${1?"missing value for -hadoop"}
+      shift
+      ;;
+    -tez)
+      shift
+      TEZ_VERSION=${1?"missing value for -tez"}
+      shift
+      ;;
+    -repo)
+      shift
+      REPO=${1?"missing value for -repo"}
+      shift
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
+
+# Absolute directory of this script; '&&' makes a failed cd abort instead of silently using the cwd
+SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
+
+DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../.."}
+PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../.."}
+
+repo=${REPO:-apache}
+WORK_DIR="$(mktemp -d)"
+# Remove the temporary build context on every exit path, including failures under 'set -e'
+trap 'rm -rf "$WORK_DIR"' EXIT
+CACHE_DIR="$SCRIPT_DIR/cache"
+mkdir -p "$CACHE_DIR"
+
+# Default the Hadoop and Tez versions from pom.xml if not provided
+HADOOP_VERSION=${HADOOP_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=hadoop.version -DforceStdout)}
+TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)}
+
+######################
+# HADOOP FETCH LOGIC #
+######################
+HADOOP_FILE_NAME="hadoop-$HADOOP_VERSION.tar.gz"
+HADOOP_URL=${HADOOP_URL:-"https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"}
+if [ ! -f "$CACHE_DIR/$HADOOP_FILE_NAME" ]; then
+  echo "Downloading Hadoop from $HADOOP_URL..."
+  if ! curl --fail -L "$HADOOP_URL" -o "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"; then
+    rm -f "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"
+    echo "Fail to download Hadoop, exiting...." >&2
+    exit 1
+  fi
+  mv "$CACHE_DIR/$HADOOP_FILE_NAME.tmp" "$CACHE_DIR/$HADOOP_FILE_NAME"
+fi
+
+#####################################
+# Pick tez tarball from local build #
+#####################################
+TEZ_FILE_NAME="tez-$TEZ_VERSION.tar.gz"
+LOCAL_DIST_PATH="$DIST_DIR/target/$TEZ_FILE_NAME"
+
+if [ -f "$LOCAL_DIST_PATH" ]; then
+  echo "--> Found local Tez build artifact at: $LOCAL_DIST_PATH"
+  cp "$LOCAL_DIST_PATH" "$WORK_DIR/"
+else
+  echo "--> Error: Local Tez artifact not found at $LOCAL_DIST_PATH" >&2
+  echo "--> Please build the project first (e.g., mvn clean install -DskipTests)." >&2
+  exit 1
+fi
+
+# -------------------------------------------------------------------------
+# BUILD CONTEXT PREPARATION
+# -------------------------------------------------------------------------
+cp "$CACHE_DIR/$HADOOP_FILE_NAME" "$WORK_DIR/"
+cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf"
+cp "$SCRIPT_DIR/entrypoint.sh" "$WORK_DIR/"
+cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/"
+
+echo "Building Docker image..."
+docker build \
+  "$WORK_DIR" \
+  -f "$WORK_DIR/Dockerfile" \
+  -t "$repo/tez-am:$TEZ_VERSION" \
+  --build-arg "BUILD_ENV=unarchive" \
+  --build-arg "HADOOP_VERSION=$HADOOP_VERSION" \
+  --build-arg "TEZ_VERSION=$TEZ_VERSION"
+
+echo "Docker image $repo/tez-am:$TEZ_VERSION built successfully."
diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/conf/tez-site.xml
new file mode 100644
index 0000000000..b76fcf026f
--- /dev/null
+++ b/tez-dist/src/docker/conf/tez-site.xml
@@ -0,0 +1,52 @@
+
+
+
+
+
+
+ tez.am.client.am.port-range
+ 10001-10003
+
+
+
+ tez.am.resource.memory.mb
+ 1024
+
+
+
+ tez.framework.mode
+ STANDALONE_ZOOKEEPER
+
+
+
+ tez.am.zookeeper.quorum
+ host.docker.internal:2181
+
+
+
+ tez.am.log.level
+ DEBUG
+
+
+
+ tez.am.mode.session
+ true
+
+
+
+
diff --git a/tez-dist/src/docker/entrypoint.sh b/tez-dist/src/docker/entrypoint.sh
new file mode 100644
index 0000000000..65617f743b
--- /dev/null
+++ b/tez-dist/src/docker/entrypoint.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euxo pipefail
+
+#######################
+# 1. PLUGIN EXTENSION #
+#######################
+# The directory /opt/tez/plugins is intended to be a volume mount point.
+# If custom jars are present, we add them to classpath.
+
+PLUGIN_DIR="/opt/tez/plugins"
+PLUGIN_CLASSPATH=""
+
+if [ -d "$PLUGIN_DIR" ]; then
+  count=$(find "$PLUGIN_DIR" -maxdepth 1 -name "*.jar" 2>/dev/null | wc -l)
+  if [ "$count" -gt 0 ]; then
+    echo "--> Found $count custom jars in $PLUGIN_DIR. Adding to classpath..."
+    PLUGIN_CLASSPATH="$PLUGIN_DIR/*"
+  else
+    echo "--> Plugin directory exists but contains no jars."
+  fi
+fi
+
+# =========================================================================
+# 2. CONFIGURATION HANDLING
+# =========================================================================
+# 1. Custom Conf Dir: If mounted, symlink it to use it directly.
+# 2. Templates: If not custom, use envsubst to generate configs from ENV.
+
+# HADOOP_CONF_DIR points at TEZ_CONF_DIR, so populate it with the defaults
+# from the Hadoop installation for every file the user did not provide.
+if [ -d "$HADOOP_HOME/etc/hadoop" ]; then
+  echo "--> Linking missing Hadoop configs to $TEZ_CONF_DIR..."
+  for f in "$HADOOP_HOME/etc/hadoop"/*; do
+    [ -e "$f" ] || continue # unmatched glob (empty directory): nothing to link
+    conf_name=$(basename "$f")
+    # Skip files that already exist so the user can supply e.g. a custom hdfs-site.xml
+    if [ ! -e "$TEZ_CONF_DIR/$conf_name" ]; then
+      ln -s "$f" "$TEZ_CONF_DIR/$conf_name"
+    fi
+  done
+fi
+
+###########################
+# Custom Config directory #
+###########################
+if [ -n "${TEZ_CUSTOM_CONF_DIR:-}" ] && [ -d "$TEZ_CUSTOM_CONF_DIR" ]; then
+  echo "--> Using custom configuration directory: $TEZ_CUSTOM_CONF_DIR"
+  find "${TEZ_CUSTOM_CONF_DIR}" -type f -exec ln -sfn {} "${TEZ_CONF_DIR}"/ \;
+else
+  echo "--> Generating configuration from templates..."
+  # Set defaults for template variables if not provided
+  export TEZ_AM_RPC_PORT=${TEZ_AM_RPC_PORT:-10001}
+  export TEZ_AM_RESOURCE_MEMORY=${TEZ_AM_RESOURCE_MEMORY:-1024}
+
+  # Process templates
+  if [ -f "$TEZ_CONF_DIR/tez-site.xml.template" ]; then
+    envsubst < "$TEZ_CONF_DIR/tez-site.xml.template" > "$TEZ_CONF_DIR/tez-site.xml"
+  fi
+fi
+
+####################
+# Find TEZ DAG JAR #
+####################
+# -print -quit stops at the first match without a pipeline, so 'pipefail' cannot trip on a closed pipe
+TEZ_DAG_JAR=$(find "$TEZ_HOME" -maxdepth 1 -name "tez-dag-*.jar" ! -name "*-tests.jar" -print -quit)
+
+if [ -z "$TEZ_DAG_JAR" ]; then
+  echo "Error: Could not find tez-dag-*.jar in $TEZ_HOME"
+  ls -l "$TEZ_HOME"
+  exit 1
+fi
+
+###############################
+# YARN ENVIRONMENT SIMULATION #
+###############################
+# 1. Application submit time (epoch seconds from 'date' scaled to milliseconds)
+export APP_SUBMIT_TIME_ENV=${APP_SUBMIT_TIME_ENV:-$(($(date +%s) * 1000))}
+
+# 2. Container ID
+export CONTAINER_ID=${CONTAINER_ID:-"container_1700000000000_0001_01_000001"}
+
+# 3. NodeManager Details
+export NM_HOST=${NM_HOST:-"localhost"}
+export NM_PORT=${NM_PORT:-"12345"}
+export NM_HTTP_PORT=${NM_HTTP_PORT:-"8042"}
+export LOCAL_DIRS=${LOCAL_DIRS:-"/tmp"}
+export LOG_DIRS=${LOG_DIRS:-"/opt/tez/logs"}
+
+# 4. User Identity
+export HADOOP_USER_NAME=${HADOOP_USER_NAME:-"tez"}
+export USER=${HADOOP_USER_NAME}
+
+export TEZ_AM_EXTERNAL_ID=${TEZ_AM_EXTERNAL_ID:-"tez-session-$(hostname)"}
+
+echo "--> Mocked YARN Environment:"
+echo " APP_SUBMIT_TIME_ENV: $APP_SUBMIT_TIME_ENV"
+echo " CONTAINER_ID: $CONTAINER_ID"
+echo " USER: $USER"
+
+mkdir -p "$LOG_DIRS"
+
+if [ ! -f "tez-conf.pb" ]; then
+  touch "tez-conf.pb"
+  echo "--> Created dummy tez-conf.pb"
+fi
+
+#############
+# EXECUTION #
+#############
+
+CLASSPATH="${TEZ_CONF_DIR}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*"
+
+if [ -n "$PLUGIN_CLASSPATH" ]; then
+  CLASSPATH="${CLASSPATH}:${PLUGIN_CLASSPATH}"
+fi
+
+export HADOOP_USER_CLASSPATH_FIRST=true
+
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/lib/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/*"
+CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*"
+
+echo "--> Starting DAGAppMaster with JAR: $TEZ_DAG_JAR"
+echo "--> HADOOP_CONF_DIR: $HADOOP_CONF_DIR"
+
+exec java \
+  --add-opens java.base/java.lang=ALL-UNNAMED \
+  --add-opens java.base/java.util=ALL-UNNAMED \
+  --add-opens java.base/java.lang.reflect=ALL-UNNAMED \
+  --add-opens java.base/java.text=ALL-UNNAMED \
+  --add-opens java.base/java.nio=ALL-UNNAMED \
+  --add-opens java.base/sun.nio.ch=ALL-UNNAMED \
+  --add-opens java.base/java.util.concurrent=ALL-UNNAMED \
+  --add-opens java.base/java.util.concurrent.atomic=ALL-UNNAMED \
+  -Duser.name="$HADOOP_USER_NAME" \
+  -Djava.library.path="$HADOOP_HOME/lib/native" \
+  -Dhadoop.home.dir="$HADOOP_HOME" \
+  -Dhadoop.log.dir="$LOG_DIRS" \
+  -Dtez.conf.dir="$TEZ_CONF_DIR" \
+  -cp "$CLASSPATH" \
+  org.apache.tez.dag.app.DAGAppMaster \
+  "$@"