65 changes: 65 additions & 0 deletions Containerfile.compute_worker_podman
@@ -0,0 +1,65 @@
FROM fedora:37

# Include deps
RUN dnf -y update && \
# https://bugzilla.redhat.com/show_bug.cgi?id=1995337#c3
rpm --setcaps shadow-utils 2>/dev/null && \
dnf -y install podman fuse-overlayfs python3.8 python3-pip \
--exclude container-selinux && \
dnf clean all && \
rm -rf /var/cache /var/log/dnf* /var/log/yum.*

# Set up the unprivileged worker user; the subuid/subgid ranges below provide the user-namespace mappings rootless podman needs
RUN useradd worker; \
echo -e "worker:1:999\nworker:1001:64535" > /etc/subuid; \
echo -e "worker:1:999\nworker:1001:64535" > /etc/subgid;

# Copy over the podman container configuration
COPY podman/containers.conf /etc/containers/containers.conf
COPY podman/worker-containers.conf /home/worker/.config/containers/containers.conf

# Copy over the podman storage configuration
COPY podman/worker-storage.conf /home/worker/.config/containers/storage.conf

RUN mkdir -p /home/worker/.local/share/containers && \
chown worker:worker -R /home/worker && \
chmod 644 /etc/containers/containers.conf

# Copy & modify the defaults to provide a reference if runtime changes are needed.
# Changes here are required for running with fuse-overlayfs storage inside the container.
RUN sed -e 's|^#mount_program|mount_program|g' \
-e '/additionalimage.*/a "/var/lib/shared",' \
-e 's|^mountopt[[:space:]]*=.*$|mountopt = "nodev,fsync=0"|g' \
/usr/share/containers/storage.conf \
> /etc/containers/storage.conf

# Add volume for containers
VOLUME /home/worker/.local/share/containers

# Create directory for tmp space
RUN mkdir /codabench && \
chown worker:worker /codabench

# Set up podman registry for dockerhub
RUN echo -e "[registries.search]\nregistries = ['docker.io']\n" > /etc/containers/registries.conf

# Make output unbuffered so results appear in stdout immediately
ENV PYTHONUNBUFFERED 1
ENV CONTAINER_ENGINE_EXECUTABLE podman

# Get pip for 3.8
RUN python3.8 -m ensurepip --upgrade

WORKDIR /home/worker/compute_worker

ADD compute_worker/ /home/worker/compute_worker

RUN chown worker:worker -R /home/worker/compute_worker

RUN pip3.8 install -r /home/worker/compute_worker/compute_worker_requirements.txt

CMD celery -A compute_worker worker \
    -l info \
    -Q compute-worker \
    -n compute-worker@%n \
    --concurrency=1

Review thread:

Contributor: @dtuantran How much space does this save? Just wondering if it's worth it, given the loss of readability and ease of development from combining multiple steps.

Contributor: It doesn't save much space. Sorry I pushed it too quickly; I've reverted this file.
66 changes: 66 additions & 0 deletions Containerfile.compute_worker_podman_gpu
@@ -0,0 +1,66 @@
FROM fedora:37

# Include deps
RUN curl -s -L https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo | tee /etc/yum.repos.d/cuda.repo && \
curl -s -L https://nvidia.github.io/nvidia-docker/rhel9.0/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo && \
rpm -Uvh http://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm && \
rpm -Uvh http://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm && \
dnf -y update && \
dnf module install -y nvidia-driver:latest-dkms && \
dnf -y install podman fuse-overlayfs python3.8 python3-pip nvidia-container-runtime nvidia-container-toolkit \
cuda --exclude container-selinux && \
dnf clean all && \
rm -rf /var/cache /var/log/dnf* /var/log/yum.*

# Set up the unprivileged worker user; the subuid/subgid ranges below provide the user-namespace mappings rootless podman needs
RUN useradd worker; \
echo -e "worker:1:999\nworker:1001:64535" > /etc/subuid; \
echo -e "worker:1:999\nworker:1001:64535" > /etc/subgid;

# Copy over the podman container configuration
COPY podman/containers.conf /etc/containers/containers.conf
COPY podman/worker-containers.conf /home/worker/.config/containers/containers.conf

# Copy over the podman storage configuration
COPY podman/worker-storage.conf /home/worker/.config/containers/storage.conf

RUN mkdir -p /home/worker/.local/share/containers && \
chown worker:worker -R /home/worker && \
chmod 644 /etc/containers/containers.conf

# Copy & modify the defaults to provide a reference if runtime changes are needed.
# Changes here are required for running with fuse-overlayfs storage inside the container.
RUN sed -e 's|^#mount_program|mount_program|g' \
    -e '/additionalimage.*/a "/var/lib/shared",' \
    -e 's|^mountopt[[:space:]]*=.*$|mountopt = "nodev,fsync=0"|g' \
    /usr/share/containers/storage.conf \
    > /etc/containers/storage.conf && \
    # Disable cgroup management in the NVIDIA container runtime, needed for the nested podman setup used here
    sed -i 's/^#no-cgroups = false/no-cgroups = true/;' /etc/nvidia-container-runtime/config.toml


# Add volume for containers
VOLUME /home/worker/.local/share/containers

# Make output unbuffered so results appear in stdout immediately
ENV PYTHONUNBUFFERED 1
ENV CONTAINER_ENGINE_EXECUTABLE podman

# Create directory for tmp space
RUN mkdir /codabench && \
chown worker:worker /codabench && \
# Set up podman registry for dockerhub
echo -e "[registries.search]\nregistries = ['docker.io']\n" > /etc/containers/registries.conf && \
# Get pip for 3.8
python3.8 -m ensurepip --upgrade

WORKDIR /home/worker/compute_worker

ADD compute_worker/ /home/worker/compute_worker

RUN chown worker:worker -R /home/worker/compute_worker && \
pip3.8 install -r /home/worker/compute_worker/compute_worker_requirements.txt

CMD nvidia-smi && celery -A compute_worker worker \
    -l info \
    -Q compute-worker \
    -n compute-worker@%n \
    --concurrency=1
4 changes: 2 additions & 2 deletions Dockerfile.compute_worker
@@ -6,10 +6,10 @@ ENV PYTHONUNBUFFERED 1
 # Install Docker
 RUN apt-get update && curl -fsSL https://get.docker.com | sh
 
-ADD docker/compute_worker/compute_worker_requirements.txt .
+ADD compute_worker/compute_worker_requirements.txt .
 RUN pip install -r compute_worker_requirements.txt
 
-ADD docker/compute_worker .
+ADD compute_worker .
 
 CMD celery -A compute_worker worker \
     -l info \
4 changes: 2 additions & 2 deletions Dockerfile.compute_worker_gpu
@@ -19,9 +19,9 @@ RUN apt-get update && apt-get install -y nvidia-docker2
 ENV NVIDIA_DOCKER 1
 
 # Python reqs and actual worker stuff
-ADD docker/compute_worker/compute_worker_requirements.txt .
+ADD compute_worker/compute_worker_requirements.txt .
 RUN pip3 install -r compute_worker_requirements.txt
-ADD docker/compute_worker .
+ADD compute_worker .
 
 CMD celery -A compute_worker worker \
     -l info \
File renamed without changes.
@@ -65,6 +65,14 @@
     STATUS_FAILED,
 )
 
+# Set up the container engine that we are using
+if os.environ.get("CONTAINER_ENGINE_EXECUTABLE"):
+    CONTAINER_ENGINE_EXECUTABLE = os.environ.get("CONTAINER_ENGINE_EXECUTABLE")
+# We could probably deprecate this now that we can specify the executable
+elif os.environ.get("NVIDIA_DOCKER"):
+    CONTAINER_ENGINE_EXECUTABLE = "nvidia-docker"
+else:
+    CONTAINER_ENGINE_EXECUTABLE = "docker"
 
 class SubmissionException(Exception):
     pass
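The precedence above matters: an explicit CONTAINER_ENGINE_EXECUTABLE always wins over the legacy NVIDIA_DOCKER flag. A minimal standalone sketch of the same selection logic (illustrative only, not part of the diff):

    def resolve_engine(env):
        # Explicit executable first, then the legacy NVIDIA flag, then plain docker.
        if env.get("CONTAINER_ENGINE_EXECUTABLE"):
            return env["CONTAINER_ENGINE_EXECUTABLE"]
        if env.get("NVIDIA_DOCKER"):
            return "nvidia-docker"
        return "docker"

    assert resolve_engine({"CONTAINER_ENGINE_EXECUTABLE": "podman", "NVIDIA_DOCKER": "1"}) == "podman"
    assert resolve_engine({"NVIDIA_DOCKER": "1"}) == "nvidia-docker"
    assert resolve_engine({}) == "docker"

This is why the podman Containerfiles set ENV CONTAINER_ENGINE_EXECUTABLE podman: the worker then shells out to podman everywhere it previously ran docker.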
@@ -181,7 +189,7 @@ def __init__(self, run_args):
         self.user_pk = run_args["user_pk"]
         self.submission_id = run_args["id"]
         self.submissions_api_url = run_args["submissions_api_url"]
-        self.docker_image = run_args["docker_image"]
+        self.container_image = run_args["docker_image"]
         self.secret = run_args["secret"]
         self.prediction_result = run_args["prediction_result"]
         self.scoring_result = run_args.get("scoring_result")
@@ -221,7 +229,7 @@ def __init__(self, run_args):
         self.requests_session.mount('https://', adapter)
 
     async def watch_detailed_results(self):
-        """Watches files alongside scoring + program docker containers, currently only used
+        """Watches files alongside scoring + program containers, currently only used
         for detailed_results.html"""
         if not self.detailed_results_url:
             return
@@ -314,15 +322,15 @@ def _update_status(self, status, extra_information=None):
         # })
         self._update_submission(data)
 
-    def _get_docker_image(self, image_name):
-        logger.info("Running docker pull for image: {}".format(image_name))
+    def _get_container_image(self, image_name):
+        logger.info("Running pull for image: {}".format(image_name))
         try:
-            cmd = ['docker', 'pull', image_name]
-            docker_pull = check_output(cmd)
-            logger.info("Docker pull complete for image: {0} with output of {1}".format(image_name, docker_pull))
+            cmd = [CONTAINER_ENGINE_EXECUTABLE, 'pull', image_name]
+            container_engine_pull = check_output(cmd)
+            logger.info("Pull complete for image: {0} with output of {1}".format(image_name, container_engine_pull))
         except CalledProcessError:
-            logger.info("Docker pull for image: {} returned a non-zero exit code!")
-            raise SubmissionException(f"Docker pull for {image_name} failed!")
+            logger.info("Pull for image: {} returned a non-zero exit code!".format(image_name))
+            raise SubmissionException(f"Pull for {image_name} failed!")
 
     def _get_bundle(self, url, destination, cache=True):
         """Downloads zip from url and unzips into destination. If cache=True then url is hashed and checked
@@ -357,17 +365,17 @@ def _get_bundle(self, url, destination, cache=True):
         # Give back zip file path for other uses, i.e. md5'ing the zip to ID it
         return bundle_file
 
-    async def _run_docker_cmd(self, docker_cmd, kind):
+    async def _run_container_engine_cmd(self, engine_cmd, kind):
         """This runs a command and asynchronously writes the data to both a storage file
         and a socket
 
-        :param docker_cmd: the list of docker command arguments
+        :param engine_cmd: the list of container engine command arguments
         :param kind: either 'ingestion' or 'program'
         :return:
         """
         start = time.time()
         proc = await asyncio.create_subprocess_exec(
-            *docker_cmd,
+            *engine_cmd,
             stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
         )
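For reference, a minimal standalone sketch of this asynchronous streaming pattern, assuming a trivial command and plain printing; the real method also forwards each chunk to a websocket and a storage file:

    import asyncio

    async def run_and_stream(cmd):
        # Launch the engine command and stream stdout line by line.
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        while True:
            line = await proc.stdout.readline()
            if not line:
                break
            print(line.decode().rstrip())  # the worker forwards this over a websocket instead
        return await proc.wait()

    # e.g. asyncio.run(run_and_stream(["podman", "--version"]))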
@@ -442,17 +450,23 @@ async def _run_docker_cmd(self, docker_cmd, kind):
         await websocket.close()
 
     def _get_host_path(self, *paths):
-        """Turns an absolute path inside our docker container, into what the path
-        would be on the host machine"""
+        """Turns an absolute path inside our container into what the path
+        would be on the host machine. We also ensure that the directory exists;
+        docker will create it if necessary, but other container engines such as
+        podman may not."""
         # Take our list of paths and smash 'em together
         path = os.path.join(*paths)
 
-        # pull front of path, which points to the location inside docker
+        # pull front of path, which points to the location inside the container
         path = path[len(BASE_DIR):]
 
-        # add host to front, so when we run commands in docker on the host they
+        # add host to front, so when we run container commands on the host they
         # can be seen properly
         path = os.path.join(HOST_DIRECTORY, path)
 
+        # Create if necessary
+        os.makedirs(path, exist_ok=True)
+
         return path

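To make the translation concrete, here is a standalone sketch assuming the docker-compose defaults, where /codabench inside the worker corresponds to ${HOST_DIRECTORY:-/tmp/codabench} on the host. BASE_DIR and HOST_DIRECTORY are defined elsewhere in compute_worker.py; the values below are assumptions. Note that the sketch strips the leading slash before joining, since os.path.join discards its first argument when the second is absolute:

    import os

    BASE_DIR = "/codabench"            # path inside the worker container (assumed)
    HOST_DIRECTORY = "/tmp/codabench"  # the same directory as seen on the host (assumed)

    def get_host_path(*paths):
        path = os.path.join(*paths)
        path = path[len(BASE_DIR):]    # "/codabench/run_1/input_data" -> "/run_1/input_data"
        path = os.path.join(HOST_DIRECTORY, path.lstrip("/"))
        os.makedirs(path, exist_ok=True)  # podman may not auto-create missing volume sources
        return path

    # get_host_path("/codabench/run_1", "input_data") -> "/tmp/codabench/run_1/input_data"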
@@ -494,13 +508,8 @@ async def _run_program_directory(self, program_dir, kind, can_be_output=False):
             )
             return
 
-        if os.environ.get("NVIDIA_DOCKER"):
-            docker_process_name = "nvidia-docker"
-        else:
-            docker_process_name = "docker"
-
-        docker_cmd = [
-            docker_process_name,
+        engine_cmd = [
+            CONTAINER_ENGINE_EXECUTABLE,
             'run',
             # Remove it after run
             '--rm',
@@ -528,21 +537,21 @@ async def _run_program_directory(self, program_dir, kind, can_be_output=False):
         else:
             ingested_program_location = "program"
 
-        docker_cmd += ['-v', f'{self._get_host_path(self.root_dir, ingested_program_location)}:/app/ingested_program']
+        engine_cmd += ['-v', f'{self._get_host_path(self.root_dir, ingested_program_location)}:/app/ingested_program']
 
         if self.input_data:
-            docker_cmd += ['-v', f'{self._get_host_path(self.root_dir, "input_data")}:/app/input_data']
+            engine_cmd += ['-v', f'{self._get_host_path(self.root_dir, "input_data")}:/app/input_data']
 
         if self.is_scoring:
             # For scoring programs, we want to have a shared directory just in case we have an ingestion program.
             # This will add the share dir regardless of ingestion or scoring, as long as we're `is_scoring`
-            docker_cmd += ['-v', f'{self._get_host_path(self.root_dir, "shared")}:/app/shared']
+            engine_cmd += ['-v', f'{self._get_host_path(self.root_dir, "shared")}:/app/shared']
 
         # Input from submission (or submission + ingestion combo)
-        docker_cmd += ['-v', f'{self._get_host_path(self.input_dir)}:/app/input']
+        engine_cmd += ['-v', f'{self._get_host_path(self.input_dir)}:/app/input']
 
         # Set the image name (i.e. "codalab/codalab-legacy") for the container
-        docker_cmd += [self.docker_image]
+        engine_cmd += [self.container_image]
 
         # Handle Legacy competitions by replacing anything in the run command
         command = replace_legacy_metadata_command(
@@ -553,12 +562,12 @@ async def _run_program_directory(self, program_dir, kind, can_be_output=False):
         )
 
         # Append the actual program to run
-        docker_cmd += command.split(' ')
+        engine_cmd += command.split(' ')
 
-        logger.info(f"Running program = {' '.join(docker_cmd)}")
+        logger.info(f"Running program = {' '.join(engine_cmd)}")
 
-        # This runs the docker command and asynchronously passes data back via websocket
-        return await self._run_docker_cmd(docker_cmd, kind=kind)
+        # This runs the container engine command and asynchronously passes data back via websocket
+        return await self._run_container_engine_cmd(engine_cmd, kind=kind)

def _put_dir(self, url, directory):
logger.info("Putting dir %s in %s" % (directory, url))
@@ -649,9 +658,9 @@ def prepare(self):
         for filename in glob.iglob(self.root_dir + '**/*.*', recursive=True):
             logger.info(filename)
 
-        # Before the run starts we want to download docker images, they may take a while to download
+        # Before the run starts we want to download images; they may take a while to download
         # and to do this during the run would subtract from the participants time.
-        self._get_docker_image(self.docker_image)
+        self._get_container_image(self.container_image)
 
     def start(self):
         if not self.is_scoring:
@@ -690,7 +699,7 @@ def start(self):
             else:
                 program_to_kill = self.program_container_name
             # Try and stop the program. If stop does not succeed
-            kill_code = subprocess.call(['docker', 'stop', str(program_to_kill)])
+            kill_code = subprocess.call([CONTAINER_ENGINE_EXECUTABLE, 'stop', str(program_to_kill)])
             logger.info(f'Kill process returned {kill_code}')
             if kind == 'program':
                 self.program_exit_code = return_code
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -201,7 +201,7 @@ services:
       - django
       - rabbit
     volumes:
-      - ./docker/compute_worker:/app
+      - ./compute_worker:/app
       - ${HOST_DIRECTORY:-/tmp/codabench}:/codabench
       # Actual connection back to docker parent to run things
       - /var/run/docker.sock:/var/run/docker.sock
12 changes: 12 additions & 0 deletions podman/containers.conf
@@ -0,0 +1,12 @@
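# Settings for running podman itself inside a container: run nested containers
# in the outer container's namespaces, disable cgroup management, and log to
# files instead of journald (see containers.conf(5)).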
[containers]
netns="host"
userns="host"
ipcns="host"
utsns="host"
cgroupns="host"
cgroups="disabled"
log_driver = "k8s-file"
[engine]
cgroup_manager = "cgroupfs"
events_logger="file"
runtime="crun"
5 changes: 5 additions & 0 deletions podman/worker-containers.conf
@@ -0,0 +1,5 @@
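# Overrides for the worker's nested containers: bind /proc from the outer
# container and apply no default sysctls inside the nested user namespace.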
[containers]
volumes = [
"/proc:/proc",
]
default_sysctls = []
5 changes: 5 additions & 0 deletions podman/worker-storage.conf
@@ -0,0 +1,5 @@
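# Rootless storage for the worker user: overlay driver backed by fuse-overlayfs,
# which provides overlay mounts without kernel overlayfs privileges.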
[storage]
driver = "overlay"

[storage.options.overlay]
mount_program = "/usr/bin/fuse-overlayfs"
@@ -1,5 +1,5 @@
 from django.test import TestCase
-from docker.compute_worker.compute_worker import replace_legacy_metadata_command
+from compute_worker.compute_worker import replace_legacy_metadata_command
 
 
 class LegacyConverterCommandTests(TestCase):