Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
111f31f
Remove container engine specfic directory name
cjh1 Jan 25, 2023
fd3f38b
Allow container engine to be configured
cjh1 Jan 25, 2023
36e3327
Ensure host directories exist
cjh1 Jan 30, 2023
63709e5
Add Containerfile for rootless podman container
cjh1 Jan 25, 2023
43e01d4
Adding Containerfile for building podman gpu compute_worker image
Feb 6, 2023
fd7d38f
Adding nvidia runtime config
Feb 6, 2023
3c27d79
Fix issue with shadow-utils
cjh1 Feb 14, 2023
d134c1e
Remove sudo
cjh1 Feb 14, 2023
da51014
Merge pull request #763 from cjh1/podman
Didayolo Feb 15, 2023
beb35f3
Merge pull request #775 from cjh1/podman
Didayolo Feb 16, 2023
c83e043
Optimize Containerfile in order to reduce image size
Mar 29, 2023
abf66f1
podman import error during tests
bbearce Mar 29, 2023
8327ff6
Add checkbox for terms and conditions
Didayolo Mar 29, 2023
1dbc65c
Merge branch 'podman' of https://github.com/codalab/codabench into po…
bbearce Mar 29, 2023
a463a0f
flake error
bbearce Mar 29, 2023
32249a3
Revert file
Mar 29, 2023
ee35b6d
Merge pull request #791 from codalab/signup-terms
Didayolo Mar 29, 2023
1cd06bf
Merge pull request #772 from codalab/podman
Didayolo Mar 29, 2023
64d371d
detailed competition results added
Apr 3, 2023
c799548
circle ci feedabck addressed, docker-compose file reverted to origina…
Apr 4, 2023
1038d55
white spaces removed to pass circle ci tests
Apr 4, 2023
2a4c632
Update test_competitions.py
Didayolo Apr 4, 2023
2d736c3
change cdn from unpkg to cdn.jsdeliver
bbearce Apr 4, 2023
cb128e8
Merge pull request #798 from codalab/easymde_update
Didayolo Apr 4, 2023
b400b5d
Update pull_request_template.md
Didayolo Apr 4, 2023
0dee7d8
Merge branch 'develop' into submission_detailed_results
Didayolo Apr 5, 2023
217ad05
Update test_submissions.py
Didayolo Apr 5, 2023
d621565
Merge branch 'develop' into submission_detailed_results
Didayolo Apr 5, 2023
23b9ade
Update test_submissions.py
Didayolo Apr 5, 2023
9325e3f
Merge pull request #797 from codalab/submission_detailed_results
Didayolo Apr 5, 2023
8f33e79
Add condition
Didayolo Apr 5, 2023
c499a8c
detailed results iframe width increased to full page width
Apr 7, 2023
f9ea7a4
if else fi not terminated with ;
bbearce Apr 11, 2023
b02a81e
password_reset
bbearce Apr 11, 2023
d8b1c0d
forgot to add the forgot password link
bbearce Apr 11, 2023
80a4feb
flask error
bbearce Apr 11, 2023
c8de3ae
more flake errors
bbearce Apr 11, 2023
953083b
correct email
bbearce Apr 11, 2023
1732a3c
Change default docker image to py37
Didayolo Apr 13, 2023
6e3d8e8
saving my place
bbearce Apr 13, 2023
67e4395
print to django logs
bbearce Apr 18, 2023
8abc387
final design change
bbearce Apr 18, 2023
6dad0c4
flake formatting errors
bbearce Apr 18, 2023
9653291
Merge pull request #803 from codalab/submission_detailed_results
Didayolo Apr 19, 2023
39f986e
Merge pull request #807 from codalab/default-docker
Didayolo Apr 19, 2023
7de11ea
Merge pull request #804 from codalab/forgot_change_password
Didayolo Apr 19, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 1 addition & 7 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@ more information is better.
...


# Known issues to be addressed in a separate PR
...


# A checklist for hand testing
- [ ] add checklist here
Expand All @@ -26,15 +23,12 @@ more information is better.
[link]('#') to any relevant files (or drag and drop into github)


# Misc. comments
...


# Checklist
- [ ] Code review by me
- [ ] Hand tested by me
- [ ] I'm proud of my work
- [ ] Code review by reviewer
- [ ] Hand tested by reviewer
- [ ] CircleCi tests are passing
- [ ] Ready to merge

3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ docker-compose.override.yml
server_config.yaml
/graphs/
/codabench/

.DS_Store
.DS_Store?
65 changes: 65 additions & 0 deletions Containerfile.compute_worker_podman
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Compute worker image that launches submission containers with podman
# (CONTAINER_ENGINE_EXECUTABLE=podman) instead of docker.
FROM fedora:37

# Include deps, then drop package caches and logs to keep the layer small.
# `rpm --setcaps shadow-utils` is the workaround described in
# https://bugzilla.redhat.com/show_bug.cgi?id=1995337#c3
RUN dnf -y update && \
    rpm --setcaps shadow-utils 2>/dev/null && \
    dnf -y install podman fuse-overlayfs python3.8 python3-pip \
        --exclude container-selinux && \
    dnf clean all && \
    rm -rf /var/cache /var/log/dnf* /var/log/yum.*

# Setup user and its subordinate UID/GID ranges.
# Steps are chained with && so a failing useradd aborts the build instead of
# being silently ignored; printf replaces `echo -e`, which only works when
# /bin/sh happens to be bash.
RUN useradd worker && \
    printf 'worker:1:999\nworker:1001:64535\n' > /etc/subuid && \
    printf 'worker:1:999\nworker:1001:64535\n' > /etc/subgid

# Copy over the podman container configuration
COPY podman/containers.conf /etc/containers/containers.conf
COPY podman/worker-containers.conf /home/worker/.config/containers/containers.conf

# Copy over the podman storage configuration
COPY podman/worker-storage.conf /home/worker/.config/containers/storage.conf

# Everything copied under /home/worker so far is root-owned; hand it to worker.
RUN mkdir -p /home/worker/.local/share/containers && \
    chown worker:worker -R /home/worker && \
    chmod 644 /etc/containers/containers.conf

# Copy & modify the defaults to provide reference if runtime changes needed.
# Changes here are required for running with fuse-overlay storage inside container.
RUN sed -e 's|^#mount_program|mount_program|g' \
        -e '/additionalimage.*/a "/var/lib/shared",' \
        -e 's|^mountopt[[:space:]]*=.*$|mountopt = "nodev,fsync=0"|g' \
        /usr/share/containers/storage.conf \
        > /etc/containers/storage.conf

# Add volume for containers
VOLUME /home/worker/.local/share/containers

# Create directory for tmp space
RUN mkdir /codabench && \
    chown worker:worker /codabench

# Set up podman registry for dockerhub
RUN printf "[registries.search]\nregistries = ['docker.io']\n\n" > /etc/containers/registries.conf

# This makes output not buffer and return immediately, nice for seeing results in stdout
ENV PYTHONUNBUFFERED=1
# Read by the compute worker to decide which engine binary to invoke
ENV CONTAINER_ENGINE_EXECUTABLE=podman

# Get pip for 3.8
RUN python3.8 -m ensurepip --upgrade

WORKDIR /home/worker/compute_worker

# Plain local copy: COPY is sufficient, ADD's URL/archive handling is not needed
COPY compute_worker/ /home/worker/compute_worker

RUN chown worker:worker -R /home/worker/compute_worker

# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip3.8 install --no-cache-dir -r /home/worker/compute_worker/compute_worker_requirements.txt

CMD celery -A compute_worker worker \
    -l info \
    -Q compute-worker \
    -n compute-worker@%n \
    --concurrency=1
66 changes: 66 additions & 0 deletions Containerfile.compute_worker_podman_gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# GPU variant of the podman compute worker image: adds the NVIDIA driver,
# container runtime/toolkit and CUDA on top of podman and Python 3.8.
FROM fedora:37

# Include deps.
# curl uses -f so an HTTP error fails the build instead of writing an error
# page into the .repo file, and -o instead of piping through tee so curl's
# exit status is not lost. The rpmfusion release packages are fetched over
# https rather than plain http.
RUN curl -fsSL -o /etc/yum.repos.d/cuda.repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo && \
    curl -fsSL -o /etc/yum.repos.d/nvidia-docker.repo https://nvidia.github.io/nvidia-docker/rhel9.0/nvidia-docker.repo && \
    rpm -Uvh https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm && \
    rpm -Uvh https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm && \
    dnf -y update && \
    dnf module install -y nvidia-driver:latest-dkms && \
    dnf -y install podman fuse-overlayfs python3.8 python3-pip nvidia-container-runtime nvidia-container-toolkit \
        cuda --exclude container-selinux && \
    dnf clean all && \
    rm -rf /var/cache /var/log/dnf* /var/log/yum.*

# Setup user and its subordinate UID/GID ranges.
# Steps are chained with && so a failing useradd aborts the build instead of
# being silently ignored; printf replaces `echo -e`, which only works when
# /bin/sh happens to be bash.
RUN useradd worker && \
    printf 'worker:1:999\nworker:1001:64535\n' > /etc/subuid && \
    printf 'worker:1:999\nworker:1001:64535\n' > /etc/subgid

# Copy over the podman container configuration
COPY podman/containers.conf /etc/containers/containers.conf
COPY podman/worker-containers.conf /home/worker/.config/containers/containers.conf

# Copy over the podman storage configuration
COPY podman/worker-storage.conf /home/worker/.config/containers/storage.conf

# Everything copied under /home/worker so far is root-owned; hand it to worker.
RUN mkdir -p /home/worker/.local/share/containers && \
    chown worker:worker -R /home/worker && \
    chmod 644 /etc/containers/containers.conf

# Copy & modify the defaults to provide reference if runtime changes needed.
# Changes here are required for running with fuse-overlay storage inside container.
# The second sed switches the nvidia container runtime to `no-cgroups = true`.
# Both steps are joined with && so a failed storage.conf generation stops the build.
RUN sed -e 's|^#mount_program|mount_program|g' \
        -e '/additionalimage.*/a "/var/lib/shared",' \
        -e 's|^mountopt[[:space:]]*=.*$|mountopt = "nodev,fsync=0"|g' \
        /usr/share/containers/storage.conf \
        > /etc/containers/storage.conf && \
    sed -i 's/^#no-cgroups = false/no-cgroups = true/;' /etc/nvidia-container-runtime/config.toml


# Add volume for containers
VOLUME /home/worker/.local/share/containers

# This makes output not buffer and return immediately, nice for seeing results in stdout
ENV PYTHONUNBUFFERED=1
# Read by the compute worker to decide which engine binary to invoke
ENV CONTAINER_ENGINE_EXECUTABLE=podman

# Create directory for tmp space, set up podman registry for dockerhub,
# and get pip for 3.8
RUN mkdir /codabench && \
    chown worker:worker /codabench && \
    printf "[registries.search]\nregistries = ['docker.io']\n\n" > /etc/containers/registries.conf && \
    python3.8 -m ensurepip --upgrade

WORKDIR /home/worker/compute_worker

# Plain local copy: COPY is sufficient, ADD's URL/archive handling is not needed
COPY compute_worker/ /home/worker/compute_worker

# --no-cache-dir keeps pip's download cache out of the image layer
RUN chown worker:worker -R /home/worker/compute_worker && \
    pip3.8 install --no-cache-dir -r /home/worker/compute_worker/compute_worker_requirements.txt

# nvidia-smi runs first; the worker only starts if it exits successfully
CMD nvidia-smi && celery -A compute_worker worker \
    -l info \
    -Q compute-worker \
    -n compute-worker@%n \
    --concurrency=1
4 changes: 2 additions & 2 deletions Dockerfile.compute_worker
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ ENV PYTHONUNBUFFERED 1
# Install Docker
RUN apt-get update && curl -fsSL https://get.docker.com | sh

ADD docker/compute_worker/compute_worker_requirements.txt .
ADD compute_worker/compute_worker_requirements.txt .
RUN pip install -r compute_worker_requirements.txt

ADD docker/compute_worker .
ADD compute_worker .

CMD celery -A compute_worker worker \
-l info \
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile.compute_worker_gpu
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ RUN apt-get update && apt-get install -y nvidia-docker2
ENV NVIDIA_DOCKER 1

# Python reqs and actual worker stuff
ADD docker/compute_worker/compute_worker_requirements.txt .
ADD compute_worker/compute_worker_requirements.txt .
RUN pip3 install -r compute_worker_requirements.txt
ADD docker/compute_worker .
ADD compute_worker .

CMD celery -A compute_worker worker \
-l info \
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@
STATUS_FAILED,
)

# Select the container engine executable used for pull/run/stop commands.
# Precedence: an explicit, non-empty CONTAINER_ENGINE_EXECUTABLE wins; otherwise
# the older NVIDIA_DOCKER flag selects "nvidia-docker"; otherwise plain "docker".
# NVIDIA_DOCKER could probably be deprecated now that the executable itself
# can be specified.
CONTAINER_ENGINE_EXECUTABLE = (
    os.environ.get("CONTAINER_ENGINE_EXECUTABLE")
    or ("nvidia-docker" if os.environ.get("NVIDIA_DOCKER") else "docker")
)

class SubmissionException(Exception):
    """Error raised by the worker when handling a submission fails.

    For example, it is raised when pulling the submission's container image
    returns a non-zero exit code.
    """
Expand Down Expand Up @@ -181,7 +189,7 @@ def __init__(self, run_args):
self.user_pk = run_args["user_pk"]
self.submission_id = run_args["id"]
self.submissions_api_url = run_args["submissions_api_url"]
self.docker_image = run_args["docker_image"]
self.container_image = run_args["docker_image"]
self.secret = run_args["secret"]
self.prediction_result = run_args["prediction_result"]
self.scoring_result = run_args.get("scoring_result")
Expand Down Expand Up @@ -221,7 +229,7 @@ def __init__(self, run_args):
self.requests_session.mount('https://', adapter)

async def watch_detailed_results(self):
"""Watches files alongside scoring + program docker containers, currently only used
"""Watches files alongside scoring + program containers, currently only used
for detailed_results.html"""
if not self.detailed_results_url:
return
Expand Down Expand Up @@ -314,15 +322,15 @@ def _update_status(self, status, extra_information=None):
# })
self._update_submission(data)

def _get_container_image(self, image_name):
    """Pull ``image_name`` using the configured container engine.

    :param image_name: name of the image to pull, passed verbatim to
        ``<engine> pull``
    :raises SubmissionException: when the pull command exits with a
        non-zero status
    """
    logger.info("Running pull for image: {}".format(image_name))
    cmd = [CONTAINER_ENGINE_EXECUTABLE, 'pull', image_name]
    try:
        container_engine_pull = check_output(cmd)
    except CalledProcessError:
        # The placeholder was previously never filled in, so the log line did
        # not say which image failed.
        logger.info("Pull for image: {} returned a non-zero exit code!".format(image_name))
        raise SubmissionException(f"Pull for {image_name} failed!")
    logger.info("Pull complete for image: {0} with output of {1}".format(image_name, container_engine_pull))

def _get_bundle(self, url, destination, cache=True):
"""Downloads zip from url and unzips into destination. If cache=True then url is hashed and checked
Expand Down Expand Up @@ -357,17 +365,17 @@ def _get_bundle(self, url, destination, cache=True):
# Give back zip file path for other uses, i.e. md5'ing the zip to ID it
return bundle_file

async def _run_docker_cmd(self, docker_cmd, kind):
async def _run_container_engine_cmd(self, engine_cmd, kind):
"""This runs a command and asynchronously writes the data to both a storage file
and a socket

:param docker_cmd: the list of docker command arguments
:param engine_cmd: the list of container engine command arguments
:param kind: either 'ingestion' or 'program'
:return:
"""
start = time.time()
proc = await asyncio.create_subprocess_exec(
*docker_cmd,
*engine_cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE
)
Expand Down Expand Up @@ -442,17 +450,23 @@ async def _run_docker_cmd(self, docker_cmd, kind):
await websocket.close()

def _get_host_path(self, *paths):
    """Map a path inside the worker container to the matching host path.

    The joined ``paths`` have the leading ``BASE_DIR`` portion cut off and
    ``HOST_DIRECTORY`` put in its place. The resulting directory is created
    if it is missing: docker will create it when necessary, but other
    container engines such as podman may not.

    :param paths: path components, joined with ``os.path.join``
    :return: the corresponding path under ``HOST_DIRECTORY``
    """
    # NOTE(review): the slice assumes the joined path starts with BASE_DIR and
    # that the remainder is relative (no leading separator) - confirm against
    # the BASE_DIR / HOST_DIRECTORY definitions.
    inside_container = os.path.join(*paths)
    relative_part = inside_container[len(BASE_DIR):]

    # Re-root on the host directory so mounts passed to the engine resolve
    # on the host machine.
    host_path = os.path.join(HOST_DIRECTORY, relative_part)

    # Create if necessary
    os.makedirs(host_path, exist_ok=True)
    return host_path

async def _run_program_directory(self, program_dir, kind, can_be_output=False):
Expand Down Expand Up @@ -494,13 +508,8 @@ async def _run_program_directory(self, program_dir, kind, can_be_output=False):
)
return

if os.environ.get("NVIDIA_DOCKER"):
docker_process_name = "nvidia-docker"
else:
docker_process_name = "docker"

docker_cmd = [
docker_process_name,
engine_cmd = [
CONTAINER_ENGINE_EXECUTABLE,
'run',
# Remove it after run
'--rm',
Expand Down Expand Up @@ -528,21 +537,21 @@ async def _run_program_directory(self, program_dir, kind, can_be_output=False):
else:
ingested_program_location = "program"

docker_cmd += ['-v', f'{self._get_host_path(self.root_dir, ingested_program_location)}:/app/ingested_program']
engine_cmd += ['-v', f'{self._get_host_path(self.root_dir, ingested_program_location)}:/app/ingested_program']

if self.input_data:
docker_cmd += ['-v', f'{self._get_host_path(self.root_dir, "input_data")}:/app/input_data']
engine_cmd += ['-v', f'{self._get_host_path(self.root_dir, "input_data")}:/app/input_data']

if self.is_scoring:
# For scoring programs, we want to have a shared directory just in case we have an ingestion program.
# This will add the share dir regardless of ingestion or scoring, as long as we're `is_scoring`
docker_cmd += ['-v', f'{self._get_host_path(self.root_dir, "shared")}:/app/shared']
engine_cmd += ['-v', f'{self._get_host_path(self.root_dir, "shared")}:/app/shared']

# Input from submission (or submission + ingestion combo)
docker_cmd += ['-v', f'{self._get_host_path(self.input_dir)}:/app/input']
engine_cmd += ['-v', f'{self._get_host_path(self.input_dir)}:/app/input']

# Set the image name (i.e. "codalab/codalab-legacy") for the container
docker_cmd += [self.docker_image]
# Set the image name (i.e. "codalab/codalab-legacy:py37") for the container
engine_cmd += [self.container_image]

# Handle Legacy competitions by replacing anything in the run command
command = replace_legacy_metadata_command(
Expand All @@ -553,12 +562,12 @@ async def _run_program_directory(self, program_dir, kind, can_be_output=False):
)

# Append the actual program to run
docker_cmd += command.split(' ')
engine_cmd += command.split(' ')

logger.info(f"Running program = {' '.join(docker_cmd)}")
logger.info(f"Running program = {' '.join(engine_cmd)}")

# This runs the docker command and asynchronously passes data back via websocket
return await self._run_docker_cmd(docker_cmd, kind=kind)
# This runs the container engine command and asynchronously passes data back via websocket
return await self._run_container_engine_cmd(engine_cmd, kind=kind)

def _put_dir(self, url, directory):
logger.info("Putting dir %s in %s" % (directory, url))
Expand Down Expand Up @@ -649,9 +658,9 @@ def prepare(self):
for filename in glob.iglob(self.root_dir + '**/*.*', recursive=True):
logger.info(filename)

# Before the run starts we want to download docker images, they may take a while to download
# Before the run starts we want to download images, they may take a while to download
# and to do this during the run would subtract from the participants time.
self._get_docker_image(self.docker_image)
self._get_container_image(self.container_image)

def start(self):
if not self.is_scoring:
Expand Down Expand Up @@ -690,7 +699,7 @@ def start(self):
else:
program_to_kill = self.program_container_name
# Try and stop the program. If stop does not succeed
kill_code = subprocess.call(['docker', 'stop', str(program_to_kill)])
kill_code = subprocess.call([CONTAINER_ENGINE_EXECUTABLE, 'stop', str(program_to_kill)])
logger.info(f'Kill process returned {kill_code}')
if kind == 'program':
self.program_exit_code = return_code
Expand Down
Loading