diff --git a/.github/workflows/build-push-docker-image.yml b/.github/workflows/build-push-docker-image.yml
index d7629191..eec5676f 100644
--- a/.github/workflows/build-push-docker-image.yml
+++ b/.github/workflows/build-push-docker-image.yml
@@ -1,6 +1,8 @@
 name: Build Docker images and push to Docker Hub
 
 on:
+  workflow_dispatch:
+
   push:
     branches:
       - main
@@ -9,6 +11,7 @@ on:
     paths:
       - 'task-runner/**'
      - 'file-tracker/**'
+      - 'unified/**'
 
 jobs:
   build-publish-docker-images:
@@ -54,6 +57,20 @@ jobs:
             org.opencontainers.image.title=File Tracker
             org.opencontainers.image.description=File tracker for the Inductiva API
 
+      # Metadata for task-runner-unified
+      - name: Docker meta for task-runner-unified
+        id: meta-task-runner-unified
+        uses: docker/metadata-action@v5
+        with:
+          images: inductiva/task-runner
+          tags: |
+            type=raw,value=unified_${{ github.ref_name }}
+          labels: |
+            org.opencontainers.image.url=https://inductiva.ai/
+            org.opencontainers.image.source=${{ github.repository }}
+            org.opencontainers.image.title=Task Runner Unified
+            org.opencontainers.image.description=Task runner with file-tracker unified for the Inductiva API
+
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
 
@@ -85,3 +102,17 @@ jobs:
           build-args: |
             API_URL=${{ github.ref == 'refs/heads/main' && 'https://api.inductiva.ai' || 'https://api-dev.inductiva.ai' }}
 
+      # Build and push task-runner-unified
+      - name: Build and push task-runner-unified to Docker Hub
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./unified/Dockerfile
+          platforms: linux/amd64
+          push: true
+          tags: ${{ steps.meta-task-runner-unified.outputs.tags }}
+          labels: |
+            ${{ steps.meta-task-runner-unified.outputs.labels }}
+          build-args: |
+            API_URL=${{ github.ref == 'refs/heads/main' && 'https://api.inductiva.ai' || 'https://api-dev.inductiva.ai' }}
+
diff --git a/Makefile b/Makefile
index 816d7d7b..29d587ec 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,11 @@ DOCKER_COMPOSE_COMMAND_TASK_RUNNER_LITE=\
 	-p task-runner-lite-$(UID) \
 	-f docker-compose.lite.yml
 
+DOCKER_COMPOSE_COMMAND_TASK_RUNNER_UNIFIED=\
+	$(DOCKER_COMPOSE_COMMAND) \
+	-p task-runner-unified-$(UID) \
+	-f docker-compose.unified.yml
+
 .PHONY: %
 %: help
 
@@ -32,9 +37,11 @@ help:
 	@echo " make task-runner-up: starts task-runner building from source"
 	@echo " make task-runner-lite-up: starts task-runner in lite mode (faster)"
 	@echo " make task-runner-cuda-up: starts task-runner with CUDA support"
+	@echo " make task-runner-unified-up: starts task-runner with file-tracker unified"
 	@echo " make task-runner-down stops task-runner building from source"
 	@echo " make task-runner-lite-down stops task-runner in lite mode"
 	@echo " make task-runner-cuda-down stops task-runner with CUDA support"
+	@echo " make task-runner-unified-down stops task-runner with file-tracker unified"
 	@echo Utils:
 	@echo " make lint-fix: run linter and fix issues"
 	@echo " make format: run formatter"
@@ -53,6 +60,9 @@ task-runner-lite-up:setup-apptainer-folder
 
 task-runner-cuda-up:setup-apptainer-folder
 	$(DOCKER_COMPOSE_COMMAND_TASK_RUNNER_CUDA) up --build
 
+task-runner-unified-up:setup-apptainer-folder
+	$(DOCKER_COMPOSE_COMMAND_TASK_RUNNER_UNIFIED) up --build
+
 task-runner-down:
 	$(DOCKER_COMPOSE_COMMAND_TASK_RUNNER) down
@@ -62,6 +72,9 @@ task-runner-lite-down:
 
 task-runner-cuda-down:
 	$(DOCKER_COMPOSE_COMMAND_TASK_RUNNER_CUDA) down
 
+task-runner-unified-down:
+	$(DOCKER_COMPOSE_COMMAND_TASK_RUNNER_UNIFIED) down
+
 lint-fix:
 	ruff check --config=./pyproject.toml --fix
diff --git a/README.md b/README.md
index af8ed664..61778f3b 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,13 @@ Build and run the Task Runner with CUDA support:
 make task-runner-cuda-up
 ```
 
+#### Unified mode
+Build and run the Task Runner with file-tracker unified in a single container:
+
+```
+make task-runner-unified-up
+```
+
 ### Run Simulations
 
 You can now run simulations locally by passing a local machine when you call the `run` function. Try out the following example:
diff --git a/docker-compose.unified.yml b/docker-compose.unified.yml
new file mode 100644
index 00000000..8b0f4bdc
--- /dev/null
+++ b/docker-compose.unified.yml
@@ -0,0 +1,22 @@
+services:
+  task-runner-unified:
+    build:
+      context: .
+      dockerfile: unified/Dockerfile
+    environment:
+      EXECUTER_IMAGES_DIR: /executer-images
+      API_URL: ${INDUCTIVA_API_URL}
+      USER_API_KEY: ${INDUCTIVA_API_KEY}
+      MACHINE_GROUP_NAME: ${MACHINE_GROUP_NAME}
+      HOST_NAME: ${TASK_RUNNER_HOSTNAME:-${HOSTNAME}}
+    volumes:
+      - ./apptainer:/executer-images
+      - workdir:/workdir
+    network_mode: host
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    privileged: true
+    platform: linux/amd64
+
+volumes:
+  workdir:
diff --git a/file-tracker/setup.py b/file-tracker/setup.py
index 7c32adff..6b12bc4b 100644
--- a/file-tracker/setup.py
+++ b/file-tracker/setup.py
@@ -2,7 +2,7 @@ from setuptools import find_packages, setup
 
 setup(
-    name="inductiva-api-task-runner",
+    name="inductiva-api-file-tracker",
     packages=find_packages(),
     version="0.1.0",
 )
 
diff --git a/task-runner/task_runner/executers/__init__.py b/task-runner/task_runner/executers/__init__.py
index 18483513..f7e4facf 100644
--- a/task-runner/task_runner/executers/__init__.py
+++ b/task-runner/task_runner/executers/__init__.py
@@ -7,7 +7,6 @@ from .exec_command_logger import ExecCommandLogger  # noqa: I001
 from .base_executer import BaseExecuter, ExecuterSubProcessError  # noqa: I001
 from .command import Command  # noqa: I001
-from .mpi_base_executer import MPIExecuter  # noqa: I001
 from .mpi_configuration import MPIClusterConfiguration  # noqa: I001
 from .subprocess_tracker import SubprocessTracker  # noqa: I001
 from . import (
diff --git a/task-runner/task_runner/executers/arbitrary_commands_executer.py b/task-runner/task_runner/executers/arbitrary_commands_executer.py
index 6b75382c..661f1aa7 100644
--- a/task-runner/task_runner/executers/arbitrary_commands_executer.py
+++ b/task-runner/task_runner/executers/arbitrary_commands_executer.py
@@ -2,7 +2,6 @@
 
 import getpass
 import os
-import shutil
 
 from task_runner import executers
 from task_runner.utils import files
@@ -18,8 +17,6 @@ def execute(self):
         else:
             global_env = {}
 
-        input_dir = os.path.join(self.working_dir, self.args.sim_dir)
-
         run_subprocess_dir = self.artifacts_dir_container
 
         if hasattr(self.args,
@@ -27,9 +24,6 @@
             run_subprocess_dir = os.path.join(self.artifacts_dir_container,
                                               self.args.run_subprocess_dir)
 
-        # Copy the input files to the artifacts directory
-        shutil.copytree(input_dir, self.artifacts_dir, dirs_exist_ok=True)
-
         original_username = None
         if self.commands_user:
             original_username = getpass.getuser()
diff --git a/task-runner/task_runner/executers/base_executer.py b/task-runner/task_runner/executers/base_executer.py
index cb639c24..873a9405 100644
--- a/task-runner/task_runner/executers/base_executer.py
+++ b/task-runner/task_runner/executers/base_executer.py
@@ -4,6 +4,7 @@
 its usage.
""" import os +import shutil import threading import time from abc import ABC, abstractmethod @@ -92,7 +93,13 @@ def __init__( logging.info("Working directory: %s", self.working_dir) - os.makedirs(self.artifacts_dir) + # Move the inputs from sim_dir/ to artifacts_dir/ so that artifacts_dir/ + # already contains the inputs, avoiding the need for copying + shutil.move( + src=f"{self.working_dir}/{self.args.sim_dir}", + dst=self.artifacts_dir, + ) + logging.info("Created output directory: %s", self.output_dir) logging.info("Created artifacts directory: %s", self.artifacts_dir) diff --git a/task-runner/task_runner/executers/mpi_base_executer.py b/task-runner/task_runner/executers/mpi_base_executer.py deleted file mode 100644 index 71e9bcdc..00000000 --- a/task-runner/task_runner/executers/mpi_base_executer.py +++ /dev/null @@ -1,66 +0,0 @@ -"""This file provides a class to implement MPI executers.""" -import os -import shutil -from typing import Any - -from task_runner import executers -from task_runner.executers import mpi_configuration - -# Instructions inside Docker containers are run by the root user (as default), -# so we need to allow Open MPI to be run as root. This is usually strongly -# discouraged, but necessary to run Open MPI inside a container. For further -# details, see https://www.open-mpi.org/doc/v4.1/man1/mpirun.1.php#toc25. -MPI_ALLOW = "--allow-run-as-root" -MPI_DISTRIBUTION_FILENAME = "machinefile" - - -class MPIExecuter(executers.BaseExecuter): - """Implementation of a general MPI Executer.""" - - def __init__( - self, - working_dir, - container_image, - mpi_config: mpi_configuration.MPIClusterConfiguration, - exec_command_logger: executers.ExecCommandLogger, - extra_params: dict[str, Any], - sim_binary, - file_type, - sim_specific_input_filename, - ): - super().__init__(working_dir, container_image, mpi_config, - exec_command_logger, extra_params) - self.sim_binary = sim_binary - self.sim_specific_input_filename = sim_specific_input_filename - self.file_type = file_type - - def execute(self): - sim_dir = os.path.join(self.working_dir, self.args.sim_dir) - input_filename = self.args.input_filename - - input_file_full_path = os.path.join(sim_dir, input_filename) - - if not os.path.exists(input_file_full_path): - if os.path.exists(f"{input_file_full_path}.{self.file_type}"): - input_filename = f"{input_file_full_path}.{self.file_type}" - else: - raise ValueError( - f"A file with name {input_filename} doesn't exist.") - - if self.args.n_vcpus: - self.mpi_config.extra_args.extend(["-np", f"{self.args.n_vcpus}"]) - - use_hwthread = bool(self.args.use_hwthread) - - if use_hwthread: - self.mpi_config.extra_args.extend(["--use-hwthread-cpus"]) - # Renaming input file as the simulator expects it to be - os.rename(input_file_full_path, - os.path.join(sim_dir, self.sim_specific_input_filename)) - - cmd = executers.Command(self.sim_binary + " " + - self.sim_specific_input_filename, - is_mpi=True) - self.run_subprocess(cmd, working_dir=sim_dir) - - shutil.copytree(sim_dir, self.artifacts_dir, dirs_exist_ok=True) diff --git a/unified/Dockerfile b/unified/Dockerfile new file mode 100644 index 00000000..795b6c8f --- /dev/null +++ b/unified/Dockerfile @@ -0,0 +1,26 @@ +FROM inductiva/task-runner:latest + +USER root + +# Install file-tracker +COPY /file-tracker/requirements.txt /file-tracker-requirements.txt +RUN pip install --no-cache-dir --upgrade -r /file-tracker-requirements.txt + +COPY /file-tracker /file-tracker +WORKDIR /file-tracker +RUN pip install . 
+
+ENV FILE_TRACKER_HOST=0.0.0.0
+ENV FILE_TRACKER_PORT=5000
+
+EXPOSE 5000
+
+# Create startup script
+WORKDIR /
+COPY /unified/start_services.sh /start_services.sh
+RUN chmod +x /start_services.sh
+RUN chown task-runner:task-runner /start_services.sh
+
+USER task-runner
+
+CMD ["/start_services.sh"]
diff --git a/unified/start_services.sh b/unified/start_services.sh
new file mode 100644
index 00000000..863338bf
--- /dev/null
+++ b/unified/start_services.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Function to handle cleanup on exit
+cleanup() {
+    echo "Shutting down services..."
+    # Kill background processes
+    jobs -p | xargs -r kill
+    exit 0
+}
+
+# Set up signal handlers
+trap cleanup SIGTERM SIGINT
+
+echo "Starting unified task-runner and file-tracker services..."
+
+# Start file-tracker in the background
+echo "Starting file-tracker..."
+cd /file-tracker
+python ./file_tracker/main.py &
+FILE_TRACKER_PID=$!
+
+# Start task-runner in the background
+echo "Starting task-runner..."
+cd /task-runner
+python ./task_runner/main.py &
+TASK_RUNNER_PID=$!
+
+echo "Both services started:"
+echo " - File-tracker PID: $FILE_TRACKER_PID"
+echo " - Task-runner PID: $TASK_RUNNER_PID"
+
+# Wait for the first background job to exit; a plain `wait PID1 PID2` would
+# block until *both* had exited, leaving the survivor unsupervised (bash 4.3+)
+wait -n
+
+# If we get here, one of the services exited
+echo "One of the services exited. Cleaning up..."
+cleanup
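---
A minimal local smoke test for unified mode (a sketch, not part of the patch: the variable names are the ones read by docker-compose.unified.yml above, and the values are placeholders):

    # Assumes Docker, Docker Compose, and make are installed.
    export INDUCTIVA_API_URL="https://api-dev.inductiva.ai"   # or the production URL
    export INDUCTIVA_API_KEY="<your-api-key>"                 # placeholder
    export MACHINE_GROUP_NAME="<your-machine-group>"          # placeholder

    make task-runner-unified-up     # builds unified/Dockerfile, starts both services
    make task-runner-unified-down   # tears the unified stack down when done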