diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index 45b689cb3e..4e9725103a 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -34,9 +34,9 @@ jobs:
         && sudo apt-get -y install cuda-12-2 libcudnn8=8.9.5.*-1+cuda12.2
       if: false # skip as we use nvidia image
     - name: Set PyPI mirror for Aliyun cloud machine
-      run: python -m pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple/
+      run: python -m pip config --user set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/
     - run: python -m pip install -U "pip>=21.3.1,!=23.0.0"
-    - run: python -m pip install "tensorflow>=2.15.0rc0"
+    - run: python -m pip install "tensorflow>=2.15.0rc0" "torch>=2.2.0"
     - run: python -m pip install -v -e .[gpu,test,lmp,cu12,torch] "ase @ https://gitlab.com/ase/ase/-/archive/8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f/ase-8c5aa5fd6448c5cfb517a014dccf2b214a9dfa8f.tar.gz"
       env:
         DP_BUILD_TESTING: 1
@@ -44,7 +44,7 @@ jobs:
         CUDA_PATH: /usr/local/cuda-12.2
         NUM_WORKERS: 0
     - run: dp --version
-    - run: python -m pytest -s --cov=deepmd source/tests --durations=0
+    - run: python -m pytest --cov=deepmd source/tests --durations=0
     - run: source/install/test_cc_local.sh
       env:
         OMP_NUM_THREADS: 1
@@ -58,8 +58,8 @@ jobs:
     - run: |
         export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/dp_test/lib:$CUDA_PATH/lib64:$LD_LIBRARY_PATH
         export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH
-        python -m pytest -s --cov=deepmd source/lmp/tests
-        python -m pytest -s --cov=deepmd source/ipi/tests
+        python -m pytest --cov=deepmd source/lmp/tests
+        python -m pytest --cov=deepmd source/ipi/tests
       env:
         OMP_NUM_THREADS: 1
         TF_INTRA_OP_PARALLELISM_THREADS: 1
diff --git a/deepmd/tf/env.py b/deepmd/tf/env.py
index eada2774d3..993768c4a4 100644
--- a/deepmd/tf/env.py
+++ b/deepmd/tf/env.py
@@ -483,6 +483,9 @@ def _get_package_constants(
 op_module = get_module("deepmd_op")
 op_grads_module = get_module("op_grads")
 
+# prevent OOM when using with other backends
+# tf.config doesn't work for unclear reason
+set_env_if_empty("TF_FORCE_GPU_ALLOW_GROWTH", "true", verbose=False)
 
 # FLOAT_PREC
 GLOBAL_TF_FLOAT_PRECISION = tf.dtypes.as_dtype(GLOBAL_NP_FLOAT_PRECISION)
diff --git a/source/tests/pt/conftest.py b/source/tests/pt/conftest.py
new file mode 100644
index 0000000000..a1dea6da5a
--- /dev/null
+++ b/source/tests/pt/conftest.py
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+import pytest
+import torch
+
+
+@pytest.fixture(scope="package", autouse=True)
+def clear_cuda_memory(request):
+    yield
+    torch.cuda.empty_cache()