From ad2640bf4e0c4754a990a0f9c85d9fe0693206e8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 23 Dec 2024 17:21:09 +0800 Subject: [PATCH] fix is_oom_error bug --- deepmd/pd/utils/auto_batch_size.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/deepmd/pd/utils/auto_batch_size.py b/deepmd/pd/utils/auto_batch_size.py index 8cdb5ddea2..0eb5e46d5f 100644 --- a/deepmd/pd/utils/auto_batch_size.py +++ b/deepmd/pd/utils/auto_batch_size.py @@ -49,12 +49,8 @@ def is_oom_error(self, e: Exception) -> bool: # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 # (the meaningless error message should be considered as a bug in cusolver) - if isinstance(e, RuntimeError) and ( - "CUDA out of memory." in e.args[0] - or "CUDA driver error: out of memory" in e.args[0] - or "cusolver error: CUSOLVER_STATUS_INTERNAL_ERROR" in e.args[0] - ): + if isinstance(e, MemoryError) and ("ResourceExhaustedError" in e.args[0]): # Release all unoccupied cached memory - # paddle.device.cuda.empty_cache() + paddle.device.cuda.empty_cache() return True return False