From e70d0df8331cbc549bd46f5be2330e9f140bbf3c Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Fri, 25 Apr 2025 10:07:09 -0700
Subject: [PATCH] chore: better logging when insufficient resources
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Example:

```
▶ Setting up compute cluster...
Not enough GPUs available. Requested 16 GPUs, but only 8 are available in the cluster.
Retrying placement group creation... 1/6. Next retry in 1 seconds.
Not enough GPUs available. Requested 16 GPUs, but only 8 are available in the cluster.
Retrying placement group creation... 2/6. Next retry in 2 seconds.
Not enough GPUs available. Requested 16 GPUs, but only 8 are available in the cluster.
Retrying placement group creation... 3/6. Next retry in 4 seconds.
Not enough GPUs available. Requested 16 GPUs, but only 8 are available in the cluster.
Retrying placement group creation... 4/6. Next retry in 8 seconds.
Not enough GPUs available. Requested 16 GPUs, but only 8 are available in the cluster.
Retrying placement group creation... 5/6. Next retry in 16 seconds.
```

Signed-off-by: Terry Kong
---
 nemo_reinforcer/distributed/virtual_cluster.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_reinforcer/distributed/virtual_cluster.py b/nemo_reinforcer/distributed/virtual_cluster.py
index 8b6600353c..1213d8d897 100644
--- a/nemo_reinforcer/distributed/virtual_cluster.py
+++ b/nemo_reinforcer/distributed/virtual_cluster.py
@@ -160,7 +160,8 @@ def __init__(
                 self._init_placement_groups(placement_group_strategy)
                 # Reaching here means we were successful
                 break
-            except ResourceInsufficientError:
+            except ResourceInsufficientError as e:
+                print(e)
                 print(
                     f"Retrying placement group creation... {i + 1}/{max_retries}. Next retry in {2**i} seconds."
                 )