diff --git a/docs/source/precision_accelerating.md b/docs/source/precision_accelerating.md
index e7e0ddb2b4..897a6f1652 100644
--- a/docs/source/precision_accelerating.md
+++ b/docs/source/precision_accelerating.md
@@ -33,6 +33,9 @@ Please note that there are environment variables that can override the flags abo
 
 If you are using an [NGC PyTorch container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch), the container includes a layer `ENV TORCH_ALLOW_TF32_CUBLAS_OVERRIDE=1`. The default value `torch.backends.cuda.matmul.allow_tf32` will be overridden to `True`.
 
+To restore the upstream default values, run `unset TORCH_ALLOW_TF32_CUBLAS_OVERRIDE` in the container,
+then use the PyTorch APIs `torch.set_float32_matmul_precision` and `torch.backends.cudnn.allow_tf32 = False` accordingly.
+
 We recommend that users print out these two flags for confirmation when unsure.
 
diff --git a/monai/config/deviceconfig.py b/monai/config/deviceconfig.py
index 5d7aee6c75..854a4274c4 100644
--- a/monai/config/deviceconfig.py
+++ b/monai/config/deviceconfig.py
@@ -205,6 +205,8 @@ def get_gpu_info() -> OrderedDict:
     _dict_append(output, "CUDA version", lambda: torch.version.cuda)
     cudnn_ver = torch.backends.cudnn.version()
     _dict_append(output, "cuDNN enabled", lambda: bool(cudnn_ver))
+    _dict_append(output, "NVIDIA_TF32_OVERRIDE", os.environ.get("NVIDIA_TF32_OVERRIDE"))
+    _dict_append(output, "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE", os.environ.get("TORCH_ALLOW_TF32_CUBLAS_OVERRIDE"))
 
     if cudnn_ver:
         _dict_append(output, "cuDNN version", lambda: cudnn_ver)
@@ -215,12 +217,12 @@ def get_gpu_info() -> OrderedDict:
 
     for gpu in range(num_gpus):
         gpu_info = torch.cuda.get_device_properties(gpu)
-        _dict_append(output, f"GPU {gpu} Name", lambda: gpu_info.name)
-        _dict_append(output, f"GPU {gpu} Is integrated", lambda: bool(gpu_info.is_integrated))
-        _dict_append(output, f"GPU {gpu} Is multi GPU board", lambda: bool(gpu_info.is_multi_gpu_board))
-        _dict_append(output, f"GPU {gpu} Multi processor count", lambda: gpu_info.multi_processor_count)
-        _dict_append(output, f"GPU {gpu} Total memory (GB)", lambda: round(gpu_info.total_memory / 1024**3, 1))
-        _dict_append(output, f"GPU {gpu} CUDA capability (maj.min)", lambda: f"{gpu_info.major}.{gpu_info.minor}")
+        _dict_append(output, f"GPU {gpu} Name", gpu_info.name)
+        _dict_append(output, f"GPU {gpu} Is integrated", bool(gpu_info.is_integrated))
+        _dict_append(output, f"GPU {gpu} Is multi GPU board", bool(gpu_info.is_multi_gpu_board))
+        _dict_append(output, f"GPU {gpu} Multi processor count", gpu_info.multi_processor_count)
+        _dict_append(output, f"GPU {gpu} Total memory (GB)", round(gpu_info.total_memory / 1024**3, 1))
+        _dict_append(output, f"GPU {gpu} CUDA capability (maj.min)", f"{gpu_info.major}.{gpu_info.minor}")
 
     return output
 
diff --git a/monai/utils/tf32.py b/monai/utils/tf32.py
index 9ef425ab8b..cfb023bdeb 100644
--- a/monai/utils/tf32.py
+++ b/monai/utils/tf32.py
@@ -52,7 +52,7 @@ def has_ampere_or_later() -> bool:
 @functools.lru_cache(None)
 def detect_default_tf32() -> bool:
     """
-    Dectect if there is anything that may enable TF32 mode by default.
+    Detect if there is anything that may enable TF32 mode by default.
     If any, show a warning message.
     """
     may_enable_tf32 = False
@@ -70,7 +70,7 @@ def detect_default_tf32() -> bool:
         )
         may_enable_tf32 = True
 
-    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1", "TORCH_ALLOW_TF32_CUBLAS_OVERRIDE": "1"}
+    override_tf32_env_vars = {"NVIDIA_TF32_OVERRIDE": "1"}  # TORCH_ALLOW_TF32_CUBLAS_OVERRIDE not checked #6907
     for name, override_val in override_tf32_env_vars.items():
         if os.environ.get(name) == override_val:
             warnings.warn(