From d6e59cde4f5db74c29e00f414761f5e0ad00ce05 Mon Sep 17 00:00:00 2001 From: heyufan1995 Date: Tue, 27 Aug 2024 15:35:23 -0400 Subject: [PATCH 01/12] Fix transpose and patch coords bug Signed-off-by: heyufan1995 --- monai/apps/vista3d/sampler.py | 15 ++++++++------- monai/networks/nets/vista3d.py | 7 +++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index b7aeb89a2e..7bc091f013 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -58,16 +58,16 @@ def sample_prompt_pairs( labels: [1, 1, H, W, D], ground truth labels. label_set: the label list for the specific dataset. Note if 0 is included in label_set, it will be added into automatic branch training. Recommend removing 0 from label_set - for multi-partially-labeled-dataset training, and adding 0 for finetuning specific dataset. - The reason is region with 0 in one partially labeled dataset may contain foregrounds in - another dataset. + for multi-partially-labeled-dataset training, and adding 0 for finetuning specific dataset. + The reason is region with 0 in one partially labeled dataset may contain foregrounds in + another dataset. max_prompt: int, max number of total prompt, including foreground and background. max_foreprompt: int, max number of prompt from foreground. max_backprompt: int, max number of prompt from background. max_point: maximum number of points for each object. include_background: if include 0 into training prompt. If included, background 0 is treated - the same as foreground. Always be False for multi-partial-dataset training. If needed, - can be true for finetuning specific dataset, . + the same as foreground and points will be sampled. Can be true only if user want to segment + background 0 with point clicks, otherwise always be false. drop_label_prob: probability to drop label prompt. drop_point_prob: probability to drop point prompt. 
point_sampler: sampler to augment masks with supervoxel. @@ -76,12 +76,13 @@ def sample_prompt_pairs( Returns: label_prompt: [B, 1]. The classes used for training automatic segmentation. point: [B, N, 3]. The corresponding points for each class. - Note that background label prompt requires matching point as well ([0,0,0] is used). + Note that background label prompt requires matching point as well ([0,0,0] is used). point_label: [B, N]. The corresponding point labels for each point (negative or positive). - -1 is used for padding the background label prompt and will be ignored. + -1 is used for padding the background label prompt and will be ignored. prompt_class: [B, 1], exactly the same with label_prompt for label indexing for training loss. label_prompt can be None, and prompt_class is used to identify point classes. """ + # class label number if not labels.shape[0] == 1: raise ValueError("only support batch size 1") diff --git a/monai/networks/nets/vista3d.py b/monai/networks/nets/vista3d.py index 9148e36542..979a090df0 100644 --- a/monai/networks/nets/vista3d.py +++ b/monai/networks/nets/vista3d.py @@ -336,11 +336,11 @@ def set_auto_grad(self, auto_freeze: bool = False, point_freeze: bool = False): def forward( self, input_images: torch.Tensor, + patch_coords: Sequence[slice] | None = None, point_coords: torch.Tensor | None = None, point_labels: torch.Tensor | None = None, class_vector: torch.Tensor | None = None, prompt_class: torch.Tensor | None = None, - patch_coords: Sequence[slice] | None = None, labels: torch.Tensor | None = None, label_set: Sequence[int] | None = None, prev_mask: torch.Tensor | None = None, @@ -421,7 +421,10 @@ def forward( point_coords, point_labels = None, None if point_coords is None and class_vector is None: - return self.NINF_VALUE + torch.zeros([bs, 1, *image_size], device=device) + logits = self.NINF_VALUE + torch.zeros([bs, 1, *image_size], device=device) + if transpose: + logits = logits.transpose(1, 0) + return logits if 
self.image_embeddings is not None and kwargs.get("keep_cache", False) and class_vector is None: out, out_auto = self.image_embeddings, None From 1b119766144a9d3bfea417fc456f75c5857f0bac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:38:26 +0000 Subject: [PATCH 02/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- monai/apps/vista3d/sampler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index 7bc091f013..6ede500997 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -58,16 +58,16 @@ def sample_prompt_pairs( labels: [1, 1, H, W, D], ground truth labels. label_set: the label list for the specific dataset. Note if 0 is included in label_set, it will be added into automatic branch training. Recommend removing 0 from label_set - for multi-partially-labeled-dataset training, and adding 0 for finetuning specific dataset. - The reason is region with 0 in one partially labeled dataset may contain foregrounds in - another dataset. + for multi-partially-labeled-dataset training, and adding 0 for finetuning specific dataset. + The reason is region with 0 in one partially labeled dataset may contain foregrounds in + another dataset. max_prompt: int, max number of total prompt, including foreground and background. max_foreprompt: int, max number of prompt from foreground. max_backprompt: int, max number of prompt from background. max_point: maximum number of points for each object. include_background: if include 0 into training prompt. If included, background 0 is treated - the same as foreground and points will be sampled. Can be true only if user want to segment - background 0 with point clicks, otherwise always be false. + the same as foreground and points will be sampled. 
Can be true only if user want to segment + background 0 with point clicks, otherwise always be false. drop_label_prob: probability to drop label prompt. drop_point_prob: probability to drop point prompt. point_sampler: sampler to augment masks with supervoxel. From a4920306489eb6b2c6f90ec38ca5ea57c55f8daf Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:45:05 +0800 Subject: [PATCH 03/12] fix doc build Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/apps/vista3d/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index 6ede500997..ff9f99542e 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -80,7 +80,7 @@ def sample_prompt_pairs( point_label: [B, N]. The corresponding point labels for each point (negative or positive). -1 is used for padding the background label prompt and will be ignored. prompt_class: [B, 1], exactly the same with label_prompt for label indexing for training loss. - label_prompt can be None, and prompt_class is used to identify point classes. + label_prompt can be None, and prompt_class is used to identify point classes. 
""" # class label number From 67341661fb0866e8f8d1cb26f7317c2943a1146a Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:55:47 +0800 Subject: [PATCH 04/12] fix format Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/apps/vista3d/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index ff9f99542e..05c69b0307 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -22,6 +22,7 @@ __all__ = ["sample_prompt_pairs"] + ENABLE_SPECIAL = True SPECIAL_INDEX = (23, 24, 25, 26, 27, 57, 128) MERGE_LIST = { @@ -82,7 +83,6 @@ def sample_prompt_pairs( prompt_class: [B, 1], exactly the same with label_prompt for label indexing for training loss. label_prompt can be None, and prompt_class is used to identify point classes. """ - # class label number if not labels.shape[0] == 1: raise ValueError("only support batch size 1") From fc9d4c3b9b89ac63080084da5acb96869d24126e Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 28 Aug 2024 11:24:40 +0800 Subject: [PATCH 05/12] fix doc Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/apps/vista3d/sampler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index 05c69b0307..21a0ac54b5 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -20,9 +20,6 @@ import torch from torch import Tensor -__all__ = ["sample_prompt_pairs"] - - ENABLE_SPECIAL = True SPECIAL_INDEX = (23, 24, 25, 26, 27, 57, 128) MERGE_LIST = { @@ -31,6 +28,8 @@ 132: [57], # overlap with trachea merge into airway } +__all__ = ["sample_prompt_pairs"] + def _get_point_label(id: int) -> tuple[int, int]: if id in SPECIAL_INDEX and ENABLE_SPECIAL: @@ -83,6 +82,7 @@ def sample_prompt_pairs( prompt_class: [B, 1], 
exactly the same with label_prompt for label indexing for training loss. label_prompt can be None, and prompt_class is used to identify point classes. """ + # class label number if not labels.shape[0] == 1: raise ValueError("only support batch size 1") From 4f781aacf7b62df38b803fb68f5aa6ea70763862 Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 28 Aug 2024 11:41:32 +0800 Subject: [PATCH 06/12] fix doc Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- docs/requirements.txt | 1 - monai/apps/vista3d/sampler.py | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index ff94f7b6de..fc72be6b9e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -35,7 +35,6 @@ pydicom h5py nni; platform_system == "Linux" optuna -opencv-python-headless onnx>=1.13.0 onnxruntime; python_version <= '3.10' zarr diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index 21a0ac54b5..80927f190e 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -37,7 +37,6 @@ def _get_point_label(id: int) -> tuple[int, int]: else: return 0, 1 - def sample_prompt_pairs( labels: Tensor, label_set: Sequence[int], From d643d703b653f565b8b6c7023ea74d82e669ef2f Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 28 Aug 2024 13:19:29 +0800 Subject: [PATCH 07/12] fix format Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- docs/requirements.txt | 2 ++ monai/apps/vista3d/sampler.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index fc72be6b9e..7307d8e5f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -35,9 +35,11 @@ pydicom h5py nni; platform_system == "Linux" optuna +opencv-python-headless onnx>=1.13.0 onnxruntime; python_version <= '3.10' zarr huggingface_hub pyamg>=5.0.0 
packaging +polygraphy diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index 80927f190e..ec84fafcf6 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -73,13 +73,19 @@ def sample_prompt_pairs( point_sampler_kwargs: arguments for point_sampler. Returns: - label_prompt: [B, 1]. The classes used for training automatic segmentation. - point: [B, N, 3]. The corresponding points for each class. - Note that background label prompt requires matching point as well ([0,0,0] is used). - point_label: [B, N]. The corresponding point labels for each point (negative or positive). - -1 is used for padding the background label prompt and will be ignored. - prompt_class: [B, 1], exactly the same with label_prompt for label indexing for training loss. - label_prompt can be None, and prompt_class is used to identify point classes. + tuple: + - label_prompt (Tensor | None): Tensor of shape [B, 1] containing the classes used for + training automatic segmentation. + - point (Tensor | None): Tensor of shape [B, N, 3] representing the corresponding points + for each class. Note that background label prompts require matching points as well + (e.g., [0, 0, 0] is used). + - point_label (Tensor | None): Tensor of shape [B, N] representing the corresponding point + labels for each point (negative or positive). -1 is used for padding the background + label prompt and will be ignored. + - prompt_class (Tensor | None): Tensor of shape [B, 1], exactly the same as label_prompt + for label indexing during training. If label_prompt is None, prompt_class is used to + identify point classes. 
+ """ # class label number From af53f6fe6f56ddc563e7c9de78c6ec256a4940b9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 28 Aug 2024 05:20:02 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- monai/apps/vista3d/sampler.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index ec84fafcf6..c1429e1d53 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -74,16 +74,16 @@ def sample_prompt_pairs( Returns: tuple: - - label_prompt (Tensor | None): Tensor of shape [B, 1] containing the classes used for + - label_prompt (Tensor | None): Tensor of shape [B, 1] containing the classes used for training automatic segmentation. - - point (Tensor | None): Tensor of shape [B, N, 3] representing the corresponding points - for each class. Note that background label prompts require matching points as well + - point (Tensor | None): Tensor of shape [B, N, 3] representing the corresponding points + for each class. Note that background label prompts require matching points as well (e.g., [0, 0, 0] is used). - - point_label (Tensor | None): Tensor of shape [B, N] representing the corresponding point - labels for each point (negative or positive). -1 is used for padding the background + - point_label (Tensor | None): Tensor of shape [B, N] representing the corresponding point + labels for each point (negative or positive). -1 is used for padding the background label prompt and will be ignored. - - prompt_class (Tensor | None): Tensor of shape [B, 1], exactly the same as label_prompt - for label indexing during training. If label_prompt is None, prompt_class is used to + - prompt_class (Tensor | None): Tensor of shape [B, 1], exactly the same as label_prompt + for label indexing during training. 
If label_prompt is None, prompt_class is used to identify point classes. """ From 3e0d115d93fe22779c02ab2c91a1bae2dafc3773 Mon Sep 17 00:00:00 2001 From: YunLiu <55491388+KumoLiu@users.noreply.github.com> Date: Wed, 28 Aug 2024 14:11:59 +0800 Subject: [PATCH 09/12] fix format Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com> --- monai/apps/vista3d/sampler.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/monai/apps/vista3d/sampler.py b/monai/apps/vista3d/sampler.py index ec84fafcf6..17b2d34911 100644 --- a/monai/apps/vista3d/sampler.py +++ b/monai/apps/vista3d/sampler.py @@ -37,6 +37,7 @@ def _get_point_label(id: int) -> tuple[int, int]: else: return 0, 1 + def sample_prompt_pairs( labels: Tensor, label_set: Sequence[int], @@ -74,16 +75,16 @@ def sample_prompt_pairs( Returns: tuple: - - label_prompt (Tensor | None): Tensor of shape [B, 1] containing the classes used for + - label_prompt (Tensor | None): Tensor of shape [B, 1] containing the classes used for training automatic segmentation. - - point (Tensor | None): Tensor of shape [B, N, 3] representing the corresponding points - for each class. Note that background label prompts require matching points as well + - point (Tensor | None): Tensor of shape [B, N, 3] representing the corresponding points + for each class. Note that background label prompts require matching points as well (e.g., [0, 0, 0] is used). - - point_label (Tensor | None): Tensor of shape [B, N] representing the corresponding point - labels for each point (negative or positive). -1 is used for padding the background + - point_label (Tensor | None): Tensor of shape [B, N] representing the corresponding point + labels for each point (negative or positive). -1 is used for padding the background label prompt and will be ignored. - - prompt_class (Tensor | None): Tensor of shape [B, 1], exactly the same as label_prompt - for label indexing during training. 
If label_prompt is None, prompt_class is used to + - prompt_class (Tensor | None): Tensor of shape [B, 1], exactly the same as label_prompt + for label indexing during training. If label_prompt is None, prompt_class is used to identify point classes. """ From 1d287ef152045b5939486de704df5655d1307fd1 Mon Sep 17 00:00:00 2001 From: heyufan1995 Date: Fri, 30 Aug 2024 11:07:21 -0400 Subject: [PATCH 10/12] Fix patch coords bug Signed-off-by: heyufan1995 --- monai/apps/vista3d/inferer.py | 2 +- monai/networks/nets/vista3d.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/monai/apps/vista3d/inferer.py b/monai/apps/vista3d/inferer.py index 709f81f624..8f622ef6cd 100644 --- a/monai/apps/vista3d/inferer.py +++ b/monai/apps/vista3d/inferer.py @@ -100,7 +100,7 @@ def point_based_window_inferer( point_labels=point_labels, class_vector=class_vector, prompt_class=prompt_class, - patch_coords=unravel_slice, + patch_coords=[unravel_slice], prev_mask=prev_mask, **kwargs, ) diff --git a/monai/networks/nets/vista3d.py b/monai/networks/nets/vista3d.py index 979a090df0..57c46c2e08 100644 --- a/monai/networks/nets/vista3d.py +++ b/monai/networks/nets/vista3d.py @@ -336,7 +336,7 @@ def set_auto_grad(self, auto_freeze: bool = False, point_freeze: bool = False): def forward( self, input_images: torch.Tensor, - patch_coords: Sequence[slice] | None = None, + patch_coords: list[Sequence[slice]] | None = None, point_coords: torch.Tensor | None = None, point_labels: torch.Tensor | None = None, class_vector: torch.Tensor | None = None, @@ -364,8 +364,11 @@ def forward( the points are for zero-shot or supported class. When class_vector and point_coords are both provided, prompt_class is the same as class_vector. For prompt_class[b] > 512, point_coords[b] will be considered novel class. - patch_coords: a sequence of the python slice objects representing the patch coordinates during sliding window inference. - This value is passed from sliding_window_inferer. 
This is an indicator for training phase or validation phase. + patch_coords: a list of sequence of the python slice objects representing the patch coordinates during sliding window + inference. This value is passed from sliding_window_inferer. This is an indicator for training phase or validation phase. + Notice for sliding window batch size > 1 (only supported by automatic segmentation), patch_coords will inlcude + coordinates of multiple patches. If point prompts are included, the batch size can only be one and all the + functions using patch_coords will by default use patch_coords[0]. labels: [1, 1, H, W, D], the groundtruth label tensor, only used for point-only evaluation label_set: the label index matching the indexes in labels. If labels are mapped to global index using RelabelID, this label_set should be global mapped index. If labels are not mapped to global index, e.g. in zero-shot @@ -395,14 +398,14 @@ def forward( if val_point_sampler is None: # TODO: think about how to refactor this part. val_point_sampler = self.sample_points_patch_val - point_coords, point_labels, prompt_class = val_point_sampler(labels, patch_coords, label_set) + point_coords, point_labels, prompt_class = val_point_sampler(labels, patch_coords[0], label_set) if prompt_class[0].item() == 0: # type: ignore point_labels[0] = -1 # type: ignore labels, prev_mask = None, None elif point_coords is not None: # If not performing patch-based point only validation, use user provided click points for inference. # the point clicks is in original image space, convert it to current patch-coordinate space. 
- point_coords, point_labels = self.update_point_to_patch(patch_coords, point_coords, point_labels) # type: ignore + point_coords, point_labels = self.update_point_to_patch(patch_coords[0], point_coords, point_labels) # type: ignore if point_coords is not None and point_labels is not None: # remove points that used for padding purposes (point_label = -1) @@ -455,7 +458,7 @@ def forward( logits[mapping_index] = self.point_head(out, point_coords, point_labels, class_vector=prompt_class) if prev_mask is not None and patch_coords is not None: logits = self.connected_components_combine( - prev_mask[patch_coords].transpose(1, 0).to(logits.device), + prev_mask[patch_coords[0]].transpose(1, 0).to(logits.device), logits[mapping_index], point_coords, # type: ignore point_labels, # type: ignore From 7f4ae69aec75eac03f8731af2c86ece8d7312ac4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 04:43:29 +0000 Subject: [PATCH 11/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- monai/networks/nets/vista3d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monai/networks/nets/vista3d.py b/monai/networks/nets/vista3d.py index 57c46c2e08..42ae0983b4 100644 --- a/monai/networks/nets/vista3d.py +++ b/monai/networks/nets/vista3d.py @@ -364,9 +364,9 @@ def forward( the points are for zero-shot or supported class. When class_vector and point_coords are both provided, prompt_class is the same as class_vector. For prompt_class[b] > 512, point_coords[b] will be considered novel class. - patch_coords: a list of sequence of the python slice objects representing the patch coordinates during sliding window + patch_coords: a list of sequence of the python slice objects representing the patch coordinates during sliding window inference. This value is passed from sliding_window_inferer. 
This is an indicator for training phase or validation phase. - Notice for sliding window batch size > 1 (only supported by automatic segmentation), patch_coords will inlcude + Notice for sliding window batch size > 1 (only supported by automatic segmentation), patch_coords will inlcude coordinates of multiple patches. If point prompts are included, the batch size can only be one and all the functions using patch_coords will by default use patch_coords[0]. labels: [1, 1, H, W, D], the groundtruth label tensor, only used for point-only evaluation From a22efe40df6c335e1f79d12b2cc5836c235cedf6 Mon Sep 17 00:00:00 2001 From: Yiheng Wang Date: Mon, 2 Sep 2024 04:49:50 +0000 Subject: [PATCH 12/12] fix flake8 Signed-off-by: Yiheng Wang --- monai/networks/nets/vista3d.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/monai/networks/nets/vista3d.py b/monai/networks/nets/vista3d.py index 57c46c2e08..4215a9a594 100644 --- a/monai/networks/nets/vista3d.py +++ b/monai/networks/nets/vista3d.py @@ -364,9 +364,10 @@ def forward( the points are for zero-shot or supported class. When class_vector and point_coords are both provided, prompt_class is the same as class_vector. For prompt_class[b] > 512, point_coords[b] will be considered novel class. - patch_coords: a list of sequence of the python slice objects representing the patch coordinates during sliding window - inference. This value is passed from sliding_window_inferer. This is an indicator for training phase or validation phase. - Notice for sliding window batch size > 1 (only supported by automatic segmentation), patch_coords will inlcude + patch_coords: a list of sequence of the python slice objects representing the patch coordinates during sliding window + inference. This value is passed from sliding_window_inferer. + This is an indicator for training phase or validation phase. 
+ Notice for sliding window batch size > 1 (only supported by automatic segmentation), patch_coords will include coordinates of multiple patches. If point prompts are included, the batch size can only be one and all the functions using patch_coords will by default use patch_coords[0]. labels: [1, 1, H, W, D], the groundtruth label tensor, only used for point-only evaluation