diff --git a/dali/python/nvidia/dali/_conditionals.py b/dali/python/nvidia/dali/_conditionals.py index d3300e5b81f..6204c9bb09d 100644 --- a/dali/python/nvidia/dali/_conditionals.py +++ b/dali/python/nvidia/dali/_conditionals.py @@ -711,6 +711,6 @@ def lazy_or(self, a_value, b): _autograph.initialize_autograph( _OVERLOADS, - convert_modules=["nvidia.dali.auto_aug"], + convert_modules=["nvidia.dali.auto_aug", "nvidia.dali.experimental.torchvision"], do_not_convert_modules=["nvidia.dali._autograph", "nvidia.dali"], ) diff --git a/dali/python/nvidia/dali/experimental/torchvision/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/__init__.py new file mode 100644 index 00000000000..a1b0a869eaa --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .v2.compose import Compose +from .v2.flips import RandomHorizontalFlip, RandomVerticalFlip +from .v2.resize import Resize + +__all__ = [ + "Compose", + "RandomHorizontalFlip", + "RandomVerticalFlip", + "Resize", +] diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/compose.py b/dali/python/nvidia/dali/experimental/torchvision/v2/compose.py new file mode 100644 index 00000000000..d3140a29c7a --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/compose.py @@ -0,0 +1,399 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from abc import ABC, abstractmethod
+from typing import List, Sequence, Callable, Union
+
+import nvidia.dali.fn as fn
+from nvidia.dali.pipeline import pipeline_def
+from nvidia.dali.data_node import DataNode as _DataNode
+from nvidia.dali.backend import TensorListCPU, TensorListGPU
+
+from .operator import VerificationTensorOrImage
+
+import numpy as np
+import multiprocessing
+from PIL import Image
+import torch
+
+DEFAULT_BATCH_SIZE = 16
+DEFAULT_NUM_THREADS = 1 if multiprocessing.cpu_count() == 1 else multiprocessing.cpu_count() // 2
+
+
+def _to_torch_tensor(tensor_or_tl: TensorListGPU | TensorListCPU) -> torch.Tensor:
+    if isinstance(tensor_or_tl, (TensorListGPU, TensorListCPU)):
+        dali_tensor = tensor_or_tl.as_tensor()
+    else:
+        dali_tensor = tensor_or_tl
+
+    return torch.from_dlpack(dali_tensor)
+
+
+def to_torch_tensor(
+    x: Union[tuple, "TensorListGPU", "TensorListCPU"],
+) -> Union[torch.Tensor, tuple]:
+    """
+    Converts a DALI tensor or tensor list to a PyTorch tensor.
+
+    Parameters
+    ----------
+    x : tuple, TensorListGPU, or TensorListCPU
+        DALI tensor, tensor list, or a tuple of them.
+    """
+    if isinstance(x, (TensorListGPU, TensorListCPU)):
+        return to_torch_tensor(x.as_tensor())
+    elif isinstance(x, tuple):
+        if len(x) == 1:
+            return _to_torch_tensor(x[0])
+        return tuple(to_torch_tensor(elem) for elem in x)
+    else:
+        return torch.from_dlpack(x)
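+
+# A minimal usage sketch (assuming an already-built pipeline ``pipe`` with one output):
+#
+#   tl = pipe.run()          # tuple of TensorListCPU/TensorListGPU
+#   t = to_torch_tensor(tl)  # torch.Tensor, converted zero-copy via DLPack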
+ """ + + def _cuda_run(self, data_input): + if isinstance(data_input, torch.Tensor) and data_input.is_cuda: + device_id = data_input.device.index + else: + device_id = torch.cuda.current_device() + stream = torch.cuda.Stream(device=device_id) + + with torch.cuda.stream(stream): + output = self.pipe.run(stream, input_data=data_input) + + return output + + def _cpu_run(self, data_input): + return self.pipe.run(input_data=data_input) + + def __init__( + self, + op_list: List[Callable[..., Sequence[_DataNode] | _DataNode]], + layout: str, + batch_size: int = DEFAULT_BATCH_SIZE, + num_threads: int = DEFAULT_NUM_THREADS, + **dali_pipeline_kwargs, + ): + # TODO: + # convert_to_tensor is currently not supported and requires an user's effort + # to convert to tensor + # ToTensor is deprecated and according to: + # https://docs.pytorch.org/vision/stable/_modules/torchvision/transforms/v2/_deprecated.html#ToTensor + # should be replaced with: + # v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)]) + # + # self.convert_to_tensor = True if isinstance(op_list[-1], ToTensor) else False + self.convert_to_tensor = False + self.pipe = _pipeline_function( + op_list, + layout=layout, + batch_size=batch_size, + num_threads=num_threads, + **dali_pipeline_kwargs, + ) + self._internal_run = self._cuda_run if torch.cuda.is_available() else self._cpu_run + + def run(self, data_input): + + output = self._internal_run(data_input) + + if output is None: + return output + + output = to_torch_tensor(output) + # ToTensor + if self.convert_to_tensor: + if output.shape[-4] > 1: + raise NotImplementedError("ToTensor does not currently work for batches") + + return output + + @abstractmethod + def get_layout(self) -> str: ... + + @abstractmethod + def get_channel_reverse_idx(self) -> int: ... + + @abstractmethod + def verify_layout(self, data) -> None: ... + + def is_conversion_to_tensor(self) -> bool: + return self.convert_to_tensor + + +class PipelineHWC(PipelineWithLayout): + """Handles ``PIL.Image`` in HWC format. + + This class prepares data to be passed to a DALI pipeline, runs the pipeline and converts + the output to a ``PIL.Image``. + + Parameters + ---------- + op_list : list + List of DALI operators. + batch_size : int, optional, default = DEFAULT_BATCH_SIZE + Batch size. + num_threads : int, optional, default = DEFAULT_NUM_THREADS + Number of threads. + **dali_pipeline_kwargs + Additional keyword arguments for the DALI pipeline. + """ + + def __init__( + self, + op_list: List[Callable[..., Sequence[_DataNode] | _DataNode]], + batch_size: int = DEFAULT_BATCH_SIZE, + num_threads: int = DEFAULT_NUM_THREADS, + **dali_pipeline_kwargs, + ): + super().__init__( + op_list, + layout="HWC", + batch_size=batch_size, + num_threads=num_threads, + **dali_pipeline_kwargs, + ) + + def _convert_tensor_to_image(self, in_tensor: torch.Tensor): + + channels = self.get_channel_reverse_idx() + + # TODO: consider when to convert to PIL.Image - e.g. if it make sense for channels < 3 + # There is no certain method to determine if the tensor is HW, HWC, or NHWC. + # The method below checks if tensor's shape is HW or ...HWC with a single channel + if len(in_tensor.shape) == 2 or ( + len(in_tensor.shape) >= 3 and in_tensor.shape[channels] == 1 + ): + mode = "L" + if len(in_tensor.shape) != 2: + in_tensor = in_tensor.squeeze(-1) + elif in_tensor.shape[channels] == 3: + mode = "RGB" + elif in_tensor.shape[channels] == 4: + mode = "RGBA" + else: + raise ValueError( + f"Unsupported number of channels: {in_tensor.shape[channels]}. 
Should be 1, 3 or 4." + ) + # We need to convert tensor to CPU, PIL does not support CUDA tensors + return Image.fromarray(in_tensor.cpu().numpy(), mode=mode) + + def run(self, data_input): + if isinstance(data_input, Image.Image): + _input = torch.as_tensor(np.array(data_input, copy=True)).unsqueeze(0) + if data_input.mode == "L": + _input = _input.unsqueeze(-1) + else: + raise ValueError( + "HWC layout is currently supported for PIL Images only.\ + Please check if samples have the same format." + ) + + output = super().run(_input) + + if self.is_conversion_to_tensor(): + return output + + if isinstance(output, tuple): + output = self._convert_tensor_to_image(output[0]) + else: + # batches + if output.shape[0] > 1: + output_list = [] + for i in range(output.shape[0]): + output_list.append(self._convert_tensor_to_image(output[i])) + output = output_list + else: + output = self._convert_tensor_to_image(output[0]) + + return output + + def get_layout(self) -> str: + return "HWC" + + def get_channel_reverse_idx(self) -> int: + return -1 + + def verify_layout(self, data_input) -> None: + if not isinstance(data_input, Image.Image): + raise TypeError(f"The pipeline expects PIL.Images as input got {type(data_input)}") + + +class PipelineCHW(PipelineWithLayout): + """Handles ``torch.Tensors`` in CHW format. + + This class prepares data to be passed to a DALI pipeline and runs the pipeline, converting + the output to a ``torch.Tensor``. + + Parameters + ---------- + op_list : list + List of DALI operators. + batch_size : int, optional, default = DEFAULT_BATCH_SIZE + Batch size. + num_threads : int, optional, default = DEFAULT_NUM_THREADS + Number of threads. + **dali_pipeline_kwargs + Additional keyword arguments for the DALI pipeline. + """ + + def __init__( + self, + op_list: List[Callable[..., Sequence[_DataNode] | _DataNode]], + batch_size: int = DEFAULT_BATCH_SIZE, + num_threads: int = DEFAULT_NUM_THREADS, + **dali_pipeline_kwargs, + ): + super().__init__( + op_list, + layout="CHW", + batch_size=batch_size, + num_threads=num_threads, + **dali_pipeline_kwargs, + ) + + def run(self, data_input): + if isinstance(data_input, torch.Tensor): + _input = data_input + if data_input.ndim == 3: + # DALI requires batch size to be present + _input = data_input.unsqueeze(0) + else: + raise ValueError( + "CHW layout is currently supported for torch.Tensor only.\ + Please check if samples have the same format." + ) + output = super().run(_input) + + if data_input.ndim == 3: + # DALI requires batch size to be present + output = output.squeeze(0) + return output + + def get_layout(self) -> str: + return "CHW" + + def get_channel_reverse_idx(self) -> int: + return -3 + + def verify_layout(self, data_input) -> None: + if not isinstance(data_input, torch.Tensor): + raise TypeError(f"The pipeline expects torch.Tensor as input got {type(data_input)}") + + +class Compose: + """ + Composes transforms together in a single pipeline + + This class chains multiple DALI operations in a sequential manner, similar to + ``torchvision.transforms.Compose``. The ``Compose`` class implements a callable which runs + the pipeline. + + Parameters + ---------- + op_list : list + List of DALI operators. + batch_size : int, optional, default = DEFAULT_BATCH_SIZE + Batch size. + num_threads : int, optional, default = DEFAULT_NUM_THREADS + Number of threads. + **dali_pipeline_kwargs + Additional keyword arguments for the DALI pipeline. 
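+
+    Examples
+    --------
+    A rough usage sketch (``img`` is assumed to be an RGB ``PIL.Image``)::
+
+        from nvidia.dali.experimental.torchvision import Compose, Resize, RandomHorizontalFlip
+
+        transform = Compose([Resize(size=256), RandomHorizontalFlip(p=0.5)])
+        out = transform(img)  # PIL input produces PIL output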
+ """ + + def __init__( + self, + op_list: List[Callable[..., Sequence[_DataNode] | _DataNode]], + batch_size: int = DEFAULT_BATCH_SIZE, + num_threads: int = DEFAULT_NUM_THREADS, + **dali_pipeline_kwargs, + ): + self.op_list = op_list + self.batch_size = batch_size + self.num_threads = num_threads + self.active_pipeline = None + self.dali_pipeline_kwargs = dali_pipeline_kwargs + + def _build_pipeline(self, data_input): + if isinstance(data_input, Image.Image): + self.active_pipeline = PipelineHWC( + self.op_list, self.batch_size, self.num_threads, **self.dali_pipeline_kwargs + ) + elif isinstance(data_input, torch.Tensor): + self.active_pipeline = PipelineCHW( + self.op_list, self.batch_size, self.num_threads, **self.dali_pipeline_kwargs + ) + else: + raise ValueError("Currently only PILImages and torch.Tensors are supported") + + def __call__(self, data_input): + """ + Runs the pipeline + + The ``Pipeline`` class builds a graph based on the operations list passed in + the constructor. Next, whenever the ``Compose`` object is called it starts the pipeline + and returns results. + + Parameters + ---------- + data_input: Tensor or PIL Image + In case of PIL image it will be converted to tensor before sending to pipeline + """ + + VerificationTensorOrImage.verify(data_input) + + if self.active_pipeline is None: + self._build_pipeline(data_input) + + self.active_pipeline.verify_layout(data_input) + + return self.active_pipeline.run(data_input=data_input) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/flips.py b/dali/python/nvidia/dali/experimental/torchvision/v2/flips.py new file mode 100644 index 00000000000..a3c6b209477 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/flips.py @@ -0,0 +1,83 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Literal +from .operator import Operator +import nvidia.dali.fn as fn + + +class RandomFlip(Operator): + """ + Randomly flips the given image randomly with a given probability. + + Parameters + ---------- + p : float + Probability of the image being flipped. Default value is 0.5 + horizontal : bool + Flip the horizontal dimension if True, vertical otherwise + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the flip. Can be ``"cpu"`` or ``"gpu"``. + """ + + def __init__( + self, p: float = 0.5, horizontal: bool = True, device: Literal["cpu", "gpu"] = "cpu" + ): + super().__init__(device=device) + self.prob = p + self.horizontal = horizontal + + def _kernel(self, data_input): + if self.horizontal: + data_input = fn.flip( + data_input, horizontal=fn.random.coin_flip(probability=self.prob), vertical=0 + ) + else: + data_input = fn.flip( + data_input, horizontal=0, vertical=fn.random.coin_flip(probability=self.prob) + ) + + return data_input + + +class RandomHorizontalFlip(RandomFlip): + """ + Randomly horizontally flips the given image randomly with a given probability. 
+ + Parameters + ---------- + p : float + Probability of the image being flipped. Default value is 0.5 + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the flip. Can be ``"cpu"`` or ``"gpu"``. + """ + + def __init__(self, p: float = 0.5, device: Literal["cpu", "gpu"] = "cpu"): + super().__init__(p, True, device) + + +class RandomVerticalFlip(RandomFlip): + """ + Randomly vertically flips the given image randomly with a given probability. + + Parameters + ---------- + p : float + Probability of the image being flipped. Default value is 0.5 + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the flip. Can be ``"cpu"`` or ``"gpu"``. + """ + + def __init__(self, p: float = 0.5, device: Literal["cpu", "gpu"] = "cpu"): + super().__init__(p, False, device) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py new file mode 100644 index 00000000000..fedd6bc05bb --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .flips import horizontal_flip, vertical_flip +from .resize import resize + +__all__ = [ + "horizontal_flip", + "resize", + "vertical_flip", +] diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/flips.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/flips.py new file mode 100644 index 00000000000..bb6763ff250 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/flips.py @@ -0,0 +1,39 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import nvidia.dali.experimental.dynamic as ndd + +import torch +from PIL import Image + +from ..operator import adjust_input # noqa: E402 + + +@adjust_input +def horizontal_flip(inpt: Image.Image | torch.Tensor) -> Image.Image | torch.Tensor: + """ + Horizontally flips the given tensor. + Refer to `HorizontalFlip` for more details. + """ + return ndd.flip(inpt, horizontal=1, vertical=0) + + +@adjust_input +def vertical_flip(inpt: Image.Image | torch.Tensor) -> Image.Image | torch.Tensor: + """ + Vertically flips the given tensor. + Refer to `VerticalFlip` for more details. 
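+
+    Example (a sketch, assuming a CHW ``uint8`` tensor)::
+
+        img = torch.zeros((3, 4, 4), dtype=torch.uint8)
+        img[:, 0, :] = 255        # white top row
+        out = vertical_flip(img)  # returns a torch.Tensor with the white row at the bottom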
+ """ + return ndd.flip(inpt, horizontal=0, vertical=1) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/functional/resize.py b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/resize.py new file mode 100644 index 00000000000..297217d9f11 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/functional/resize.py @@ -0,0 +1,73 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, List, Literal +import nvidia.dali.experimental.dynamic as ndd +from torchvision.transforms import InterpolationMode + +import torch +from PIL import Image + +from ..operator import adjust_input # noqa: E402 +from ..resize import Resize # noqa: E402 + + +@adjust_input +def resize( + inpt: Image.Image | torch.Tensor, + size: List[int], + interpolation: InterpolationMode = InterpolationMode.BILINEAR, + max_size: Optional[int] = None, + antialias: Optional[bool] = True, + device: Literal["cpu", "gpu"] = "cpu", +) -> Image.Image | torch.Tensor: + """ + Please refer to the ``Resize`` operator for more details. + """ + Resize.verify_args( + size=size, max_size=max_size, interpolation=interpolation, antialias=antialias + ) + + size_normalized = Resize.infer_effective_size(size, max_size) + interpolation = Resize.interpolation_modes[interpolation] + + if isinstance(inpt, ndd.Tensor): + inpt_shape = inpt.shape + elif isinstance(inpt, ndd.Batch): + inpt_shape = inpt.shape[0] # Batches have uniform layout + else: + raise TypeError(f"Input must be ndd.Tensor or ndd.Batch got {type(inpt)}") + + if inpt.layout in ["HWC", "NHWC"]: + original_h = inpt_shape[-3] + original_w = inpt_shape[-2] + elif inpt.layout in ["HW", "CHW", "NCHW"]: + original_h = inpt_shape[-2] + original_w = inpt_shape[-1] + else: + raise ValueError( + f"Unsupported layout: {inpt.layout!r}. Expected one of HWC, NHWC, CHW, NCHW." + ) + + target_h, target_w = Resize.calculate_target_size_dynamic_mode( + (original_h, original_w), size_normalized, max_size + ) + + return ndd.resize( + inpt, + device=device, + size=(target_h, target_w), + interp_type=interpolation, + antialias=antialias, + ) diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/operator.py b/dali/python/nvidia/dali/experimental/torchvision/v2/operator.py new file mode 100644 index 00000000000..8e612ac7479 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/operator.py @@ -0,0 +1,313 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Sequence, Literal + +from PIL import Image +import torch +import numpy as np + +import nvidia.dali.experimental.dynamic as ndd + + +class DataVerificationRule(ABC): + """ + Abstract base class for data verification rules + + Implement ``verify`` method in a child class raising an exception in case of failed verification + """ + + @classmethod + @abstractmethod + def verify(cls, data) -> None: + pass + + +class ArgumentVerificationRule(ABC): + """ + Abstract base class for input verification rules + + Implement ``verify`` method in a child class raising an exception in case of failed verification + """ + + @classmethod + @abstractmethod + def verify(cls, **kwargs) -> None: + pass + + +class VerificationIsTensor(DataVerificationRule): + """ + Verify if the data is a ``torch.Tensor``. + + Parameters + ---------- + data : any + Data to verify. Should be a ``torch.Tensor``. + """ + + @classmethod + def verify(cls, data): + if not isinstance(data, (torch.Tensor)): + raise TypeError(f"Data should be Tensor. Got {type(data)}") + + +class VerificationTensorOrImage(DataVerificationRule): + """ + Verify if the data is a ``torch.Tensor`` or ``PIL.Image``. + + Parameters + ---------- + data : any + Data to verify. Should be a ``torch.Tensor`` or ``PIL.Image``. + """ + + @classmethod + def verify(cls, data): + if not isinstance(data, (Image.Image, torch.Tensor)): + raise TypeError(f"inpt should be Tensor or PIL Image. Got {type(data)}") + + +class VerificationChannelCount(DataVerificationRule): + """ + Verify if input data has <= 4 channels. More channels are not supported in Torchvision + + Parameters + ---------- + data : any + Data to verify in CHW format. + """ + + CHANNELS = [1, 2, 3, 4] + + @classmethod + def verify(cls, data): + if ( + isinstance(data, torch.Tensor) + and data.shape[-3] not in VerificationChannelCount.CHANNELS + ): + raise ValueError( + f"Input should be in CHW if Tensor. \ + Supports up to {VerificationChannelCount.CHANNELS[-1]} channels, \ + got: {data.shape[-3]} channels" + ) + + +class VerifyIfPositive(ArgumentVerificationRule): + """ + Verify if the value is positive. + + Parameters + ---------- + values : any + Value to verify. Should be a positive numbers. + """ + + @classmethod + def verify(cls, *, values, name, **_) -> None: + if isinstance(values, (int, float)) and values <= 0: + raise ValueError(f"Value {name} must be positive, got {values}") + elif isinstance(values, (list, tuple)) and any(k <= 0 for k in values): + raise ValueError(f"Values {name} should be positive numbers, got {values}") + + +class VerifyIfRange(ArgumentVerificationRule): + """ + Verify if the value is a correct range: (min, max) + + Parameters + ---------- + values : any + Value to verify. Should be a range: (min, max) + """ + + @classmethod + def verify(cls, *, values, name, **_) -> None: + if isinstance(values, (list, tuple)) and len(values) == 2 and values[0] > values[1]: + raise ValueError(f"Values {name} should be (min, max), got {values}") + + +class VerifSizeDescriptor(ArgumentVerificationRule): + """ + Verify if the value can describe a size argument, which is: + - an integer + - or a sequence of length of 1, + - or a sequence of length of 2 + + Parameters + ---------- + size : any + Value to verify. Should be an integer or a sequence of length 1 or 2. 
+ """ + + @classmethod + def verify(cls, *, size, **_) -> None: + if not isinstance(size, (int, list, tuple)): + raise TypeError(f"Size must be int, list, or tuple, got {type(size)}") + elif isinstance(size, (list, tuple)) and len(size) > 2: + raise ValueError(f"Size sequence must have length 1 or 2, got {len(size)}") + VerifyIfPositive.verify(values=size, name="size") + + +class Operator(ABC): + """ + Abstract base class for operator specification + + Implement _kernel for algorithm specific processing + + ``arg_rules`` - a sequence of verification rules for algorithm's arguments. + ``input_rules`` - a sequence of verification rules for algorithm's input data. + ``preprocess_data`` - a function to preprocess the input data. + + Parameters + ---------- + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the operator. Can be ``"cpu"`` or ``"gpu"``. + **kwargs + Additional keyword arguments for the operator. + """ + + arg_rules: Sequence[ArgumentVerificationRule] = [] + input_rules: Sequence[DataVerificationRule] = [] + preprocess_data = None + + @classmethod + def verify_args(cls, **kwargs): + for rule in cls.arg_rules: + rule.verify(**kwargs) + + @classmethod + def verify_data(cls, data_input): + for rule in cls.input_rules: + rule.verify(data_input) + + def __init__(self, device: Literal["cpu", "gpu"] = "cpu", **kwargs): + self.device = device + type(self).verify_args(**kwargs) + + @abstractmethod + def _kernel(self, data_input): + """ + Algorithm's processing + """ + pass + + def __call__(self, data_input): + + type(self).verify_data(data_input) + + # Original input is transfered to GPU, before being preprocess_data. + # The preprocess_data creates an arbitrary tuple + if self.device == "gpu": + data_input = data_input.gpu() + + if type(self).preprocess_data: + data_input = type(self).preprocess_data(data_input) + + output = self._kernel(data_input) + + return output + + +def adjust_input(func): + """ + This decorator transforms the 1st argument of a function to internal DALI representation + according to the following rules: + - ``PIL.Image`` -> ``ndd.Tensor(layout = "HWC")`` + - ``torch.Tensor``: + - ``ndim == 3`` -> ``ndd.Tensor(layout = "CHW")`` + - ``ndim > 3`` -> ``ndd.Batch(layout = "CHW")`` + + Note: When new input types are supported this function will be extended. 
+ """ + + def transform_input(inpt) -> ndd.Tensor | ndd.Batch: + """ + Transforms supported inputs to either DALI tensor or batch + The following conversion rules apply: + - PIL Image -> ndd.Tensor(layout="HWC"), depending on the number of channels it outputs: + L, RGB, or RGBA mode + - torch.Tensor: + ndim==3 -> ndd.Tensor(layout = "CHW"), + ndim>3 -> ndd.Batch(layout="CHW") (workaround for DALI-4566; intended: layout="NCHW") + """ + mode = "RGB" + if isinstance(inpt, Image.Image): + mode = inpt.mode + if mode == "L": + _input = ndd.Tensor(np.array(inpt, copy=True), layout="HW") + elif mode in ["RGB", "RGBA"]: # Modes RGB, RGBA + _input = ndd.Tensor(np.array(inpt, copy=True), layout="HWC") + else: + raise ValueError(f"Mode {mode} is not supported, expected, L, RGB, RGBA") + elif isinstance(inpt, torch.Tensor): + if inpt.ndim == 3: + _input = ndd.Tensor(inpt, layout="CHW") + elif inpt.ndim > 3: + # Creating baches of NHWC does not work, because of: + # https://jirasw.nvidia.com/browse/DALI-4566 + # It should be implemented as: + # _input = ndd.as_batch(inpt, layout="NCHW") + # currently workarounded as: + _input = ndd.as_batch(ndd.as_tensor(inpt), layout="CHW") + else: + raise TypeError(f"Tensor has < 3 dimensions: {inpt.ndim}, shape: {inpt.shape}") + else: + raise TypeError(f"Data type: {type(inpt)} is not supported") + + return _input, mode + + def adjust_output( + output: ndd.Tensor | ndd.Batch, inpt, mode: str = "RGB" + ) -> Image.Image | torch.Tensor: + """ + Adjusts output to match the original input type or operator's result + Depending on the inpt: + - PIL Image: output ndd.Tensor -> PIL Image with applicable mode ("L", "RGB", "RGBA") + - torch.Tensor: + ndd.Batch -> torch.Tensor with leading dimension as a number of samples in batch + ndd.Tensor -> torch.Tensor + """ + if isinstance(inpt, Image.Image): + if output.shape[-1] == 1: + output = np.asarray(output).squeeze(-1) + mode = "L" + return Image.fromarray(np.asarray(output), mode=mode) + elif isinstance(inpt, torch.Tensor): + # For input being torch.Tensor only ndd.Batch or ndd.Tensor is allowed as output + if isinstance(output, ndd.Batch): + output = ndd.as_tensor(output) + elif not isinstance(output, ndd.Tensor): + raise TypeError(f"Invalid output type: {type(output)}") + + # This is WAR for DLPpack not supporting pinned memory, see: + # https://github.com/pytorch/pytorch/issues/136250H + if output.device.device_type == "cpu": + output = np.asarray(output) + + return torch.as_tensor(output) + else: + return output + + def inner_function(inpt, *args, **kwargs): + + _input, mode = transform_input(inpt) + output = func(_input, *args, **kwargs) + + output = output.evaluate() + + return adjust_output(output, inpt, mode) + + return inner_function diff --git a/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py b/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py new file mode 100644 index 00000000000..a8450734805 --- /dev/null +++ b/dali/python/nvidia/dali/experimental/torchvision/v2/resize.py @@ -0,0 +1,301 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Sequence, Literal + +from .operator import Operator, ArgumentVerificationRule + +import nvidia.dali as dali +import nvidia.dali.fn as fn +from nvidia.dali.types import DALIInterpType + +from torchvision.transforms import InterpolationMode +import numpy as np + + +def get_inputHW(data_input): + """ + Gets the height and width of the input data. + + Parameters + ---------- + data_input : Tensor + Input data to get the height and width of. + + Returns + ------- + input_height : int + Height of the input data. + input_width : int + Width of the input data. + """ + layout = data_input.property("layout")[0] + + # If data layout is NHWC or NCHW, check the next character + if layout == np.frombuffer(bytes("N", "utf-8"), dtype=np.uint8)[0]: + layout = data_input.property("layout")[1] + + # CHW + if layout == np.frombuffer(bytes("C", "utf-8"), dtype=np.uint8)[0]: + input_height = data_input.shape()[-2] + input_width = data_input.shape()[-1] + # HWC + else: + input_height = data_input.shape()[-3] + input_width = data_input.shape()[-2] + + return input_height, input_width, data_input + + +class VerificationSize(ArgumentVerificationRule): + @classmethod + def verify(cls, *, size, max_size, interpolation, **_): + if size is not None and not isinstance(size, int) and not isinstance(size, (tuple, list)): + raise ValueError( + "Invalid combination: size must be int, None, or sequence of two ints. " + "max_size only applies when size is int or None." + ) + if size is None and max_size is None: + raise ValueError("Must provide max_size if size is None.") + if size is not None and max_size is not None and np.min(size) > max_size: + raise ValueError("max_size should not be smaller than the actual size") + if max_size is not None and np.min(max_size) < 0: + raise ValueError(f"max_size must not be smaller than 0, got{max_size}") + if isinstance(size, (tuple, list)) and len(size) == 2 and max_size is not None: + raise ValueError( + "max_size should only be passed if size specifies the length of the smaller \ + edge, i.e. size should be an int" + ) + + if interpolation in Resize.not_supported_interpolation_modes: + raise NotImplementedError(f"Interpolation mode: {interpolation} is not supported") + + if interpolation not in Resize.interpolation_modes.keys(): + raise ValueError(f"Interpolation {type(interpolation)} is not supported") + + +class Resize(Operator): + """ + Resize the input image to the given size + If the image is torch Tensor, it is expected to have […, H, W] shape, where … means a maximum + of two leading dimensions + + Parameters + ---------- + size:sequence or int + Desired output size. If size is a sequence like (h, w), output size will be matched + to this. If size is an int, smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to (size * height / width, size). + interpolation : InterpolationMode or int + ``torchvision.transforms.InterpolationMode``. Default is InterpolationMode.BILINEAR. 
+ If input is Tensor, only ``InterpolationMode.NEAREST``, + ``InterpolationMode.NEAREST_EXACT``, ``InterpolationMode.BILINEAR`` and + ``InterpolationMode.BICUBIC`` are supported. + max_size : int, optional + The maximum allowed for the longer edge of the resized image. If the longer edge of + the image is greater than max_size after being resized according to size, size will + be overruled so that the longer edge is equal to max_size. As a result, the smaller + edge may be shorter than size. This is only supported if size is an int. + antialias : bool, optional + Whether to apply antialiasing. If ``True``, antialiasing will be applied. If ``False``, + antialiasing will not be applied. + device : Literal["cpu", "gpu"], optional, default = "cpu" + Device to use for the resize. Can be ``"cpu"`` or ``"gpu"``. + """ + + # 'NEAREST', 'NEAREST_EXACT', 'BILINEAR', 'BICUBIC', 'BOX', 'HAMMING', 'LANCZOS' + interpolation_modes = { + InterpolationMode.NEAREST: DALIInterpType.INTERP_NN, + InterpolationMode.BILINEAR: DALIInterpType.INTERP_LINEAR, + InterpolationMode.BICUBIC: DALIInterpType.INTERP_CUBIC, + InterpolationMode.LANCZOS: DALIInterpType.INTERP_LANCZOS3, + # Not supported, but need to be here to not generate ValueError during VerificationSize + InterpolationMode.NEAREST_EXACT: DALIInterpType.INTERP_NN, + InterpolationMode.BOX: DALIInterpType.INTERP_NN, + InterpolationMode.HAMMING: DALIInterpType.INTERP_NN, + } + + not_supported_interpolation_modes = [ + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BOX, + InterpolationMode.HAMMING, + ] + + arg_rules = [VerificationSize] + preprocess_data = get_inputHW + + @classmethod + def infer_effective_size( + cls, + size: Optional[int | Sequence[int]], + max_size: Optional[int] = None, + ) -> Optional[int | Sequence[int]]: + """Normalizes the size parameter. Called once at initialization. + + Returns the size in a canonical form: + + - ``int`` — resize the shorter edge to this value (aspect-ratio preserving) + - ``None`` — use ``max_size`` only (resize so longer edge equals ``max_size``) + - ``(h, w)`` tuple/list — resize to the exact target dimensions + """ + if isinstance(size, (tuple, list)) and len(size) == 1: + size = size[0] + return size + + @classmethod + def calculate_target_size_dynamic_mode( + cls, + orig_size: Sequence[int], + size: Optional[int | Sequence[int]], + max_size: Optional[int], + ): + """Computes the output ``(out_h, out_w)`` compatible with ``torchvision.v2.Resize``. + + Called per resize invocation with the actual input shape. 
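+
+        For example (a worked case): with ``size=256`` and ``max_size=512``, a ``500x1000``
+        input resizes to ``(256, 512)``: the shorter edge is matched to 256 and the longer
+        edge (512) does not exceed ``max_size``.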
+
+        Note: This method needs to be called only when in Dynamic Mode.
+        Unfortunately, both methods are needed because graph creation struggles with proper
+        translation of class-method calls.
+        """
+        orig_h = orig_size[0]
+        orig_w = orig_size[1]
+
+        if isinstance(size, (tuple, list)):
+            # Exact target dimensions — return directly
+            return size[0], size[1]
+
+        if size is None:
+            # Only max_size given: resize so the longer edge equals max_size
+            if orig_h >= orig_w:
+                return max_size, int(max_size * orig_w / orig_h)
+            else:
+                return int(max_size * orig_h / orig_w), max_size
+
+        # size is int: resize the shorter edge to size, maintaining aspect ratio
+        s = size
+        if orig_h <= orig_w:
+            # height is the shorter (or equal) edge
+            out_h = s
+            out_w = int(s * orig_w / orig_h)
+            if max_size is not None and out_w > max_size:
+                out_h = int(max_size * out_h / out_w)
+                out_w = max_size
+        else:
+            # width is the shorter edge
+            out_h = int(s * orig_h / orig_w)
+            out_w = s
+            if max_size is not None and out_h > max_size:
+                out_w = int(max_size * out_w / out_h)
+                out_h = max_size
+
+        return out_h, out_w
+
+    @classmethod
+    def calculate_target_size_pipeline_mode(
+        cls,
+        orig_size: Sequence[int],
+        size: Optional[int | Sequence[int]],
+        max_size: Optional[int],
+    ):
+        """Computes the output ``(out_h, out_w)`` compatible with ``torchvision.v2.Resize``.
+
+        Called per resize invocation with the actual input shape.
+
+        Note: This method needs to be called only when in Pipeline Mode.
+        """
+        orig_h = orig_size[0]
+        orig_w = orig_size[1]
+
+        if isinstance(size, (tuple, list)):
+            # Exact target dimensions — return directly
+            return size[0], size[1]
+
+        if size is None:
+            # Only max_size given: resize so the longer edge equals max_size
+            if orig_h >= orig_w:
+                return max_size, fn.cast(
+                    dali.math.floor(max_size * orig_w / orig_h), dtype=dali.types.INT32
+                )
+            else:
+                return (
+                    fn.cast(dali.math.floor(max_size * orig_h / orig_w), dtype=dali.types.INT32),
+                    max_size,
+                )
+
+        # size is int: resize the shorter edge to size, maintaining aspect ratio
+        s = size
+        if orig_h <= orig_w:
+            # height is the shorter (or equal) edge
+            out_h = s
+            out_w = fn.cast(dali.math.floor(s * orig_w / orig_h), dtype=dali.types.INT32)
+            if max_size is not None and out_w > max_size:
+                out_h = fn.cast(dali.math.floor(max_size * out_h / out_w), dtype=dali.types.INT32)
+                out_w = max_size
+        else:
+            # width is the shorter edge
+            out_h = fn.cast(dali.math.floor(s * orig_h / orig_w), dtype=dali.types.INT32)
+            out_w = s
+            if max_size is not None and out_h > max_size:
+                out_w = fn.cast(dali.math.floor(max_size * out_w / out_h), dtype=dali.types.INT32)
+                out_h = max_size
+
+        return out_h, out_w
+
+    def __init__(
+        self,
+        size: Optional[int | Sequence[int]],
+        interpolation: InterpolationMode | int = InterpolationMode.BILINEAR,
+        max_size: Optional[int] = None,
+        antialias: Optional[bool] = True,
+        device: Literal["cpu", "gpu"] = "cpu",
+    ):
+        super().__init__(
+            device=device,
+            size=size,
+            max_size=max_size,
+            interpolation=interpolation,
+        )
+
+        self.size = size
+        self.max_size = max_size
+        self.interpolation = Resize.interpolation_modes[interpolation]
+        self.size_normalized = Resize.infer_effective_size(size, max_size)
+        self.antialias = antialias
+
+    def _kernel(self, data_input):
+        """
+        Performs the resize. The method infers the requested size in compliance
+        with the ``torchvision.transforms.Resize`` documentation and applies the DALI operator
+        on the ``data_input``.
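+
+        Note: in Pipeline Mode the input extents are DataNodes, which is why
+        ``calculate_target_size_pipeline_mode`` performs the sizing arithmetic with
+        ``dali.math.floor`` and ``fn.cast`` instead of plain Python ``int``.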
+ """ + + in_h, in_w, data_input = data_input + + target_h, target_w = Resize.calculate_target_size_pipeline_mode( + (in_h, in_w), + self.size_normalized, + self.max_size, + ) + + return fn.resize( + data_input, + device=self.device, + size=fn.stack( + fn.cast(target_h, dtype=dali.types.FLOAT), + fn.cast(target_w, dtype=dali.types.FLOAT), + ), + interp_type=self.interpolation, + antialias=self.antialias, + ) diff --git a/dali/test/python/torchvision/test_tv_compose.py b/dali/test/python/torchvision/test_tv_compose.py new file mode 100644 index 00000000000..cc2792dcd59 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_compose.py @@ -0,0 +1,203 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from nvidia.dali.experimental.torchvision import ( + Compose, + RandomHorizontalFlip, + RandomVerticalFlip, + Resize, +) + +from nose2.tools import params +from nose_utils import assert_raises +import numpy as np +from PIL import Image +import torchvision.transforms.v2 as tv +import torch + + +def read_filepath(path): + return np.frombuffer(path.encode(), dtype=np.int8) + + +def make_test_tensor(shape=(5, 10, 10, 1)): + total = 1 + for s in shape: + total *= s + return torch.arange(total).reshape(shape).to(dtype=torch.uint8) + + +dali_extra = os.environ["DALI_EXTRA_PATH"] +jpeg = os.path.join(dali_extra, "db", "single", "jpeg") +jpeg_113 = os.path.join(jpeg, "113") +test_files = [ + os.path.join(jpeg_113, f) + for f in ["snail-4291306_1280.jpg", "snail-4345504_1280.jpg", "snail-4368154_1280.jpg"] +] +test_input_filenames = [read_filepath(fname) for fname in test_files] + + +def test_compose_tensor(): + test_tensor = make_test_tensor(shape=(5, 3, 5, 5)) + dali_pipeline = Compose([RandomHorizontalFlip(p=1.0)], batch_size=test_tensor.shape[0]) + dali_out = dali_pipeline(test_tensor) + tv_out = tv.RandomHorizontalFlip(p=1.0)(test_tensor) + + assert isinstance(dali_out, torch.Tensor) + assert torch.equal(dali_out, tv_out) + + +def test_compose_multi_tensor(): + test_tensor = make_test_tensor(shape=(5, 3, 5, 5)) + dali_pipeline = Compose( + [Resize(size=(15, 15)), RandomHorizontalFlip(p=1.0), RandomVerticalFlip(p=1.0)], + batch_size=test_tensor.shape[0], + ) + dali_out = dali_pipeline(test_tensor) + tv_pipeline = tv.Compose( + [tv.Resize(size=(15, 15)), tv.RandomHorizontalFlip(p=1.0), tv.RandomVerticalFlip(p=1.0)] + ) + tv_out = tv_pipeline(test_tensor) + + assert isinstance(dali_out, torch.Tensor) + # All close, because there are pixel differences due to resize + assert torch.allclose(dali_out, tv_out, rtol=0, atol=1), f"Should be {tv_out} is {dali_out}" + + +def test_compose_invalid_batch_tensor(): + test_tensor = make_test_tensor(shape=(5, 1, 5, 5)) + with assert_raises(RuntimeError): + dali_pipeline = Compose([RandomHorizontalFlip(p=1.0)], batch_size=1) + _ = dali_pipeline(test_tensor) + + +def test_compose_images(): + dali_transform = Compose([RandomHorizontalFlip(p=1.0)]) + tv_transform = 
tv.Compose([tv.RandomHorizontalFlip(p=1.0)]) + + for fn in test_files: + img = Image.open(fn) + out_dali_img = dali_transform(img) + + assert isinstance(out_dali_img, Image.Image) + + tensor_dali_tv = tv.functional.pil_to_tensor(out_dali_img) + tensor_tv = tv.functional.pil_to_tensor(tv_transform(img)) + + assert tensor_dali_tv.shape == tensor_tv.shape + assert torch.equal(tensor_dali_tv, tensor_tv) + + +def test_compose_images_multi(): + dali_transform = Compose([RandomVerticalFlip(p=1.0), RandomHorizontalFlip(p=1.0)]) + tv_transform = tv.Compose([tv.RandomVerticalFlip(p=1.0), tv.RandomHorizontalFlip(p=1.0)]) + + for fn in test_files: + img = Image.open(fn) + out_dali_img = dali_transform(img) + + assert isinstance(out_dali_img, Image.Image) + + tensor_dali_tv = tv.functional.pil_to_tensor(out_dali_img) + tensor_tv = tv.functional.pil_to_tensor(tv_transform(img)) + + assert tensor_dali_tv.shape == tensor_tv.shape + assert torch.equal(tensor_dali_tv, tensor_tv) + + +def test_compose_invalid_type_images(): + dali_transform = Compose([RandomHorizontalFlip(p=1.0)]) + + for fn in test_files: + img = Image.open(fn) + with assert_raises(TypeError): + _ = dali_transform([img, img, img]) + + +def _make_pil_image(mode, h=50, w=60, seed=42): + rng = np.random.default_rng(seed) + if mode == "L": + data = rng.integers(0, 256, (h, w), dtype=np.uint8) + elif mode == "RGB": + data = rng.integers(0, 256, (h, w, 3), dtype=np.uint8) + elif mode == "RGBA": + data = rng.integers(0, 256, (h, w, 4), dtype=np.uint8) + else: + raise ValueError(f"Unsupported mode: {mode}") + return Image.fromarray(data, mode=mode) + + +@params("RGB", "L", "RGBA") +def test_compose_pil_mode_flip(mode): + """Horizontal flip must produce a pixel-exact match with torchvision for all PIL modes.""" + img = _make_pil_image(mode) + dali_transform = Compose([RandomHorizontalFlip(p=1.0)]) + tv_transform = tv.Compose([tv.RandomHorizontalFlip(p=1.0)]) + + out_dali = dali_transform(img) + out_tv = tv_transform(img) + + assert isinstance(out_dali, Image.Image) + assert out_dali.mode == mode, f"Mode changed: expected {mode}, got {out_dali.mode}" + assert torch.equal( + tv.functional.pil_to_tensor(out_dali), + tv.functional.pil_to_tensor(out_tv), + ), f"Pixel mismatch for mode {mode}" + + +@params("RGB", "L", "RGBA") +def test_compose_pil_mode_resize(mode): + """Resize must produce the correct output shape and preserve PIL mode.""" + img = _make_pil_image(mode) + target = (30, 40) + dali_transform = Compose([Resize(size=target)]) + tv_transform = tv.Compose([tv.Resize(size=target)]) + + out_dali = dali_transform(img) + out_tv = tv_transform(img) + + assert isinstance(out_dali, Image.Image) + assert out_dali.mode == mode, f"Mode changed: expected {mode}, got {out_dali.mode}" + # PIL size is (w, h); compare as (h, w) to match the target convention + assert ( + out_dali.size == out_tv.size + ), f"Size mismatch for mode {mode}: {out_dali.size} != {out_tv.size}" + + +@params("RGB", "L", "RGBA") +def test_compose_pil_mode_multi_op(mode): + """Chained flip+resize must preserve mode and match torchvision output shape.""" + img = _make_pil_image(mode) + dali_transform = Compose([Resize(size=(30, 40)), RandomHorizontalFlip(p=1.0)]) + tv_transform = tv.Compose([tv.Resize(size=(30, 40)), tv.RandomHorizontalFlip(p=1.0)]) + + out_dali = dali_transform(img) + out_tv = tv_transform(img) + + assert isinstance(out_dali, Image.Image) + assert out_dali.mode == mode, f"Mode changed: expected {mode}, got {out_dali.mode}" + assert ( + out_dali.size == out_tv.size + ), 
f"Size mismatch for mode {mode}: {out_dali.size} != {out_tv.size}" + + +@params("RGB", "L", "RGBA") +def test_compose_pil_invalid_input_type_raises(mode): + """Passing a list instead of a PIL Image must raise TypeError regardless of mode.""" + img = _make_pil_image(mode) + dali_transform = Compose([RandomHorizontalFlip(p=1.0)]) + with assert_raises(TypeError): + _ = dali_transform([img, img]) diff --git a/dali/test/python/torchvision/test_tv_flips.py b/dali/test/python/torchvision/test_tv_flips.py new file mode 100644 index 00000000000..67869169989 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_flips.py @@ -0,0 +1,71 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torchvision.transforms.v2 as tv + +from nose2.tools import params +from nvidia.dali.experimental.torchvision import Compose, RandomHorizontalFlip, RandomVerticalFlip +from nvidia.dali.experimental.torchvision.v2.functional import horizontal_flip, vertical_flip + + +def make_test_tensor(shape=(1, 3, 10, 10)): + total = 1 + for s in shape: + total *= s + return torch.arange(total).reshape(shape) + + +@params("gpu", "cpu") +def test_horizontal_random_flip_probability(device): + img = make_test_tensor() + transform = Compose([RandomHorizontalFlip(p=1.0, device=device)]) # always flip + out = transform(img).cpu() + out_tv = tv.RandomHorizontalFlip(p=1.0)(img) + out_fn = horizontal_flip(img).cpu() + assert torch.equal(out, out_tv) + assert torch.equal(out_fn, out_tv) + + transform = Compose([RandomHorizontalFlip(p=0.0, device=device)]) # never flip + out = transform(img).cpu() + assert torch.equal(out, img) + + +@params("gpu", "cpu") +def test_vertical_random_flip_probability(device): + img = make_test_tensor() + transform = Compose([RandomVerticalFlip(p=1.0, device=device)]) # always flip + out = transform(img).cpu() + out_tv = tv.RandomVerticalFlip(p=1.0)(img) + out_fn = vertical_flip(img).cpu() + assert torch.equal(out, out_tv) + assert torch.equal(out, out_fn) + + transform = Compose([RandomVerticalFlip(p=0.0, device=device)]) # never flip + out = transform(img).cpu() + assert torch.equal(out, img) + + +def test_flip_preserves_shape(): + img = make_test_tensor((1, 3, 15, 20)) + hflip_pipeline = Compose([RandomHorizontalFlip(p=1.0)]) + hflip_fn = horizontal_flip(img).cpu() + hflip = hflip_pipeline(img) + vflip_pipeline = Compose([RandomVerticalFlip(p=1.0)]) + vflip_fn = vertical_flip(img).cpu() + vflip = vflip_pipeline(img) + assert hflip.shape == img.shape + assert vflip.shape == img.shape + assert hflip_fn.shape == img.shape + assert vflip_fn.shape == img.shape diff --git a/dali/test/python/torchvision/test_tv_resize.py b/dali/test/python/torchvision/test_tv_resize.py new file mode 100644 index 00000000000..cadaa331090 --- /dev/null +++ b/dali/test/python/torchvision/test_tv_resize.py @@ -0,0 +1,281 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Sequence, Literal, Union + +import numpy as np +from nose2.tools import params, cartesian_params +from nose_utils import assert_raises +from PIL import Image +import torch +import torchvision.transforms.v2 as transforms +import torchvision.transforms.v2.functional as fn_tv + +from nvidia.dali.experimental.torchvision import Resize, Compose +import nvidia.dali.experimental.torchvision.v2.functional as fn_dali + + +def read_filepath(path): + return np.frombuffer(path.encode(), dtype=np.int8) + + +dali_extra = os.environ["DALI_EXTRA_PATH"] +jpeg = os.path.join(dali_extra, "db", "single", "jpeg") +jpeg_113 = os.path.join(jpeg, "113") +test_files = [ + os.path.join(jpeg_113, f) + for f in ["snail-4291306_1280.jpg", "snail-4345504_1280.jpg", "snail-4368154_1280.jpg"] +] +test_input_filenames = [read_filepath(fname) for fname in test_files] + + +def build_resize_transform( + resize: int | Sequence[int], + max_size: int = None, + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = False, + device: Literal["cpu", "gpu"] = "cpu", +): + t = transforms.Compose( + [ + transforms.Resize( + size=resize, max_size=max_size, interpolation=interpolation, antialias=antialias + ), + ] + ) + td = Compose( + [ + Resize( + size=resize, + max_size=max_size, + interpolation=interpolation, + antialias=antialias, + device=device, + ), + ] + ) + return t, td + + +def _internal_loop( + input_data: Union[Image.Image, torch.Tensor], + t: transforms.Resize, + td: Resize, + resize: int | Sequence[int], + max_size: int = None, + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = False, +): + out_fn = fn_tv.resize( + input_data, + size=resize, + max_size=max_size, + interpolation=interpolation, + antialias=antialias, + ) + out_dali_fn = fn_dali.resize( + input_data, + size=resize, + max_size=max_size, + interpolation=interpolation, + antialias=antialias, + ) + out_tv = t(input_data) + out_dali_tv = td(input_data) + + if isinstance(input_data, Image.Image): + out_tv = transforms.functional.pil_to_tensor(out_tv).unsqueeze(0).permute(0, 2, 3, 1) + out_dali_tv = ( + transforms.functional.pil_to_tensor(out_dali_tv).unsqueeze(0).permute(0, 2, 3, 1) + ) + out_fn = transforms.functional.pil_to_tensor(out_fn) + out_dali_fn = transforms.functional.pil_to_tensor(out_dali_fn) + + assert ( + out_tv.shape[1:3] == out_dali_tv.shape[1:3] + ), f"Should be:{out_tv.shape} is:{out_dali_tv.shape}" + assert ( + out_fn.shape[1:3] == out_dali_fn.shape[1:3] + ), f"Should be:{out_fn.shape} is:{out_dali_fn.shape}" + + # TODO: + # assert torch.allclose(out_tv, out_dali_tv, rtol=1, atol=1) + # assert torch.allclose(out_fn, out_dali_fn, rtol=1, atol=1) + + +def loop_images_test_no_build( + t: transforms.Resize, + td: Resize, + resize: int | Sequence[int], + max_size: int = None, + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + 
antialias: bool = False, +): + for fn in test_files: + img = Image.open(fn) + _internal_loop(img, t, td, resize, max_size, interpolation, antialias) + + +def build_tensors(max_size: int = 512, channels: int = 3, seed=12345): + + torch.manual_seed(seed) + + h = torch.randint(10, max_size, (1,)).item() + w = torch.randint(10, max_size, (1,)).item() + tensors = [ + torch.ones((channels, max_size, max_size)), + torch.ones((1, channels, max_size, max_size)), + torch.ones((10, channels, max_size, max_size)), + torch.ones((channels, max_size // 2, max_size)), + torch.ones((1, channels, max_size // 2, max_size)), + torch.ones((10, channels, max_size // 2, max_size)), + torch.ones((channels, max_size, max_size // 2)), + torch.ones((1, channels, max_size, max_size // 2)), + torch.ones((10, channels, max_size, max_size // 2)), + torch.ones((channels, h, w)), + torch.ones((1, channels, h, w)), + torch.ones((10, channels, h, w)), + ] + + return tensors + + +def loop_tensors_test( + resize: int | Sequence[int], + max_size: int = None, + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = False, + device: Literal["cpu", "gpu"] = "cpu", +): + t, td = build_resize_transform(resize, max_size, interpolation, antialias, device) + tensors = build_tensors() + + for tn in tensors: + _internal_loop(tn, t, td, resize, max_size, interpolation, antialias) + + +def loop_images_test( + resize: int | Sequence[int], + max_size: int = None, + interpolation: transforms.InterpolationMode = transforms.InterpolationMode.BILINEAR, + antialias: bool = False, + device: Literal["cpu", "gpu"] = "cpu", +): + t, td = build_resize_transform(resize, max_size, interpolation, antialias, device) + loop_images_test_no_build(t, td, resize, max_size, interpolation, antialias) + + +@cartesian_params((512, 1125, 2048, ([512, 512]), ([2048, 2048])), ("cpu", "gpu")) +def test_resize_sizes_images(resize, device): + # Resize with single int (preserve aspect ratio) + loop_images_test(resize=resize, device=device) + + +@cartesian_params((512, 1125, 2048, ([512, 512]), ([2048, 2048])), ("cpu", "gpu")) +def test_resize_sizes_tensors(resize, device): + # Resize with single int (preserve aspect ratio) + loop_tensors_test(resize=resize, device=device) + + +@params((480, 512), (100, 124), (None, 512), (1024, 512), ([256, 256], 512), (None, None)) +def test_resize_max_sizes(resize, max_size): + # Resize with single int (preserve aspect ratio) + if resize is not None and max_size is not None and np.min(np.array(resize, int)) > max_size: + + """ + with assert_raises(ValueError): + _ = transforms.Resize(resize, max_size) + This exception is called later - when executing the operation + """ + + with assert_raises(ValueError): + _ = Compose( + [ + Resize(resize, max_size=max_size), + ] + ) + return + if resize is None and max_size is None: + with assert_raises(ValueError): + _ = transforms.Resize(resize, max_size) + + with assert_raises(ValueError): + _ = Compose( + [ + Resize(resize, max_size=max_size), + ] + ) + return + + if isinstance(resize, Sequence) and len(resize) != 1 and max_size is not None: + """ + with assert_raises(ValueError): + _ = transforms.Resize(resize, max_size) + This exception is called later - when executing the operation + """ + + with assert_raises(ValueError): + _ = Compose( + [ + Resize(resize, max_size=max_size), + ] + ) + return + + loop_images_test(resize=resize, max_size=max_size) + + +@cartesian_params( + ( + 640, + 768, + 1024, + ([512, 512]), + ([256, 256]), + ), + ( + 
transforms.InterpolationMode.NEAREST,
+        transforms.InterpolationMode.NEAREST_EXACT,
+        transforms.InterpolationMode.BILINEAR,
+        transforms.InterpolationMode.BICUBIC,
+    ),
+    ("cpu", "gpu"),
+)
+def test_resize_interpolation(resize, interpolation, device):
+    if interpolation == transforms.InterpolationMode.NEAREST_EXACT:
+        with assert_raises(NotImplementedError):
+            loop_images_test(resize=resize, interpolation=interpolation, device=device)
+    else:
+        loop_images_test(resize=resize, interpolation=interpolation, device=device)
+
+
+@cartesian_params((512, 768, 2048, ([512, 512]), ([2048, 2048])), (True, False), ("cpu", "gpu"))
+def test_resize_antialiasing(resize, antialiasing, device):
+    loop_images_test(resize=resize, antialias=antialiasing, device=device)
+
+
+@cartesian_params((8192, 8193, 10243), ("cpu", "gpu"))
+def test_large_sizes_images(resize, device):
+    loop_images_test(resize=resize, device=device)
+
+
+"""
+These tests are too heavy; they would cause timeouts:
+
+@cartesian_params((8192, 8193, 10243), ("cpu", "gpu"))
+def test_large_sizes_tensors(resize, device):
+    # Resize with single int (preserve aspect ratio)
+    loop_tensors_test(resize=resize, device=device)
+"""