diff --git a/docs/source/video_load.mdx b/docs/source/video_load.mdx index 965b28f8822..d9c2e9abcca 100644 --- a/docs/source/video_load.mdx +++ b/docs/source/video_load.mdx @@ -3,12 +3,12 @@ > [!WARNING] > Video support is experimental and is subject to change. -Video datasets have [`Video`] type columns, which contain `torchvision` objects. +Video datasets have [`Video`] type columns, which contain `torchcodec` objects. > [!TIP] -> To work with video datasets, you need to have the `torchvision` and `av` packages installed. Check out the [installation](https://github.com/pytorch/vision#installation) guide to learn how to install them. +> To work with video datasets, you need to have the `torchcodec` and `ffmpeg` packages installed. Check out the [installation](https://github.com/meta-pytorch/torchcodec#installing-torchcodec) guide to learn how to install them. -When you load a video dataset and call the video column, the videos are decoded as `torchvision` Videos: +When you load a video dataset and access the video column, the videos are decoded as `torchcodec` `VideoDecoder` objects: ```py >>> from datasets import load_dataset, Video @@ -193,7 +193,7 @@ For more details on working with Lance datasets, see the [Lance documentation](h ## Video decoding -By default, videos are decoded sequentially as torchvision `VideoReaders` when you iterate on a dataset. +By default, videos are decoded sequentially as torchcodec `VideoDecoders` when you iterate on a dataset. It sequentially decodes the metadata of the videos, and doesn't read the video frames until you access them. However it is possible to speed up the dataset significantly using multithreaded decoding: @@ -209,7 +209,7 @@ However it is possible to speed up the dataset significantly using multithreaded You can enable multithreading using `num_threads`. This is especially useful to speed up remote data streaming. However it can be slower than `num_threads=0` for local data on fast disks. 
-If you are not interested in the videos decoded as torchvision `VideoReaders` and would like to access the path/bytes instead, you can disable decoding: +If you are not interested in the videos decoded as torchcodec `VideoDecoders` and would like to access the path/bytes instead, you can disable decoding: ```python >>> dataset = dataset.decode(False) diff --git a/src/datasets/formatting/jax_formatter.py b/src/datasets/formatting/jax_formatter.py index c52ef7a4d59..8d0cad4e38c 100644 --- a/src/datasets/formatting/jax_formatter.py +++ b/src/datasets/formatting/jax_formatter.py @@ -107,10 +107,13 @@ def _tensorize(self, value): if isinstance(value, PIL.Image.Image): value = np.asarray(value) if config.TORCHVISION_AVAILABLE and "torchvision" in sys.modules: - from torchvision.io import VideoReader + try: + from torchvision.io import VideoReader - if isinstance(value, VideoReader): - return value # TODO(QL): set output to jax arrays ? + if isinstance(value, VideoReader): + return value # TODO(QL): set output to jax arrays ? + except ImportError: + pass if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: from torchcodec.decoders import AudioDecoder, VideoDecoder diff --git a/src/datasets/formatting/np_formatter.py b/src/datasets/formatting/np_formatter.py index 062d199c6f6..eee14a35eeb 100644 --- a/src/datasets/formatting/np_formatter.py +++ b/src/datasets/formatting/np_formatter.py @@ -64,10 +64,13 @@ def _tensorize(self, value): if isinstance(value, PIL.Image.Image): return np.asarray(value, **self.np_array_kwargs) if config.TORCHVISION_AVAILABLE and "torchvision" in sys.modules: - from torchvision.io import VideoReader + try: + from torchvision.io import VideoReader - if isinstance(value, VideoReader): - return value # TODO(QL): set output to np arrays ? + if isinstance(value, VideoReader): + return value # TODO(QL): set output to np arrays ? 
+ except ImportError: + pass if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: from torchcodec.decoders import AudioDecoder, VideoDecoder diff --git a/src/datasets/formatting/tf_formatter.py b/src/datasets/formatting/tf_formatter.py index 1a20eb31d1d..028b998d5d1 100644 --- a/src/datasets/formatting/tf_formatter.py +++ b/src/datasets/formatting/tf_formatter.py @@ -71,10 +71,13 @@ def _tensorize(self, value): if isinstance(value, PIL.Image.Image): value = np.asarray(value) if config.TORCHVISION_AVAILABLE and "torchvision" in sys.modules: - from torchvision.io import VideoReader + try: + from torchvision.io import VideoReader - if isinstance(value, VideoReader): - return value # TODO(QL): set output to tf tensors ? + if isinstance(value, VideoReader): + return value # TODO(QL): set output to tf tensors ? + except ImportError: + pass if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: from torchcodec.decoders import AudioDecoder, VideoDecoder diff --git a/src/datasets/formatting/torch_formatter.py b/src/datasets/formatting/torch_formatter.py index 3501f9368be..a41dac9ed66 100644 --- a/src/datasets/formatting/torch_formatter.py +++ b/src/datasets/formatting/torch_formatter.py @@ -77,10 +77,13 @@ def _tensorize(self, value): value = value.transpose((2, 0, 1)) if config.TORCHVISION_AVAILABLE and "torchvision" in sys.modules: - from torchvision.io import VideoReader + try: + from torchvision.io import VideoReader - if isinstance(value, VideoReader): - return value # TODO(QL): set output to torch tensors ? + if isinstance(value, VideoReader): + return value # TODO(QL): set output to torch tensors ? 
+ except ImportError: + pass if config.TORCHCODEC_AVAILABLE and "torchcodec" in sys.modules: from torchcodec.decoders import AudioDecoder, VideoDecoder diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 3f5082f69b0..cc0dde52979 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -4090,7 +4090,7 @@ def decode(self, enable: bool = True, num_threads: int = 0) -> "IterableDataset" * audio -> dict of "array" and "sampling_rate" and "path" * image -> PIL.Image - * video -> torchvision.io.VideoReader + * video -> torchcodec.decoders.VideoDecoder You can enable multithreading using `num_threads`. This is especially useful to speed up remote data streaming. However it can be slower than `num_threads=0` for local data on fast disks.