From 81c4e2a5beb0b58b2bfbc2ecef2f97cc42f2cc09 Mon Sep 17 00:00:00 2001
From: Samuel Stanton
Date: Wed, 7 Aug 2024 15:11:52 -0400
Subject: [PATCH 1/3] remove torchtext dep

---
 cortex/model/root/_conv1d_root.py   |  3 +--
 cortex/transforms/__init__.py       |  2 ++
 cortex/transforms/_pad_transform.py | 33 +++++++++++++++++++++++++++++
 cortex/transforms/_to_tensor.py     | 31 +++++++++++++++++++++++++++
 requirements.in                     |  1 -
 requirements.txt                    |  6 ------
 6 files changed, 67 insertions(+), 9 deletions(-)
 create mode 100644 cortex/transforms/_pad_transform.py
 create mode 100644 cortex/transforms/_to_tensor.py

diff --git a/cortex/model/root/_conv1d_root.py b/cortex/model/root/_conv1d_root.py
index 046dd54..b79f157 100644
--- a/cortex/model/root/_conv1d_root.py
+++ b/cortex/model/root/_conv1d_root.py
@@ -6,13 +6,12 @@
 import numpy as np
 import torch
 from torch import LongTensor, nn
-from torchtext.transforms import PadTransform, ToTensor
 
 from cortex.corruption import CorruptionProcess, GaussianCorruptionProcess, MaskCorruptionProcess
 from cortex.model.block import Conv1dResidBlock
 from cortex.model.elemental import Apply, Expression, SinePosEncoder, permute_spatial_channel_dims
 from cortex.model.root import RootNode, RootNodeOutput
-from cortex.transforms import HuggingFaceTokenizerTransform
+from cortex.transforms import HuggingFaceTokenizerTransform, PadTransform, ToTensor
 
 
 @dataclass
diff --git a/cortex/transforms/__init__.py b/cortex/transforms/__init__.py
index 3e32e6a..26a7713 100644
--- a/cortex/transforms/__init__.py
+++ b/cortex/transforms/__init__.py
@@ -1,2 +1,4 @@
 from ._hf_tokenizer_transform import HuggingFaceTokenizerTransform
+from ._pad_transform import PadTransform
+from ._to_tensor import ToTensor
 from ._transform import Transform
diff --git a/cortex/transforms/_pad_transform.py b/cortex/transforms/_pad_transform.py
new file mode 100644
index 0000000..690bf59
--- /dev/null
+++ b/cortex/transforms/_pad_transform.py
@@ -0,0 +1,33 @@
+# copied from https://github.com/pytorch/text/blob/main/torchtext/transforms.py
+# torchtext is no longer maintained and is incompatible with torch >= 2.4
+import torch
+from torch import Tensor
+from torch.nn import Module
+
+
+class PadTransform(Module):
+    """Pad tensor to a fixed length with given padding value.
+
+    :param max_length: Maximum length to pad to
+    :type max_length: int
+    :param pad_value: Value to pad the tensor with
+    :type pad_value: int
+    """
+
+    def __init__(self, max_length: int, pad_value: int) -> None:
+        super().__init__()
+        self.max_length = max_length
+        self.pad_value = float(pad_value)
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        :param x: The tensor to pad
+        :type x: Tensor
+        :return: Tensor padded up to max_length with pad_value
+        :rtype: Tensor
+        """
+        max_encoded_length = x.size(-1)
+        if max_encoded_length < self.max_length:
+            pad_amount = self.max_length - max_encoded_length
+            x = torch.nn.functional.pad(x, (0, pad_amount), value=self.pad_value)
+        return x
diff --git a/cortex/transforms/_to_tensor.py b/cortex/transforms/_to_tensor.py
new file mode 100644
index 0000000..fda1578
--- /dev/null
+++ b/cortex/transforms/_to_tensor.py
@@ -0,0 +1,31 @@
+# copied from https://github.com/pytorch/text/blob/main/torchtext/transforms.py
+# torchtext is no longer maintained and is incompatible with torch >= 2.4
+from typing import Any, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+from torch.nn import Module
+
+
+class ToTensor(Module):
+    r"""Convert input to torch tensor
+
+    :param padding_value: Pad value to make each input in the batch of length equal to the longest sequence in the batch.
+    :type padding_value: Optional[int]
+    :param dtype: :class:`torch.dtype` of output tensor
+    :type dtype: :class:`torch.dtype`
+    """
+
+    def __init__(self, padding_value: Optional[int] = None, dtype: torch.dtype = torch.long) -> None:
+        super().__init__()
+        self.padding_value = padding_value
+        self.dtype = dtype
+
+    def forward(self, input: Any) -> Tensor:
+        """
+        :param input: Sequence or batch of token ids
+        :type input: Union[List[int], List[List[int]]]
+        :rtype: Tensor
+        """
+        return F.to_tensor(input, padding_value=self.padding_value, dtype=self.dtype)
diff --git a/requirements.in b/requirements.in
index 21973fb..8dd7717 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,4 @@
 torch
-torchtext
 torchvision
 pytorch_warmup
 botorch >= 0.9.4
diff --git a/requirements.txt b/requirements.txt
index 57828cd..90f1408 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -164,7 +164,6 @@ numpy==1.26.1
     #   scikit-learn
     #   scipy
     #   torchmetrics
-    #   torchtext
     #   torchvision
     #   transformers
 nvidia-cublas-cu12==12.1.3.1
@@ -275,7 +274,6 @@ requests==2.31.0
     #   lightning
     #   lightning-cloud
     #   torchdata
-    #   torchtext
     #   torchvision
     #   transformers
     #   wandb
@@ -340,7 +338,6 @@ torch==2.1.0
     #   pytorch-warmup
     #   torchdata
     #   torchmetrics
-    #   torchtext
     #   torchvision
 torchdata==0.7.0
     # via torchtext
@@ -348,8 +345,6 @@ torchmetrics==1.3.1
     # via
     #   lightning
     #   pytorch-lightning
-torchtext==0.16.0
-    # via -r requirements.in
 torchvision==0.16.0
     # via -r requirements.in
 tqdm==4.66.1
@@ -357,7 +352,6 @@ tqdm==4.66.1
     #   lightning
     #   pyro-ppl
     #   pytorch-lightning
-    #   torchtext
     #   transformers
 traitlets==5.14.1
     # via lightning

From b900c0c4356f02b9bb75762fd29f3ca9078854f8 Mon Sep 17 00:00:00 2001
From: Samuel Stanton
Date: Wed, 7 Aug 2024 15:29:59 -0400
Subject: [PATCH 2/3] remove torchtext dep

---
 cortex/transforms/functional/__init__.py   |  2 ++
 cortex/transforms/functional/_to_tensor.py | 31 ++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 cortex/transforms/functional/_to_tensor.py

diff --git a/cortex/transforms/functional/__init__.py b/cortex/transforms/functional/__init__.py
index c97c173..dcc1ffc 100644
--- a/cortex/transforms/functional/__init__.py
+++ b/cortex/transforms/functional/__init__.py
@@ -1,5 +1,7 @@
+from ._to_tensor import to_tensor
 from ._tokenize_igg_ag_df import tokenize_igg_ag_df
 
 __all__ = [
+    "to_tensor",
     "tokenize_igg_ag_df",
 ]
diff --git a/cortex/transforms/functional/_to_tensor.py b/cortex/transforms/functional/_to_tensor.py
new file mode 100644
index 0000000..62b8ff0
--- /dev/null
+++ b/cortex/transforms/functional/_to_tensor.py
@@ -0,0 +1,31 @@
+from typing import Any, Optional
+
+import torch
+from torch import Tensor
+from torch.nn.utils.rnn import pad_sequence
+
+
+def to_tensor(input: Any, padding_value: Optional[int] = None, dtype: torch.dtype = torch.long) -> Tensor:
+    r"""Convert input to torch tensor
+
+    :param input: Sequence or batch of token ids
+    :type input: Union[list[int], list[list[int]]]
+    :param padding_value: Pad value to make each input in the batch of length equal to the longest sequence in the batch.
+    :type padding_value: Optional[int]
+    :param dtype: :class:`torch.dtype` of output tensor
+    :type dtype: :class:`torch.dtype`
+    :rtype: Tensor
+    """
+    if torch.jit.isinstance(input, list[int]):
+        return torch.tensor(input, dtype=dtype)
+    elif torch.jit.isinstance(input, list[list[int]]):
+        if padding_value is None:
+            output = torch.tensor(input, dtype=dtype)
+            return output
+        else:
+            output = pad_sequence(
+                [torch.tensor(ids, dtype=dtype) for ids in input], batch_first=True, padding_value=float(padding_value)
+            )
+            return output
+    else:
+        raise TypeError("Input type not supported")

From e183ba7f08aecee462ad124189b5a23dfc936f9a Mon Sep 17 00:00:00 2001
From: Samuel Stanton
Date: Wed, 7 Aug 2024 15:30:41 -0400
Subject: [PATCH 3/3] remove torchtext dep

---
 cortex/transforms/_to_tensor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cortex/transforms/_to_tensor.py b/cortex/transforms/_to_tensor.py
index fda1578..dabea94 100644
--- a/cortex/transforms/_to_tensor.py
+++ b/cortex/transforms/_to_tensor.py
@@ -3,10 +3,11 @@
 from typing import Any, Optional
 
 import torch
-import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Module
 
+from cortex.transforms.functional import to_tensor
+
 
 class ToTensor(Module):
     r"""Convert input to torch tensor
@@ -28,4 +29,4 @@ def forward(self, input: Any) -> Tensor:
         :type input: Union[List[int], List[List[int]]]
         :rtype: Tensor
         """
-        return F.to_tensor(input, padding_value=self.padding_value, dtype=self.dtype)
+        return to_tensor(input, padding_value=self.padding_value, dtype=self.dtype)
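
Usage sketch (illustrative, not part of the patch series): with all three
patches applied, the vendored PadTransform and ToTensor are intended as
drop-in replacements for their torchtext counterparts. The token ids,
max_length, and pad values below are made up for illustration.

    import torch

    from cortex.transforms import PadTransform, ToTensor

    # A hypothetical ragged batch of token ids, e.g. from a tokenizer
    # invoked without padding.
    token_ids = [[5, 9, 2], [7, 4]]

    # ToTensor pads the ragged batch to the longest sequence with
    # padding_value and returns a LongTensor of shape (2, 3).
    batch = ToTensor(padding_value=0)(token_ids)
    # tensor([[5, 9, 2],
    #         [7, 4, 0]])

    # PadTransform then right-pads the last dimension out to a fixed
    # width, e.g. the model's expected sequence length.
    padded = PadTransform(max_length=6, pad_value=0)(batch)
    assert padded.shape == (2, 6)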