huggingface · Cyrilvallez · Apr 16, 2025 · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -1059,6 +1059,8 @@
         title: PatchTST
       - local: model_doc/time_series_transformer
         title: Time Series Transformer
+      - local: model_doc/timesfm
+        title: TimesFM
       title: Time series models
     - sections:
       - local: model_doc/graphormer

diff --git a/docs/source/en/model_doc/timesfm.md b/docs/source/en/model_doc/timesfm.md
@@ -0,0 +1,88 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# TimesFM
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+TimesFM (Time Series Foundation Model) is a pretrained time-series foundation model proposed in [A decoder-only foundation model for time-series forecasting](https://huggingface.co/papers/2310.10688) by Abhimanyu Das, Weihao Kong, Rajat Sen, and  Yichen Zhou. It is a decoder only model that uses non-overlapping patches of time-series data as input and outputs some output patch length prediction in an autoregressive fashion.
+
+
+The abstract from the paper is the following:
+
+*Motivated by recent advances in large language models for Natural Language Processing (NLP), we design a time-series foundation model for forecasting whose out-of-the-box zero-shot performance on a variety of public datasets comes close to the accuracy of state-of-the-art supervised forecasting models for each individual dataset. Our model is based on pretraining a patched-decoder style attention model on a large time-series corpus, and can work well across different forecasting history lengths, prediction lengths and temporal granularities.*
+
+
+This model was contributed by [kashif](https://huggingface.co/kashif).
+The original code can be found [here](https://github.com/google-research/timesfm).
+
+
+To use the model:
+
+```python
+import torch
+from transformers import TimesFmModelForPrediction
+
+
+model = TimesFmModelForPrediction.from_pretrained(
+    "google/timesfm-2.0-500m-pytorch",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="sdpa",
+    device_map="cuda" if torch.cuda.is_available() else None
+)
+
+
+ # Create dummy inputs
+forecast_input = [
+    np.sin(np.linspace(0, 20, 100)),
+    np.sin(np.linspace(0, 20, 200)),
+    np.sin(np.linspace(0, 20, 400)),
+]
+frequency_input = [0, 1, 2]
+
+# Convert inputs to sequence of tensors
+forecast_input_tensor = [
+    torch.tensor(ts, dtype=torch.bfloat16).to("cuda" if torch.cuda.is_available() else "cpu")
+    for ts in forecast_input
+]
+frequency_input_tensor = torch.tensor(frequency_input, dtype=torch.long).to(
+    "cuda" if torch.cuda.is_available() else "cpu"
+)
+
+# Get predictions from the pre-trained model
+with torch.no_grad():
+    outputs = model(past_values=forecast_input_tensor, freq=frequency_input_tensor, return_dict=True)
+    point_forecast_conv = outputs.mean_predictions.float().cpu().numpy()
+    quantile_forecast_conv = outputs.full_predictions.float().cpu().numpy()
+```
+
+## TimesFmConfig
+
+[[autodoc]] TimesFmConfig
+
+## TimesFmModel
+
+[[autodoc]] TimesFmModel
+    - forward
+
+## TimesFmModelForPrediction
+
+[[autodoc]] TimesFmModelForPrediction
+    - forward
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
@@ -279,6 +279,7 @@
     from .tapas import *
     from .textnet import *
     from .time_series_transformer import *
+    from .timesfm import *
     from .timesformer import *
     from .timm_backbone import *
     from .timm_wrapper import *

diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -313,6 +313,7 @@
         ("tapas", "TapasConfig"),
         ("textnet", "TextNetConfig"),
         ("time_series_transformer", "TimeSeriesTransformerConfig"),
+        ("timesfm", "TimesFmConfig"),
         ("timesformer", "TimesformerConfig"),
         ("timm_backbone", "TimmBackboneConfig"),
         ("timm_wrapper", "TimmWrapperConfig"),
@@ -681,6 +682,7 @@
         ("tapex", "TAPEX"),
         ("textnet", "TextNet"),
         ("time_series_transformer", "Time Series Transformer"),
+        ("timesfm", "TimesFm"),
         ("timesformer", "TimeSformer"),
         ("timm_backbone", "TimmBackbone"),
         ("timm_wrapper", "TimmWrapperModel"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -281,6 +281,7 @@
         ("tapas", "TapasModel"),
         ("textnet", "TextNetModel"),
         ("time_series_transformer", "TimeSeriesTransformerModel"),
+        ("timesfm", "TimesFmModel"),
         ("timesformer", "TimesformerModel"),
         ("timm_backbone", "TimmBackbone"),
         ("timm_wrapper", "TimmWrapperModel"),
@@ -1542,6 +1543,12 @@
     ]
 )
 
+MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = OrderedDict(
+    [
+        ("timesfm", "TimesFmModelForPrediction"),
+    ]
+)
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = OrderedDict(
     [
         ("swin2sr", "Swin2SRForImageSuperResolution"),
@@ -1650,6 +1657,10 @@
     CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES
 )
 
+MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES
+)
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)
 
 
@@ -1820,6 +1831,15 @@ class AutoModelForSemanticSegmentation(_BaseAutoModelClass):
 )
 
 
+class AutoModelForTimeSeriesPrediction(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING
+
+
+AutoModelForTimeSeriesPrediction = auto_class_update(
+    AutoModelForTimeSeriesPrediction, head_doc="time-series prediction"
+)
+
+
 class AutoModelForUniversalSegmentation(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING
 
@@ -1994,6 +2014,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
     "MODEL_FOR_TEXT_ENCODING_MAPPING",
     "MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING",
     "MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING",
+    "MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING",
     "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
     "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
     "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",

diff --git a/src/transformers/models/timesfm/__init__.py b/src/transformers/models/timesfm/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_timesfm import *
+    from .modeling_timesfm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/src/transformers/models/timesfm/configuration_timesfm.py b/src/transformers/models/timesfm/configuration_timesfm.py
@@ -0,0 +1,129 @@
+# coding=utf-8
+# Copyright 2025 Google LLC and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TimesFM model configuration"""
+
+from typing import List
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TimesFmConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`TimesFmModelForPrediction`] or a [`TFTimesFmModel`]. It is used to
+    instantiate a TimesFM model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the TimesFM
+    [google/timesfm-2.0-500m-pytorch](https://huggingface.co/google/timesfm-2.0-500m-pytorch) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Arguments:
+        patch_length (`int`, *optional*, defaults to 32):
+            The length of one patch in the input sequence.
+        context_length (`int`, *optional*, defaults to 512):
+            The length of the input context.
+        horizon_length (`int`, *optional*, defaults to 128):
+            The length of the prediction horizon.
+        freq_size (`int`, *optional*, defaults to 3):
+            The number of frequency embeddings.
+        num_hidden_layers (`int`, *optional*, defaults to 50):
+            Number of Transformer layers.
+        hidden_size (`int`, *optional*, defaults to 1280):
+            Size of the hidden layers in the feed-forward networks.
+        intermediate_size (`int`, *optional*, defaults to 1280):
+            Dimension of the MLP representations.
+        head_dim (`int`, *optional*, defaults to 80):
+            Size of the key, query, value projections per attention head. The `inner_dim` of the projection layer will
+            be defined as `num_attention_heads * head_dim`.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        tolerance (`float`, *optional*, defaults to 1e-06):
+            The tolerance for the quantile loss.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the RMS normalization layers.
+        quantiles (`List[float]`, *optional*, defaults to `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]`):
+            The quantiles to predict.
+        pad_val (`float`, *optional*, defaults to 1123581321.0):
+            The value used to pad the predictions.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention scores.
+        use_positional_embedding (`bool`, *optional*, defaults to `False`):
+            Whether to add positional embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        min_timescale (`int`, *optional*, defaults to 1):
+            The start of the geometric positional index. Determines the periodicity of
+            the added signal.
+        max_timescale (`int`, *optional*, defaults to 10000):
+            The end of the geometric positional index. Determines the frequency of the
+            added signal.
+    """
+
+    model_type = "timesfm"
+    keys_to_ignore_at_inference = []
+    is_encoder_decoder = False
+
+    def __init__(
+        self,
+        patch_length: int = 32,
+        context_length: int = 512,
+        horizon_length: int = 128,
+        freq_size: int = 3,
+        num_hidden_layers: int = 50,
+        hidden_size: int = 1280,
+        intermediate_size: int = 1280,
+        head_dim: int = 80,
+        num_attention_heads: int = 16,
+        tolerance: float = 1e-6,
+        rms_norm_eps: float = 1e-6,
+        quantiles: List[float] = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
+        pad_val: float = 1123581321.0,
+        attention_dropout: float = 0.0,
+        use_positional_embedding: bool = False,
+        initializer_range: float = 0.02,
+        min_timescale: int = 1,
+        max_timescale: int = 10_000,
+        **kwargs,
+    ):
+        self.patch_length = patch_length
+        self.context_length = context_length
+        self.horizon_length = horizon_length
+        self.quantiles = quantiles
+        self.pad_val = pad_val
+        self.freq_size = freq_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.head_dim = head_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.tolerance = tolerance
+        self.rms_norm_eps = rms_norm_eps
+        self.attention_dropout = attention_dropout
+        self.use_positional_embedding = use_positional_embedding
+        self.initializer_range = initializer_range
+        self.min_timescale = min_timescale
+        self.max_timescale = max_timescale
+
+        super().__init__(
+            is_encoder_decoder=self.is_encoder_decoder,
+            **kwargs,
+        )
+
+
+__all__ = ["TimesFmConfig"]