Source code for pytorch_forecasting.models.timexer._timexer_v2

"""
Time Series Transformer with eXogenous variables (TimeXer)
----------------------------------------------------------
"""

################################################################
# NOTE: This implementation of TimeXer derives from PR #1797.  #
# It is experimental and seeks to clarify design decisions.    #
# IT IS STRICTLY A PART OF THE v2 design of PTF. It overrides  #
# the v1 version introduced in PTF by PR #1797                  #
################################################################

from typing import Any, Optional, Union
import warnings as warn

import torch
import torch.nn as nn
from torch.optim import Optimizer

from pytorch_forecasting.models.base._tslib_base_model_v2 import TslibBaseModel



[docs]
class TimeXer(TslibBaseModel):
    """
    An implementation of TimeXer model for v2 of pytorch-forecasting.

    TimeXer empowers the canonical transformer with the ability to reconcile
    endogenous and exogenous information without any architectural modifications
    and achieves consistent state-of-the-art performance across twelve real-world
    forecasting benchmarks.

    TimeXer employs patch-level and variate-level representations respectively for
    endogenous and exogenous variables, with an endogenous global token as a bridge
    in-between. With this design, TimeXer can jointly capture intra-endogenous
    temporal dependencies and exogenous-to-endogenous correlations.

    Parameters
    ----------
    loss: nn.Module
        Loss function to use for training.
    enc_in: int, optional
        Number of input features for the encoder. If not provided, it will be set to
        the number of continuous features in the dataset.
    hidden_size: int, default=512
        Dimension of the model embeddings and hidden representations of features.
    n_heads: int, default=8
        Number of attention heads in the multi-head attention mechanism.
    e_layers: int, default=2
        Number of encoder layers in the transformer architecture.
    d_ff: int, default=2048
        Dimension of the feed-forward network in the transformer architecture.
    dropout: float, default=0.1
        Dropout rate for regularization. This is used throughout the model to prevent overfitting.
    patch_length: int, default=4
        Length of each non-overlapping patch for endogenous variable tokenization.
    factor: int, default=5
        Factor for the attention mechanism, controlling the number of keys and values.
    activation: str, default='relu'
        Activation function to use in the feed-forward network. Common choices are 'relu', 'gelu', etc.
    use_efficient_attention: bool, default=False
        If set to True, will use PyTorch's native, optimized Scaled Dot Product
        Attention implementation which can reduce computation time and memory
        consumption for longer sequences. PyTorch automatically selects the
        optimal backend (FlashAttention-2, Memory-Efficient Attention, or their
        own C++ implementation) based on user's input properties, hardware
        capabilities, and build configuration.
    logging_metrics: Optional[list[nn.Module]], default=None
        List of metrics to log during training, validation, and testing.
    optimizer: Optional[Union[Optimizer, str]], default='adam'
        Optimizer to use for training. Can be a string name or an instance of an optimizer.
    optimizer_params: Optional[dict], default=None
        Parameters for the optimizer. If None, default parameters for the optimizer will be used.
    lr_scheduler: Optional[str], default=None
        Learning rate scheduler to use. If None, no scheduler is used.
    lr_scheduler_params: Optional[dict], default=None
        Parameters for the learning rate scheduler. If None, default parameters for the scheduler will be used.
    metadata: Optional[dict], default=None
        Metadata for the model from TslibDataModule. This can include information about the dataset,
        such as the number of time steps, number of features, etc. It is used to initialize the model
        and ensure it is compatible with the data being used, including the split between endogenous
        (target) and exogenous covariates.

    References
    ----------
    [1] https://arxiv.org/abs/2402.19072
    [2] https://github.com/thuml/TimeXer

    Notes
    -----
    [1] This implementation handles only continuous variables in the context length. Categorical variables
        support will be added in the future.
    [2] The `TimeXer` model obtains many of its attributes from the `TslibBaseModel` class, which is a base class
        where a lot of the boiler plate code for metadata handling and model initialization is implemented.
    """  # noqa: E501

    @classmethod
    def _pkg(cls):
        """Return the package class associated with this model.

        The import is performed inside the method rather than at module level,
        so the package module is only loaded when this method is called.
        """
        from pytorch_forecasting.models.timexer import _timexer_pkg_v2 as pkg_module

        return pkg_module.TimeXer_pkg_v2


[docs]
    def __init__(
        self,
        loss: nn.Module,
        enc_in: int | None = None,
        hidden_size: int = 512,
        n_heads: int = 8,
        e_layers: int = 2,
        d_ff: int = 2048,
        dropout: float = 0.1,
        patch_length: int = 4,
        factor: int = 5,
        activation: str = "relu",
        use_efficient_attention: bool = False,
        logging_metrics: list[nn.Module] | None = None,
        optimizer: Optimizer | str | None = "adam",
        optimizer_params: dict | None = None,
        lr_scheduler: str | None = None,
        lr_scheduler_params: dict | None = None,
        metadata: dict | None = None,
        **kwargs: Any,
    ):
        """
        Store the hyperparameters and build the TimeXer network.

        The training-related arguments (``loss``, ``logging_metrics``,
        ``optimizer``, ``optimizer_params``, ``lr_scheduler``,
        ``lr_scheduler_params``, ``metadata``) are forwarded to the base class
        constructor. The architecture arguments are stored on the instance and
        consumed by ``_init_network``, which is called at the end of
        construction.

        ``**kwargs`` is accepted but is not forwarded to the base class by this
        constructor.

        A warning is always emitted on construction because the model is
        experimental.
        """
        super().__init__(
            loss=loss,
            logging_metrics=logging_metrics,
            optimizer=optimizer,
            optimizer_params=optimizer_params,
            lr_scheduler=lr_scheduler,
            lr_scheduler_params=lr_scheduler_params,
            metadata=metadata,
        )

        # Adjacent string literals are concatenated verbatim, so each fragment
        # must carry its own trailing space.
        warn.warn(
            "TimeXer is an experimental model implemented on TslibBaseModelV2. "
            "It is an unstable version and may be subject to unannounced changes. "
            "Please use with caution. Feedback on the design and implementation is "
            "welcome on issue #1833 - "
            "https://github.com/sktime/pytorch-forecasting/issues/1833",
        )

        self.enc_in = enc_in
        self.hidden_size = hidden_size
        self.n_heads = n_heads
        self.e_layers = e_layers
        self.d_ff = d_ff
        self.dropout = dropout
        self.patch_length = patch_length
        self.activation = activation
        self.use_efficient_attention = use_efficient_attention
        self.factor = factor
        # loss, logging_metrics and metadata are excluded from the saved
        # hyperparameters.
        self.save_hyperparameters(ignore=["loss", "logging_metrics", "metadata"])

        self._init_network()


    def _init_network(self) -> None:
        """
        Build the embedding, encoder and output-head submodules of TimeXer.

        Besides the hyperparameters stored by ``__init__``, this method reads
        ``context_length``, ``prediction_length``, ``target_dim``, ``features``,
        ``cont_dim`` and ``loss`` from the instance. None of these are assigned
        in this class; they are presumably provided by the base class from the
        metadata (TODO confirm).

        It sets ``patch_num``, ``n_target_vars``, ``enc_in``, ``n_quantiles``,
        ``en_embedding``, ``ex_embedding``, ``encoder``, ``head_nf`` and
        ``head``.

        Raises
        ------
        ValueError
            If ``context_length`` is not strictly greater than ``patch_length``,
            or if ``hidden_size`` is not divisible by ``n_heads``.

        Warns
        -----
        UserWarning
            If ``context_length`` is not a multiple of ``patch_length``.
        """

        from pytorch_forecasting.layers import (
            AttentionLayer,
            DataEmbedding_inverted,
            Encoder,
            EncoderLayer,
            EnEmbedding,
            FlattenHead,
            FullAttention,
        )

        if self.context_length <= self.patch_length:
            raise ValueError(
                f"Context length ({self.context_length}) must be greater than "
                f"patch length. Patches of ({self.patch_length}) will end up "
                "being longer than the sequence length."
            )

        if self.context_length % self.patch_length != 0:
            warn.warn(
                f"Context length ({self.context_length}) is not divisible by"
                " patch length. This may lead to unexpected behavior, as some "
                "time steps will not be used in the model."
            )

        # Number of whole patches that fit into the context window.
        self.patch_num = max(1, int(self.context_length // self.patch_length))

        if self.target_dim > 1 and self.features == "M":
            self.n_target_vars = self.target_dim
        else:
            self.n_target_vars = 1

        # currently enc_in is set only to cont_dim since
        # the data module doesn't fully support categorical
        # variables in the context length and the model expects
        # float values.
        self.enc_in = self.enc_in or self.cont_dim

        # Only losses exposing a non-None ``quantiles`` attribute get a
        # quantile-aware head.
        quantiles = getattr(self.loss, "quantiles", None)
        self.n_quantiles = len(quantiles) if quantiles is not None else None

        if self.hidden_size % self.n_heads != 0:
            raise ValueError(
                f"hidden_size ({self.hidden_size}) must be divisible by n_heads ({self.n_heads}) "  # noqa: E501
                f"for multi-head attention mechanism to work properly."
            )

        self.en_embedding = EnEmbedding(
            self.n_target_vars, self.hidden_size, self.patch_length, self.dropout
        )

        self.ex_embedding = DataEmbedding_inverted(
            self.context_length, self.hidden_size, self.dropout
        )

        def _build_attention() -> AttentionLayer:
            # Every attention block in the encoder uses the same configuration.
            # NOTE(review): the leading positional ``False`` is presumably the
            # masking flag of FullAttention - confirm against its signature.
            return AttentionLayer(
                FullAttention(
                    False,
                    self.factor,
                    attention_dropout=self.dropout,
                    output_attention=False,
                    use_efficient_attention=self.use_efficient_attention,
                ),
                self.hidden_size,
                self.n_heads,
            )

        # Each encoder layer receives two independently constructed attention
        # blocks (presumably self- and cross-attention - TODO confirm against
        # EncoderLayer). Modules are created in the same order as before so
        # parameter initialisation is unchanged for a fixed random seed.
        encoder_layers = [
            EncoderLayer(
                _build_attention(),
                _build_attention(),
                self.hidden_size,
                self.d_ff,
                dropout=self.dropout,
                activation=self.activation,
            )
            for _ in range(self.e_layers)
        ]

        self.encoder = Encoder(
            encoder_layers, norm_layer=torch.nn.LayerNorm(self.hidden_size)
        )

        # Initialize output head. The "+ 1" accounts for one extra token per
        # variable in addition to the patch tokens (presumably the global
        # token described in the class docstring - TODO confirm).
        self.head_nf = self.hidden_size * (self.patch_num + 1)
        self.head = FlattenHead(
            self.enc_in,
            self.head_nf,
            self.prediction_length,
            head_dropout=self.dropout,
            n_quantiles=self.n_quantiles,
        )

    def _forecast(self, x: dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Run the embedding, encoder and head stages of TimeXer.

        Parameters
        ----------
        x : dict[str, torch.Tensor]
            Input data. ``"history_cont"`` is required. ``"history_target"``
            and ``"history_time_idx"`` are optional; when ``"history_target"``
            is absent, a zero tensor of shape
            ``(batch_size, context_length, 1)`` is used instead.

        Returns
        -------
        torch.Tensor
            Raw output of the prediction head. ``forward`` slices it along
            dimension 1 to ``prediction_length``.
        """
        history_cont = x["history_cont"]
        batch_size = history_cont.shape[0]
        history_time_idx = x.get("history_time_idx", None)

        # Allocate the zero placeholder only when the target history is really
        # missing, instead of building it on every call as a ``dict.get``
        # default.
        if "history_target" in x:
            history_target = x["history_target"]
        else:
            history_target = torch.zeros(
                batch_size, self.context_length, 1, device=self.device
            )

        if history_time_idx is not None and history_time_idx.dim() == 2:
            # change [batch_size, time_steps] to [batch_size, time_steps, features]
            history_time_idx = history_time_idx.unsqueeze(-1)

        # v2 convention:
        # - endogenous information comes from the target history
        # - exogenous information comes from all continuous covariates
        endogenous_cont = history_target
        exogenous_cont = history_cont

        en_embed, n_vars = self.en_embedding(endogenous_cont)
        ex_embed = self.ex_embedding(exogenous_cont, history_time_idx)

        enc_out = self.encoder(en_embed, ex_embed)

        # Regroup the encoder output by variable, then swap the last two axes
        # before handing it to the head.
        enc_out = torch.reshape(
            enc_out, (-1, n_vars, enc_out.shape[-2], enc_out.shape[-1])
        )

        enc_out = enc_out.permute(0, 1, 3, 2)

        dec_out = self.head(enc_out)

        return dec_out

    def forward(self, x: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
        """
        Compute predictions for a batch.

        Parameters
        ----------
        x : dict[str, torch.Tensor]
            Input data. If it contains the key ``"target_scale"``, the
            prediction is passed through ``transform_output`` together with
            that entry.

        Returns
        -------
        dict[str, torch.Tensor]
            Dictionary with the single key ``"prediction"``.
        """
        raw_forecast = self._forecast(x)
        horizon = self.prediction_length

        # Keep at most ``prediction_length`` entries along dimension 1.
        prediction = raw_forecast[:, :horizon, :]

        if "target_scale" not in x:
            return {"prediction": prediction}

        rescaled = self.transform_output(prediction, x["target_scale"])
        return {"prediction": rescaled}