# Source code for pytorch_forecasting.data.examples
"""
Example datasets for tutorials and testing.
"""
from pathlib import Path
import numpy as np
import pandas as pd
import requests
# URL prefix for the example data files; a file name is appended to it to build the download URL.
BASE_URL = "https://github.com/jdb78/pytorch-forecasting/raw/master/examples/data/"
# Directory containing this module; downloaded files are stored (and looked up) here.
DATA_PATH = Path(__file__).parent
def _get_data_by_filename(fname: str) -> Path:
    """
    Download file or use cached version.

    The file is looked up in ``DATA_PATH``. If it does not exist there yet, it is
    fetched from ``BASE_URL + fname`` and stored in ``DATA_PATH`` so that
    subsequent calls reuse the local copy.

    Args:
        fname (str): name of file to download

    Returns:
        Path: path at which file lives

    Raises:
        requests.HTTPError: if the server responds with an error status code
        requests.RequestException: on connection failures or timeouts
    """
    full_fname = DATA_PATH.joinpath(fname)

    # check if file exists - download if necessary
    if not full_fname.exists():
        url = BASE_URL + fname
        # timeout prevents hanging forever on an unresponsive server
        download = requests.get(url, allow_redirects=True, timeout=60)
        # fail loudly on 4xx/5xx instead of caching the error page as if it were the dataset
        download.raise_for_status()

        # write to a temporary sibling file and rename afterwards, so that an interrupted
        # write never leaves a truncated file that later calls would treat as cached
        partial_fname = full_fname.with_name(full_fname.name + ".part")
        partial_fname.write_bytes(download.content)
        partial_fname.replace(full_fname)

    return full_fname
def get_stallion_data() -> pd.DataFrame:
    """
    Load the stallion demand dataset (demand data with covariates).

    ~20k samples of 350 timeseries. Important columns

    * Timeseries can be identified by ``agency`` and ``sku``.
    * ``volume`` is the demand
    * ``date`` is the month of the demand.

    The underlying ``stallion.parquet`` file is resolved through
    ``_get_data_by_filename`` and then read with pandas.

    Returns:
        pd.DataFrame: data
    """
    return pd.read_parquet(_get_data_by_filename("stallion.parquet"))
def generate_ar_data(
    n_series: int = 10,
    timesteps: int = 400,
    seasonality: float = 3.0,
    trend: float = 3.0,
    noise: float = 0.1,
    level: float = 1.0,
    exp: bool = False,
    seed: int = 213,
) -> pd.DataFrame:
    """
    Generate multivariate data without covariates.

    Each timeseries is generated from seasonality and trend. Important columns:

    * ``series``: series ID
    * ``time_idx``: time index
    * ``value``: target value

    Random numbers are drawn from a generator private to this call, so NumPy's
    global random state is left untouched. The values are the same as those
    obtained after ``np.random.seed(seed)``.

    Args:
        n_series (int, optional): Number of series. Defaults to 10.
        timesteps (int, optional): Number of timesteps. Defaults to 400.
        seasonality (float, optional): Normalized frequency, i.e. frequency is ``seasonality / timesteps``.
            Defaults to 3.0.
        trend (float, optional): Trend multiplier (seasonality is multiplied with 1.0). Defaults to 3.0.
        noise (float, optional): Level of gaussian noise. The noise is multiplicative, i.e. each value is
            scaled by ``1 + noise * N(0, 1)``. Defaults to 0.1.
        level (float, optional): Level multiplier. Each series (trend plus seasonality) is scaled by
            ``level`` times a per-series standard normal factor. Defaults to 1.0.
        exp (bool, optional): If to return exponential of timeseries values. Defaults to False.
        seed (int, optional): Random seed. Defaults to 213.

    Returns:
        pd.DataFrame: data in long format with one row per series and time step
    """
    # private generator: yields the same stream as ``np.random.seed(seed)`` followed by
    # ``np.random.normal`` calls, but without reseeding the global state as a side effect
    rng = np.random.RandomState(seed)

    # sample parameters - one value per series, kept as column vectors for broadcasting
    linear_trends = rng.normal(size=n_series)[:, None] / timesteps
    quadratic_trends = rng.normal(size=n_series)[:, None] / timesteps**2
    seasonalities = rng.normal(size=n_series)[:, None]
    levels = level * rng.normal(size=n_series)[:, None]

    # generate series: (linear + quadratic trend) * trend + sinusoidal seasonality
    x = np.arange(timesteps)[None, :]
    series = (x * linear_trends + x**2 * quadratic_trends) * trend + seasonalities * np.sin(
        2 * np.pi * seasonality * x / timesteps
    )

    # scale by the per-series level and apply multiplicative noise
    series = levels * series * (1 + noise * rng.normal(size=series.shape))
    if exp:
        series = np.exp(series)

    # insert into dataframe: rows of the array become ``series``, columns become ``time_idx``
    data = (
        pd.DataFrame(series)
        .stack()
        .reset_index()
        .rename(columns={"level_0": "series", "level_1": "time_idx", 0: "value"})
    )
    return data