forecaster.py (prophet-1.1) | forecaster.py (prophet-1.1.1)
---|---
skipping to change at line 13 | skipping to change at line 13 | |||
# This source code is licensed under the MIT license found in the | # This source code is licensed under the MIT license found in the | |||
# LICENSE file in the root directory of this source tree. | # LICENSE file in the root directory of this source tree. | |||
from __future__ import absolute_import, division, print_function | from __future__ import absolute_import, division, print_function | |||
import logging | import logging | |||
from collections import OrderedDict, defaultdict | from collections import OrderedDict, defaultdict | |||
from copy import deepcopy | from copy import deepcopy | |||
from datetime import timedelta, datetime | from datetime import timedelta, datetime | |||
from typing import Dict, List | ||||
import numpy as np | import numpy as np | |||
import pandas as pd | import pandas as pd | |||
from prophet.make_holidays import get_holiday_names, make_holidays_df | from prophet.make_holidays import get_holiday_names, make_holidays_df | |||
from prophet.models import StanBackendEnum | from prophet.models import StanBackendEnum | |||
from prophet.plot import (plot, plot_components) | from prophet.plot import (plot, plot_components) | |||
logger = logging.getLogger('prophet') | logger = logging.getLogger('prophet') | |||
logger.setLevel(logging.INFO) | logger.setLevel(logging.INFO) | |||
skipping to change at line 160 | skipping to change at line 161 | |||
else: | else: | |||
self.stan_backend = StanBackendEnum.get_backend_class(stan_backend)() | self.stan_backend = StanBackendEnum.get_backend_class(stan_backend)()
logger.debug("Loaded stan backend: %s", self.stan_backend.get_type()) | logger.debug("Loaded stan backend: %s", self.stan_backend.get_type()) | |||
def validate_inputs(self): | def validate_inputs(self): | |||
"""Validates the inputs to Prophet.""" | """Validates the inputs to Prophet.""" | |||
if self.growth not in ('linear', 'logistic', 'flat'): | if self.growth not in ('linear', 'logistic', 'flat'): | |||
raise ValueError( | raise ValueError( | |||
'Parameter "growth" should be "linear", "logistic" or "flat".') | 'Parameter "growth" should be "linear", "logistic" or "flat".') | |||
if not isinstance(self.changepoint_range, (int, float)):
raise ValueError("changepoint_range must be a number in [0, 1]")
if ((self.changepoint_range < 0) or (self.changepoint_range > 1)): | if ((self.changepoint_range < 0) or (self.changepoint_range > 1)): | |||
raise ValueError('Parameter "changepoint_range" must be in [0, 1]') | raise ValueError('Parameter "changepoint_range" must be in [0, 1]') | |||
if self.holidays is not None: | if self.holidays is not None: | |||
if not ( | if not ( | |||
isinstance(self.holidays, pd.DataFrame) | isinstance(self.holidays, pd.DataFrame) | |||
and 'ds' in self.holidays # noqa W503 | and 'ds' in self.holidays # noqa W503 | |||
and 'holiday' in self.holidays # noqa W503 | and 'holiday' in self.holidays # noqa W503 | |||
): | ): | |||
raise ValueError('holidays must be a DataFrame with "ds" and ' | raise ValueError('holidays must be a DataFrame with "ds" and ' | |||
'"holiday" columns.') | '"holiday" columns.') | |||
skipping to change at line 1176 | skipping to change at line 1179 | |||
self.params[par] = np.array([self.params[par]]) | self.params[par] = np.array([self.params[par]]) | |||
elif self.mcmc_samples > 0: | elif self.mcmc_samples > 0: | |||
self.params = self.stan_backend.sampling(stan_init, dat, self.mcmc_samples, **kwargs) | self.params = self.stan_backend.sampling(stan_init, dat, self.mcmc_samples, **kwargs)
else: | else: | |||
self.params = self.stan_backend.fit(stan_init, dat, **kwargs) | self.params = self.stan_backend.fit(stan_init, dat, **kwargs) | |||
self.stan_fit = self.stan_backend.stan_fit | self.stan_fit = self.stan_backend.stan_fit | |||
# If no changepoints were requested, replace delta with 0s | # If no changepoints were requested, replace delta with 0s | |||
if len(self.changepoints) == 0: | if len(self.changepoints) == 0: | |||
# Fold delta into the base rate k | # Fold delta into the base rate k | |||
self.params['k'] = (self.params['k'] | self.params['k'] = ( | |||
+ self.params['delta'].reshape(-1)) | self.params['k'] + self.params['delta'].reshape(-1) | |||
) | ||||
self.params['delta'] = (np.zeros(self.params['delta'].shape) | self.params['delta'] = (np.zeros(self.params['delta'].shape) | |||
.reshape((-1, 1))) | .reshape((-1, 1))) | |||
return self | return self | |||
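The fold above only fires when no changepoints were requested, in which case the model was fit with a single dummy changepoint whose delta can be absorbed into the base rate. A standalone sketch of the array manipulation, with made-up MAP-shaped parameters:

    import numpy as np

    # Assumed shapes for a MAP fit with one dummy changepoint: k is (1,), delta is (1, 1).
    params = {'k': np.array([0.5]), 'delta': np.array([[0.1]])}
    params['k'] = params['k'] + params['delta'].reshape(-1)             # array([0.6])
    params['delta'] = np.zeros(params['delta'].shape).reshape((-1, 1))  # [[0.]]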
def predict(self, df=None): | def predict(self, df: pd.DataFrame = None, vectorized: bool = True) -> pd.DataFrame:
"""Predict using the prophet model. | """Predict using the prophet model. | |||
Parameters | Parameters | |||
---------- | ---------- | |||
df: pd.DataFrame with dates for predictions (column ds), and capacity | df: pd.DataFrame with dates for predictions (column ds), and capacity | |||
(column cap) if logistic growth. If not provided, predictions are | (column cap) if logistic growth. If not provided, predictions are | |||
made on the history. | made on the history. | |||
vectorized: Whether to use a vectorized method to compute uncertainty intervals. Suggest using
    True (the default) for much faster runtimes in most cases,
    except when (growth = 'logistic' and mcmc_samples > 0).
Returns | Returns | |||
------- | ------- | |||
A pd.DataFrame with the forecast components. | A pd.DataFrame with the forecast components. | |||
""" | """ | |||
if self.history is None: | if self.history is None: | |||
raise Exception('Model has not been fit.') | raise Exception('Model has not been fit.') | |||
if df is None: | if df is None: | |||
df = self.history.copy() | df = self.history.copy() | |||
else: | else: | |||
if df.shape[0] == 0: | if df.shape[0] == 0: | |||
raise ValueError('Dataframe has no rows.') | raise ValueError('Dataframe has no rows.') | |||
df = self.setup_dataframe(df.copy()) | df = self.setup_dataframe(df.copy()) | |||
df['trend'] = self.predict_trend(df) | df['trend'] = self.predict_trend(df) | |||
seasonal_components = self.predict_seasonal_components(df) | seasonal_components = self.predict_seasonal_components(df) | |||
if self.uncertainty_samples: | if self.uncertainty_samples: | |||
intervals = self.predict_uncertainty(df) | intervals = self.predict_uncertainty(df, vectorized) | |||
else: | else: | |||
intervals = None | intervals = None | |||
# Drop columns except ds, cap, floor, and trend | # Drop columns except ds, cap, floor, and trend | |||
cols = ['ds', 'trend'] | cols = ['ds', 'trend'] | |||
if 'cap' in df: | if 'cap' in df: | |||
cols.append('cap') | cols.append('cap') | |||
if self.logistic_floor: | if self.logistic_floor: | |||
cols.append('floor') | cols.append('floor') | |||
# Add in forecast components | # Add in forecast components | |||
skipping to change at line 1243 | skipping to change at line 1250 | |||
t: np.array of times on which the function is evaluated. | t: np.array of times on which the function is evaluated. | |||
deltas: np.array of rate changes at each changepoint. | deltas: np.array of rate changes at each changepoint. | |||
k: Float initial rate. | k: Float initial rate. | |||
m: Float initial offset. | m: Float initial offset. | |||
changepoint_ts: np.array of changepoint times. | changepoint_ts: np.array of changepoint times. | |||
Returns | Returns | |||
------- | ------- | |||
Vector y(t). | Vector y(t). | |||
""" | """ | |||
# Intercept changes | deltas_t = (changepoint_ts[None, :] <= t[..., None]) * deltas | |||
gammas = -changepoint_ts * deltas | k_t = deltas_t.sum(axis=1) + k | |||
# Get cumulative slope and intercept at each t | m_t = (deltas_t * -changepoint_ts).sum(axis=1) + m | |||
k_t = k * np.ones_like(t) | ||||
m_t = m * np.ones_like(t) | ||||
for s, t_s in enumerate(changepoint_ts): | ||||
indx = t >= t_s | ||||
k_t[indx] += deltas[s] | ||||
m_t[indx] += gammas[s] | ||||
return k_t * t + m_t | return k_t * t + m_t | |||
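The broadcast version on the right computes the same trend as the loop on the left: the mask (changepoint_ts <= t) selects, for each time, exactly the deltas the loop would have accumulated. A standalone equivalence check on toy inputs (the function names here are illustrative, not from the library):

    import numpy as np

    def piecewise_linear_loop(t, deltas, k, m, changepoint_ts):
        # pre-1.1.1 loop version (left column)
        gammas = -changepoint_ts * deltas
        k_t, m_t = k * np.ones_like(t), m * np.ones_like(t)
        for s, t_s in enumerate(changepoint_ts):
            indx = t >= t_s
            k_t[indx] += deltas[s]
            m_t[indx] += gammas[s]
        return k_t * t + m_t

    def piecewise_linear_vec(t, deltas, k, m, changepoint_ts):
        # 1.1.1 broadcast version (right column)
        deltas_t = (changepoint_ts[None, :] <= t[..., None]) * deltas
        k_t = deltas_t.sum(axis=1) + k
        m_t = (deltas_t * -changepoint_ts).sum(axis=1) + m
        return k_t * t + m_t

    t = np.linspace(0, 1, 50)
    deltas, cps = np.array([0.2, -0.4]), np.array([0.3, 0.7])
    assert np.allclose(piecewise_linear_loop(t, deltas, 0.5, 0.1, cps),
                       piecewise_linear_vec(t, deltas, 0.5, 0.1, cps))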
@staticmethod | @staticmethod | |||
def piecewise_logistic(t, cap, deltas, k, m, changepoint_ts): | def piecewise_logistic(t, cap, deltas, k, m, changepoint_ts): | |||
"""Evaluate the piecewise logistic function. | """Evaluate the piecewise logistic function. | |||
Parameters | Parameters | |||
---------- | ---------- | |||
t: np.array of times on which the function is evaluated. | t: np.array of times on which the function is evaluated. | |||
cap: np.array of capacities at each t. | cap: np.array of capacities at each t. | |||
skipping to change at line 1368 | skipping to change at line 1369 | |||
data[component] = np.nanmean(comp, axis=1) | data[component] = np.nanmean(comp, axis=1) | |||
if self.uncertainty_samples: | if self.uncertainty_samples: | |||
data[component + '_lower'] = self.percentile( | data[component + '_lower'] = self.percentile( | |||
comp, lower_p, axis=1, | comp, lower_p, axis=1, | |||
) | ) | |||
data[component + '_upper'] = self.percentile( | data[component + '_upper'] = self.percentile( | |||
comp, upper_p, axis=1, | comp, upper_p, axis=1, | |||
) | ) | |||
return pd.DataFrame(data) | return pd.DataFrame(data) | |||
def sample_posterior_predictive(self, df): | def predict_uncertainty(self, df: pd.DataFrame, vectorized: bool) -> pd.DataFrame:
    """Prediction intervals for yhat and trend.
    Parameters
    ----------
    df: Prediction dataframe.
    vectorized: Whether to use a vectorized method for generating future draws.
    Returns
    -------
    Dataframe with uncertainty intervals.
    """
    sim_values = self.sample_posterior_predictive(df, vectorized)
    lower_p = 100 * (1.0 - self.interval_width) / 2
    upper_p = 100 * (1.0 + self.interval_width) / 2
    series = {}
    for key in ['yhat', 'trend']:
        series['{}_lower'.format(key)] = self.percentile(
            sim_values[key], lower_p, axis=1)
        series['{}_upper'.format(key)] = self.percentile(
            sim_values[key], upper_p, axis=1)
    return pd.DataFrame(series)
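The interval logic reduces to row-wise percentiles over simulated draws. A toy sketch with an assumed interval_width of 0.8 and fabricated draws (5 forecast points x 1000 simulations):

    import numpy as np
    import pandas as pd

    interval_width = 0.8
    sim_yhat = np.random.default_rng(0).normal(10, 2, size=(5, 1000))  # fabricated draws
    lower_p = 100 * (1.0 - interval_width) / 2   # 10.0
    upper_p = 100 * (1.0 + interval_width) / 2   # 90.0
    intervals = pd.DataFrame({
        'yhat_lower': np.percentile(sim_yhat, lower_p, axis=1),
        'yhat_upper': np.percentile(sim_yhat, upper_p, axis=1),
    })  # one row per forecast point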
def sample_posterior_predictive(self, df: pd.DataFrame, vectorized: bool) -> Dict[str, np.ndarray]:
"""Prophet posterior predictive samples. | """Prophet posterior predictive samples. | |||
Parameters | Parameters | |||
---------- | ---------- | |||
df: Prediction dataframe. | df: Prediction dataframe. | |||
vectorized: Whether to use a vectorized method to generate future draws. | ||||
Returns | Returns | |||
------- | ------- | |||
Dictionary with posterior predictive samples for the forecast yhat and | Dictionary with posterior predictive samples for the forecast yhat and | |||
for the trend component. | for the trend component. | |||
""" | """ | |||
n_iterations = self.params['k'].shape[0] | n_iterations = self.params['k'].shape[0] | |||
samp_per_iter = max(1, int(np.ceil( | samp_per_iter = max(1, int(np.ceil( | |||
self.uncertainty_samples / float(n_iterations) | self.uncertainty_samples / float(n_iterations) | |||
))) | ))) | |||
# Generate seasonality features once so we can re-use them. | # Generate seasonality features once so we can re-use them. | |||
seasonal_features, _, component_cols, _ = ( | seasonal_features, _, component_cols, _ = ( | |||
self.make_all_seasonality_features(df) | self.make_all_seasonality_features(df) | |||
) | ) | |||
sim_values = {'yhat': [], 'trend': []} | sim_values = {'yhat': [], 'trend': []} | |||
for i in range(n_iterations): | for i in range(n_iterations): | |||
for _j in range(samp_per_iter): | if vectorized: | |||
sim = self.sample_model( | sims = self.sample_model_vectorized( | |||
df=df, | df=df, | |||
seasonal_features=seasonal_features, | seasonal_features=seasonal_features, | |||
iteration=i, | iteration=i, | |||
s_a=component_cols['additive_terms'], | s_a=component_cols['additive_terms'], | |||
s_m=component_cols['multiplicative_terms'], | s_m=component_cols['multiplicative_terms'], | |||
n_samples=samp_per_iter | ||||
) | ) | |||
for key in sim_values: | else: | |||
sims = [ | ||||
self.sample_model( | ||||
df=df, | ||||
seasonal_features=seasonal_features, | ||||
iteration=i, | ||||
s_a=component_cols['additive_terms'], | ||||
s_m=component_cols['multiplicative_terms'], | ||||
) for _ in range(samp_per_iter) | ||||
] | ||||
for key in sim_values: | ||||
for sim in sims: | ||||
sim_values[key].append(sim[key]) | sim_values[key].append(sim[key]) | |||
for k, v in sim_values.items(): | for k, v in sim_values.items(): | |||
sim_values[k] = np.column_stack(v) | sim_values[k] = np.column_stack(v) | |||
return sim_values | return sim_values | |||
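The split of uncertainty_samples across parameter draws: with MAP fitting there is a single parameter set (n_iterations == 1) reused for every simulation, while with MCMC each retained sample contributes a few simulations. A quick arithmetic sketch (mcmc_samples=200 is an assumed example):

    import numpy as np

    uncertainty_samples = 1000
    for n_iterations in (1, 200):  # MAP fit vs. an assumed MCMC fit
        samp_per_iter = max(1, int(np.ceil(uncertainty_samples / float(n_iterations))))
        print(n_iterations, samp_per_iter, n_iterations * samp_per_iter)
    # -> 1 1000 1000
    # -> 200 5 1000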
def predictive_samples(self, df): | ||||
"""Sample from the posterior predictive distribution. Returns samples | ||||
for the main estimate yhat, and for the trend component. The shape of | ||||
each output will be (nforecast x nsamples), where nforecast is the | ||||
number of points being forecasted (the number of rows in the input | ||||
dataframe) and nsamples is the number of posterior samples drawn. | ||||
This is the argument `uncertainty_samples` in the Prophet constructor, | ||||
which defaults to 1000. | ||||
Parameters | ||||
---------- | ||||
df: Dataframe with dates for predictions (column ds), and capacity | ||||
(column cap) if logistic growth. | ||||
Returns | ||||
------- | ||||
Dictionary with keys "trend" and "yhat" containing | ||||
posterior predictive samples for that component. | ||||
""" | ||||
df = self.setup_dataframe(df.copy()) | ||||
sim_values = self.sample_posterior_predictive(df) | ||||
return sim_values | ||||
def predict_uncertainty(self, df): | ||||
"""Prediction intervals for yhat and trend. | ||||
Parameters | ||||
---------- | ||||
df: Prediction dataframe. | ||||
Returns | ||||
------- | ||||
Dataframe with uncertainty intervals. | ||||
""" | ||||
sim_values = self.sample_posterior_predictive(df) | ||||
lower_p = 100 * (1.0 - self.interval_width) / 2 | ||||
upper_p = 100 * (1.0 + self.interval_width) / 2 | ||||
series = {} | ||||
for key in ['yhat', 'trend']: | ||||
series['{}_lower'.format(key)] = self.percentile( | ||||
sim_values[key], lower_p, axis=1) | ||||
series['{}_upper'.format(key)] = self.percentile( | ||||
sim_values[key], upper_p, axis=1) | ||||
return pd.DataFrame(series) | ||||
def sample_model(self, df, seasonal_features, iteration, s_a, s_m): | def sample_model(self, df, seasonal_features, iteration, s_a, s_m): | |||
"""Simulate observations from the extrapolated generative model. | """Simulate observations from the extrapolated generative model. | |||
Parameters | Parameters | |||
---------- | ---------- | |||
df: Prediction dataframe. | df: Prediction dataframe. | |||
seasonal_features: pd.DataFrame of seasonal features. | seasonal_features: pd.DataFrame of seasonal features. | |||
iteration: Int sampling iteration to use parameters from. | iteration: Int sampling iteration to use parameters from. | |||
s_a: Indicator vector for additive components | s_a: Indicator vector for additive components | |||
s_m: Indicator vector for multiplicative components | s_m: Indicator vector for multiplicative components | |||
skipping to change at line 1484 | skipping to change at line 1474 | |||
Xb_m = np.matmul(seasonal_features.values, beta * s_m.values) | Xb_m = np.matmul(seasonal_features.values, beta * s_m.values) | |||
sigma = self.params['sigma_obs'][iteration] | sigma = self.params['sigma_obs'][iteration] | |||
noise = np.random.normal(0, sigma, df.shape[0]) * self.y_scale | noise = np.random.normal(0, sigma, df.shape[0]) * self.y_scale | |||
return pd.DataFrame({ | return pd.DataFrame({ | |||
'yhat': trend * (1 + Xb_m) + Xb_a + noise, | 'yhat': trend * (1 + Xb_m) + Xb_a + noise, | |||
'trend': trend | 'trend': trend | |||
}) | }) | |||
def sample_model_vectorized(
    self,
    df: pd.DataFrame,
    seasonal_features: pd.DataFrame,
    iteration: int,
    s_a: np.ndarray,
    s_m: np.ndarray,
    n_samples: int,
) -> List[pd.DataFrame]:
    """Simulate observations from the extrapolated generative model. Vectorized version of sample_model().
    Returns
    -------
    List (length n_samples) of DataFrames with np.arrays for trend and yhat,
    each ordered like df['t'].
    """
    # Get the seasonality and regressor components, which are deterministic per iteration
    beta = self.params['beta'][iteration]
    Xb_a = np.matmul(seasonal_features.values,
                     beta * s_a.values) * self.y_scale
    Xb_m = np.matmul(seasonal_features.values, beta * s_m.values)
    # Get the future trend, which is stochastic per iteration
    trends = self.sample_predictive_trend_vectorized(df, n_samples, iteration)  # already on the same scale as the actual data
    sigma = self.params['sigma_obs'][iteration]
    noise_terms = np.random.normal(0, sigma, trends.shape) * self.y_scale
    simulations = []
    for trend, noise in zip(trends, noise_terms):
        simulations.append(pd.DataFrame({
            'yhat': trend * (1 + Xb_m) + Xb_a + noise,
            'trend': trend
        }))
    return simulations
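The main saving over sample_model() is that all noise for one parameter iteration is drawn in a single vectorized call of shape (n_samples, len(df)) instead of one call per sample. A shape-only sketch with assumed sizes:

    import numpy as np

    n_samples, n_rows, sigma, y_scale = 50, 10, 0.05, 100.0  # assumed sizes
    trends = np.zeros((n_samples, n_rows))   # stand-in for the sampled trends
    noise_terms = np.random.normal(0, sigma, trends.shape) * y_scale
    assert noise_terms.shape == (n_samples, n_rows)  # one draw per (path, time)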
def sample_predictive_trend(self, df, iteration): | def sample_predictive_trend(self, df, iteration): | |||
"""Simulate the trend using the extrapolated generative model. | """Simulate the trend using the extrapolated generative model. | |||
Parameters | Parameters | |||
---------- | ---------- | |||
df: Prediction dataframe. | df: Prediction dataframe. | |||
iteration: Int sampling iteration to use parameters from. | iteration: Int sampling iteration to use parameters from. | |||
Returns | Returns | |||
------- | ------- | |||
skipping to change at line 1537 | skipping to change at line 1560 | |||
trend = self.piecewise_linear(t, deltas, k, m, changepoint_ts) | trend = self.piecewise_linear(t, deltas, k, m, changepoint_ts) | |||
elif self.growth == 'logistic': | elif self.growth == 'logistic': | |||
cap = df['cap_scaled'] | cap = df['cap_scaled'] | |||
trend = self.piecewise_logistic(t, cap, deltas, k, m, | trend = self.piecewise_logistic(t, cap, deltas, k, m, | |||
changepoint_ts) | changepoint_ts) | |||
elif self.growth == 'flat': | elif self.growth == 'flat': | |||
trend = self.flat_trend(t, m) | trend = self.flat_trend(t, m) | |||
return trend * self.y_scale + df['floor'] | return trend * self.y_scale + df['floor'] | |||
def sample_predictive_trend_vectorized(self, df: pd.DataFrame, n_samples: int, iteration: int = 0) -> np.ndarray:
    """Sample draws of the future trend values. Vectorized version of sample_predictive_trend().
    Returns
    -------
    Draws of the trend values with shape (n_samples, len(df)). Values are on the scale of the original data.
    """
    deltas = self.params["delta"][iteration]
    m = self.params["m"][iteration]
    k = self.params["k"][iteration]
    if self.growth == "linear":
        expected = self.piecewise_linear(df["t"].values, deltas, k, m, self.changepoints_t)
    elif self.growth == "logistic":
        expected = self.piecewise_logistic(
            df["t"].values, df["cap_scaled"].values, deltas, k, m, self.changepoints_t
        )
    elif self.growth == "flat":
        expected = self.flat_trend(df["t"].values, m)
    else:
        raise NotImplementedError
    uncertainty = self._sample_uncertainty(df, n_samples, iteration)
    return (
        (np.tile(expected, (n_samples, 1)) + uncertainty) * self.y_scale +
        np.tile(df["floor"].values, (n_samples, 1))
    )
def _sample_uncertainty(self, df: pd.DataFrame, n_samples: int, iteration: int = 0) -> np.ndarray:
    """Sample draws of future trend changes, vectorizing as much as possible.
    Parameters
    ----------
    df: DataFrame with columns `t` (time scaled to the model context), trend, and cap.
    n_samples: Number of future paths of the trend to simulate
    iteration: The iteration of the parameter set to use. Default 0, the first iteration.
    Returns
    -------
    Draws of the trend changes with shape (n_samples, len(df)). Values are standardized.
    """
    # handle only historic data
    if df["t"].max() <= 1:
        # there is no trend uncertainty in historic trends
        uncertainties = np.zeros((n_samples, len(df)))
    else:
        future_df = df.loc[df["t"] > 1]
        n_length = len(future_df)
        # handle 1 length futures by using history
        if n_length > 1:
            single_diff = np.diff(future_df["t"]).mean()
        else:
            single_diff = np.diff(self.history["t"]).mean()
        change_likelihood = len(self.changepoints_t) * single_diff
        deltas = self.params["delta"][iteration]
        m = self.params["m"][iteration]
        k = self.params["k"][iteration]
        mean_delta = np.mean(np.abs(deltas)) + 1e-8
        if self.growth == "linear":
            mat = self._make_trend_shift_matrix(mean_delta, change_likelihood, n_length, n_samples=n_samples)
            uncertainties = mat.cumsum(axis=1).cumsum(axis=1)  # from slope changes to actual values
            uncertainties *= single_diff  # scaled by the actual meaning of the slope
        elif self.growth == "logistic":
            mat = self._make_trend_shift_matrix(mean_delta, change_likelihood, n_length, n_samples=n_samples)
            uncertainties = self._logistic_uncertainty(
                mat=mat,
                deltas=deltas,
                k=k,
                m=m,
                cap=future_df["cap_scaled"].values,
                t_time=future_df["t"].values,
                n_length=n_length,
                single_diff=single_diff,
            )
        elif self.growth == "flat":
            # no trend uncertainty when there is no growth
            uncertainties = np.zeros((n_samples, n_length))
        else:
            raise NotImplementedError
        # handle past included in dataframe
        if df["t"].min() <= 1:
            past_uncertainty = np.zeros((n_samples, np.sum(df["t"] <= 1)))
            uncertainties = np.concatenate([past_uncertainty, uncertainties], axis=1)
    return uncertainties
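Why the linear branch applies cumsum twice: the first pass turns per-step slope shifts into slopes, the second turns slopes into level deviations, and single_diff rescales because each slope acts over one step of t. A toy illustration (values fabricated):

    import numpy as np

    shifts = np.array([[0.0, 1.0, 0.0, 0.0]])  # one slope shift at step 1
    slopes = shifts.cumsum(axis=1)             # [[0. 1. 1. 1.]]
    levels = slopes.cumsum(axis=1)             # [[0. 1. 2. 3.]]
    single_diff = 0.1                          # assumed spacing of t
    print(levels * single_diff)                # trend deviation per future step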
@staticmethod
def _make_trend_shift_matrix(
    mean_delta: float, likelihood: float, future_length: float, n_samples: int
) -> np.ndarray:
    """
    Creates a matrix of random trend shifts based on historical likelihood and size of shifts.
    Can be used for either linear or logistic trend shifts.
    Each row represents a different sample of a possible future, and each column is a time step into the future.
    """
    # create a bool matrix of where these trend shifts should go
    bool_slope_change = np.random.uniform(size=(n_samples, future_length)) < likelihood
    shift_values = np.random.laplace(0, mean_delta, size=bool_slope_change.shape)
    mat = shift_values * bool_slope_change
    n_mat = np.hstack([np.zeros((len(mat), 1)), mat])[:, :-1]
    mat = (n_mat + mat) / 2
    return mat
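Each row of the matrix is one possible future; a cell is nonzero with probability `likelihood` and holds a Laplace-distributed shift, before the final averaging smears each shift over two adjacent steps. A sanity sketch with assumed inputs:

    import numpy as np

    likelihood, mean_delta = 0.05, 0.3           # assumed values
    bool_change = np.random.uniform(size=(1000, 100)) < likelihood
    mat = bool_change * np.random.laplace(0, mean_delta, size=(1000, 100))
    print((mat != 0).mean())                     # close to likelihood = 0.05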
@staticmethod
def _make_historical_mat_time(deltas, changepoints_t, t_time, n_row=1, single_diff=None):
    """
    Creates a matrix of slope-deltas where these changes occurred in training data according to the trained prophet obj
    """
    if single_diff is None:
        single_diff = np.diff(t_time).mean()
    prev_time = np.arange(0, 1 + single_diff, single_diff)
    idxs = []
    for changepoint in changepoints_t:
        idxs.append(np.where(prev_time > changepoint)[0][0])
    prev_deltas = np.zeros(len(prev_time))
    prev_deltas[idxs] = deltas
    prev_deltas = np.repeat(prev_deltas.reshape(1, -1), n_row, axis=0)
    return prev_deltas, prev_time
def _logistic_uncertainty(
    self,
    mat: np.ndarray,
    deltas: np.ndarray,
    k: float,
    m: float,
    cap: np.ndarray,
    t_time: np.ndarray,
    n_length: int,
    single_diff: float = None,
) -> np.ndarray:
    """
    Vectorizes prophet's logistic uncertainty by creating a matrix of future possible trends.
    Parameters
    ----------
    mat: A trend shift matrix returned by _make_trend_shift_matrix()
    deltas: The size of the trend changes at each changepoint, estimated by the model
    k: Float initial rate.
    m: Float initial offset.
    cap: np.array of capacities at each t.
    t_time: The values of t in the model context (i.e. scaled so that anything > 1 represents the future)
    n_length: For each path, the number of future steps to simulate
    single_diff: The difference between each t step in the model context. Default None, inferred from t_time.
    Returns
    -------
    A numpy array with shape (n_samples, n_length), representing the width of the uncertainty interval
    (standardized, not on the same scale as the actual data values) around 0.
    """
    def ffill(arr):
        mask = arr == 0
        idx = np.where(~mask, np.arange(mask.shape[1]), 0)
        np.maximum.accumulate(idx, axis=1, out=idx)
        return arr[np.arange(idx.shape[0])[:, None], idx]
    # for logistic growth we need to evaluate the trend all the way from the start of the train item
    historical_mat, historical_time = self._make_historical_mat_time(deltas, self.changepoints_t, t_time, len(mat), single_diff)
    mat = np.concatenate([historical_mat, mat], axis=1)
    full_t_time = np.concatenate([historical_time, t_time])
    # apply logistic growth logic on the slope changes
    k_cum = np.concatenate((np.ones((mat.shape[0], 1)) * k, np.where(mat, np.cumsum(mat, axis=1) + k, 0)), axis=1)
    k_cum_b = ffill(k_cum)
    gammas = np.zeros_like(mat)
    for i in range(mat.shape[1]):
        x = full_t_time[i] - m - np.sum(gammas[:, :i], axis=1)
        ks = 1 - k_cum_b[:, i] / k_cum_b[:, i + 1]
        gammas[:, i] = x * ks
    # everything before the last n_length steps is the historical period and not needed, so keep only the last n_length
    k_t = (mat.cumsum(axis=1) + k)[:, -n_length:]
    m_t = (gammas.cumsum(axis=1) + m)[:, -n_length:]
    sample_trends = cap / (1 + np.exp(-k_t * (t_time - m_t)))
    # remove the mean because we only need width of the uncertainty centered around 0
    # we will add the width to the main forecast - yhat (which is the mean) - later
    return sample_trends - sample_trends.mean(axis=0)
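The inner ffill() forward-fills zeros row-wise so that each step sees the most recent active rate. Its index trick in isolation:

    import numpy as np

    arr = np.array([[1.0, 0.0, 0.0, 3.0, 0.0]])
    mask = arr == 0
    idx = np.where(~mask, np.arange(mask.shape[1]), 0)
    np.maximum.accumulate(idx, axis=1, out=idx)
    print(arr[np.arange(idx.shape[0])[:, None], idx])  # [[1. 1. 1. 3. 3.]]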
def predictive_samples(self, df: pd.DataFrame, vectorized: bool = True):
    """Sample from the posterior predictive distribution. Returns samples
    for the main estimate yhat, and for the trend component. The shape of
    each output will be (nforecast x nsamples), where nforecast is the
    number of points being forecasted (the number of rows in the input
    dataframe) and nsamples is the number of posterior samples drawn.
    This is the argument `uncertainty_samples` in the Prophet constructor,
    which defaults to 1000.
    Parameters
    ----------
    df: Dataframe with dates for predictions (column ds), and capacity
        (column cap) if logistic growth.
    vectorized: Whether to use a vectorized method to compute possible draws. Suggest using
        True (the default) for much faster runtimes in most cases,
        except when (growth = 'logistic' and mcmc_samples > 0).
    Returns
    -------
    Dictionary with keys "trend" and "yhat" containing
    posterior predictive samples for that component.
    """
    df = self.setup_dataframe(df.copy())
    return self.sample_posterior_predictive(df, vectorized)
def percentile(self, a, *args, **kwargs): | def percentile(self, a, *args, **kwargs): | |||
""" | """ | |||
We rely on np.nanpercentile in the rare instances where there | We rely on np.nanpercentile in the rare instances where there | |||
are a small number of bad samples with MCMC that contain NaNs. | are a small number of bad samples with MCMC that contain NaNs. | |||
However, since np.nanpercentile is far slower than np.percentile, | However, since np.nanpercentile is far slower than np.percentile, | |||
we only fall back to it if the array contains NaNs. See | we only fall back to it if the array contains NaNs. See | |||
https://github.com/facebook/prophet/issues/1310 for more details. | https://github.com/facebook/prophet/issues/1310 for more details. | |||
""" | """ | |||
fn = np.nanpercentile if np.isnan(a).any() else np.percentile | fn = np.nanpercentile if np.isnan(a).any() else np.percentile | |||
return fn(a, *args, **kwargs) | return fn(a, *args, **kwargs) | |||
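The fallback in isolation: np.nanpercentile ignores NaNs, whereas np.percentile would propagate them, so the slower function is only used when actually needed:

    import numpy as np

    a = np.array([1.0, 2.0, np.nan, 4.0])
    fn = np.nanpercentile if np.isnan(a).any() else np.percentile
    print(fn(a, 50))  # 2.0; np.percentile(a, 50) would return nan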
def make_future_dataframe(self, periods, freq='D', include_history=True): | def make_future_dataframe(self, periods, freq='D', include_history=True): | |||
"""Simulate the trend using the extrapolated generative model. | """Simulate the trend using the extrapolated generative model. | |||
Parameters | Parameters | |||
---------- | ---------- | |||
periods: Int number of periods to forecast forward. | periods: Int number of periods to forecast forward. | |||
freq: Any valid frequency for pd.date_range, such as 'D' or 'M'. | freq: Any valid frequency for pd.date_range, such as 'D' or 'M'. | |||
include_history: Boolean to include the historical dates in the data | include_history: Boolean to include the historical dates in the data | |||
frame for predictions. | frame for predictions. | |||
Returns | Returns | |||
------- | ------- | |||
pd.Dataframe that extends forward from the end of self.history for the | pd.Dataframe that extends forward from the end of self.history for the | |||
requested number of periods. | requested number of periods. | |||
""" | """ | |||
if self.history_dates is None: | if self.history_dates is None: | |||
raise Exception('Model has not been fit.') | raise Exception('Model has not been fit.') | |||
if freq is None: | ||||
# taking the tail makes freq inference more reliable | ||||
freq = pd.infer_freq(self.history_dates.tail(5)) | ||||
# returns None if inference failed | ||||
if freq is None: | ||||
raise Exception('Unable to infer `freq`') | ||||
last_date = self.history_dates.max() | last_date = self.history_dates.max() | |||
dates = pd.date_range( | dates = pd.date_range( | |||
start=last_date, | start=last_date, | |||
periods=periods + 1, # An extra in case we include start | periods=periods + 1, # An extra in case we include start | |||
freq=freq) | freq=freq) | |||
dates = dates[dates > last_date] # Drop start if equals last_date | dates = dates[dates > last_date] # Drop start if equals last_date | |||
dates = dates[:periods] # Return correct number of periods | dates = dates[:periods] # Return correct number of periods | |||
if include_history: | if include_history: | |||
dates = np.concatenate((np.array(self.history_dates), dates)) | dates = np.concatenate((np.array(self.history_dates), dates)) | |||
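Typical usage of this method (m is assumed to be an already-fitted Prophet instance):

    # Extend 30 periods past the end of history at daily frequency,
    # then forecast; in 1.1.1 predict() uses the vectorized path by default.
    future = m.make_future_dataframe(periods=30, freq='D')
    forecast = m.predict(future)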
End of changes. 19 change blocks. 68 lines changed or deleted, 340 lines changed or added.