Pipeline
In [85]:
import os, sys, importlib
In [86]:
# need this so we can import from different folders
sys.path.append("../Functions")
In [87]:
sys.path.append("../Models")
In [88]:
from preprocessing_functions import *
from performance_functions import *
from aggregate_functions import *
from dampner import dampen, trend_adjuster
In [89]:
from SARIMAX import rolling_forecast_SARIMA
from simple_growth import rolling_forecast_LY_perc_inc
from lagged_average import rolling_forecast_lagged_average
from holt_winters import rolling_forecast_holt_winters
from ensemble import rolling_forecast_ensemble
In [90]:
import pickle
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'
# suppress SettingWithCopyWarning. In future, use .loc instead of .iloc when editing DataFrames
In [91]:
# the series we want to run the pipeline over
series_name = "dummy_series"
# the data files
data = "dummy_data.csv"
user_data = "TotalRegisteredUsers_dummy.csv"
In [92]:
app_data = read_file(data)
In [135]:
app_data.head(5)  # time series data must have a date format of dd/mm/yyyy
Out[135]:
| | Daily | dummy_series |
|---|---|---|
| 0 | 01/01/2021 | 46.323724 |
| 1 | 02/01/2021 | 9.898171 |
| 2 | 03/01/2021 | 10.763911 |
| 3 | 04/01/2021 | 319.980215 |
| 4 | 05/01/2021 | 276.882993 |
In [93]:
# preprocess and resample into weekly data
ts_data = preprocess(app_data, series_name)
ts_data_weekly = resample(ts_data, "W")
ts_data_weekly = ts_data_weekly.iloc[:-1]  # drop the last (possibly incomplete) week
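preprocess and resample are repo helpers (from preprocessing_functions), so their exact behaviour isn't shown here. As a rough sketch of what this step is assumed to do with the daily dd/mm/yyyy data (the column names and the weekly-sum choice are assumptions, not the repo implementation):

# Hypothetical sketch only -- not the repo implementation.
import pandas as pd

def preprocess_sketch(df, series_name):
    # parse dd/mm/yyyy dates and keep just the date + series columns
    out = df[["Daily", series_name]].copy()
    out["Daily"] = pd.to_datetime(out["Daily"], dayfirst=True)
    return out.sort_values("Daily").reset_index(drop=True)

def resample_sketch(df, freq="W"):
    # sum the daily values into week-ending (Sunday) buckets
    return df.set_index("Daily").resample(freq).sum().reset_index()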
In [94]:
# where we want to start assessing model performance from
weekly_measure_from = "2022-03-27"
monthly_measure_from = "2022-03-31"
Which is the best 1-week-out model?
In [95]:
# run the models for 1, 2, 3 and 4 week out forecasts and add the outputs to a list
weekly_model_list_nstep_4 = []
weekly_model_list_nstep_3 = []
weekly_model_list_nstep_2 = []
weekly_model_list_nstep_1 = []
weekly_model_list_nstep_i = [
    weekly_model_list_nstep_1,
    weekly_model_list_nstep_2,
    weekly_model_list_nstep_3,
    weekly_model_list_nstep_4,
]
In [96]:
for i in range(4):
    nsteps = i + 1
    SARIMA = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 1, 0, 1, 0, 52],
        bh_adj=False,
        bh_scale=False,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA)
    SARIMA_bh = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 1, 0, 1, 0, 52],
        bh_adj=True,
        bh_scale=False,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA_bh)
    SARIMA_bh_adj = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 1, 0, 1, 0, 52],
        bh_adj=True,
        bh_scale=True,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA_bh_adj)
    SARIMA_basic = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 0, 0, 1, 0, 52],
        bh_adj=False,
        bh_scale=False,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA_basic)
    lagged_av_1 = rolling_forecast_lagged_average(
        time_series=ts_data_weekly,
        window_end_date=weekly_measure_from,
        lag=1,
        num_lags=1,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(lagged_av_1)
    lagged_av_3 = rolling_forecast_lagged_average(
        time_series=ts_data_weekly,
        window_end_date=weekly_measure_from,
        lag=1,
        num_lags=3,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(lagged_av_3)
    simple_growth = rolling_forecast_LY_perc_inc(
        time_series=ts_data_weekly, window_end_date=weekly_measure_from, nsteps=nsteps
    )
    weekly_model_list_nstep_i[i].append(simple_growth)
    ensemble = rolling_forecast_ensemble([SARIMA, lagged_av_1])
    weekly_model_list_nstep_i[i].append(ensemble)
100 % done
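For reference, the lagged-average baselines above (num_lags_1, num_lags_3) come from lagged_average.py; this is a minimal sketch of the idea, assuming the forecast is simply the mean of the last num_lags observed values spaced lag periods apart (so num_lags=1 just repeats the previous value). It is an illustration, not the repo implementation.

# Hypothetical illustration of the lagged-average idea -- not the repo implementation.
def lagged_average_sketch(values, lag=1, num_lags=3):
    # values: list or Series of historical observations, most recent last
    recent = [values[-lag * k] for k in range(1, num_lags + 1)]
    return sum(recent) / num_lags

# e.g. lagged_average_sketch([100, 110, 120], lag=1, num_lags=3) -> 110.0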
In [97]:
number_of_weekly_models = len(weekly_model_list_nstep_i[0])
In [98]:
# read in the equivalent Prophet outputs. Note: the initialisation params (e.g. series_name,
# weekly_measure_from = '2022-03-27', etc.) also need to be set in the Prophet script before
# running it to produce these outputs.
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_weekly_prophet_nsteps_1"
)
infile = open(file_path, "rb")
weekly_prophet = pickle.load(infile)
infile.close()
weekly_model_list_nstep_i[0].append(weekly_prophet)
In [99]:
# see which is the best 1 week out model
performance_df = pd.DataFrame()
for model in weekly_model_list_nstep_i[0]:
    performance_df = pd.concat(
        [performance_df, performance_metrics(model[0], performance_lag=12)], axis=0
    )
performance_df = performance_df.reset_index().drop("index", axis=1)
performance_df
Out[99]:
| | model | mae | rmse | mape |
|---|---|---|---|---|
| 0 | (0,1,1)(0,1,0)(52)_nsteps_1 | 124.586537 | 146.311864 | 0.027603 |
| 1 | (0,1,1)(0,1,0)(52)_bh_nsteps_1 | 98.696309 | 124.022797 | 0.021930 |
| 2 | (0,1,1)(0,1,0)(52)_bh_scale_nsteps_1 | 99.229810 | 130.201062 | 0.022068 |
| 3 | (0,1,0)(0,1,0)(52)_nsteps_1 | 134.268798 | 159.653457 | 0.029609 |
| 4 | num_lags_1_nsteps_1 | 89.483896 | 100.889996 | 0.019522 |
| 5 | num_lags_3_nsteps_1 | 88.406553 | 111.969744 | 0.019290 |
| 6 | LY_growth_nsteps_1 | 280.278605 | 314.921135 | 0.061548 |
| 7 | 2_Model_Ensemble | 92.376607 | 106.278228 | 0.020344 |
| 8 | prophet_nsteps_1 | 78.369906 | 96.137857 | 0.017295 |
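The mae, rmse and mape columns are the standard error metrics (MAPE is reported as a fraction, so 0.0276 is roughly 2.76%). performance_metrics is a repo helper whose exact windowing isn't shown here; this is a minimal sketch of the calculations themselves, given aligned actuals y and forecasts yhat.

import numpy as np

def error_metrics_sketch(y, yhat):
    # y: actuals, yhat: forecasts, aligned arrays of equal length
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    err = yhat - y
    return {
        "mae": np.mean(np.abs(err)),
        "rmse": np.sqrt(np.mean(err ** 2)),
        "mape": np.mean(np.abs(err) / np.abs(y)),
    }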
Historic performance for some of the weekly models
In [100]:
# get all the historic forecasts into a big dataframe
validation_df = ts_data_weekly.copy()
for model in weekly_model_list_nstep_i[0]:
    validation_df[model[2]] = model[0][model[2]]
In [101]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(
    title=series_name + " Weekly Model Performance", xlabel="Date", ylabel=series_name
)
ax1 = plt.plot(validation_df.iloc[:, 0], validation_df[series_name], color="blue")
ax4 = plt.plot(
    validation_df.iloc[:, 0], validation_df["num_lags_3_nsteps_1"], color="orange"
)
ax5 = plt.plot(
    validation_df.iloc[:, 0], validation_df["(0,1,1)(0,1,0)(52)_nsteps_1"], color="green"
)
plt.show()
4-week-out forecast performance change with each week gained
In [102]:
# see how a 4 week out forecast improves with 0, 1, 2 and 3 weeks of data gained.
# weekly_model_list_out_4_nstep_4 means we're forecasting nsteps = 4 ahead, so this is where the 0-weeks-gained results go.
# weekly_model_list_out_4_nstep_3 means we're forecasting nsteps = 3 ahead, so this is where the 1-week-gained results go.
# etc...
weekly_model_list_out_4_nstep_4 = []
weekly_model_list_out_4_nstep_3 = []
weekly_model_list_out_4_nstep_2 = []
weekly_model_list_out_4_nstep_1 = []
weekly_model_list_out_4_nstep_i = [
    weekly_model_list_out_4_nstep_1,
    weekly_model_list_out_4_nstep_2,
    weekly_model_list_out_4_nstep_3,
    weekly_model_list_out_4_nstep_4,
]
In [103]:
for i in range(4):
    for j in range(number_of_weekly_models):
        nsteps = i + 1
        weekly_model_list_out_4_nstep_i[i].append(
            n_weeks_forecast(
                weekly_model_list_nstep_i[i][j][0], nsteps=nsteps, n_week_sum=4
            )
        )
In [104]:
# read in the equivalent Prophet outputs. As before, the initialisation params (e.g. series_name,
# weekly_measure_from = '2022-03-27', etc.) also need to be set in the Prophet script before
# running it to produce these outputs.
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_weekly_prophet_out_4_nsteps_i"
)
infile = open(file_path, "rb")
weekly_prophet_out_4_nsteps_i = pickle.load(infile)
infile.close()
In [105]:
for i in range(len(weekly_model_list_out_4_nstep_i)):
    weekly_model_list_out_4_nstep_i[i].append(weekly_prophet_out_4_nsteps_i[i])
In [106]:
# for each of the weekly_model_list_out_4_nstep_i, create a model performance dataframe
performance_df_out_4_nsteps_1 = pd.DataFrame()
performance_df_out_4_nsteps_2 = pd.DataFrame()
performance_df_out_4_nsteps_3 = pd.DataFrame()
performance_df_out_4_nsteps_4 = pd.DataFrame()
performance_df_out_4_nsteps_i = [
    performance_df_out_4_nsteps_1,
    performance_df_out_4_nsteps_2,
    performance_df_out_4_nsteps_3,
    performance_df_out_4_nsteps_4,
]
In [107]:
for i in range(len(performance_df_out_4_nsteps_i)):
    for model in weekly_model_list_out_4_nstep_i[i]:
        performance_df_out_4_nsteps_i[i] = pd.concat(
            [
                performance_df_out_4_nsteps_i[i],
                performance_metrics(model, performance_lag=12),
            ],
            axis=0,
        )
    performance_df_out_4_nsteps_i[i] = (
        performance_df_out_4_nsteps_i[i].reset_index().drop("index", axis=1)
    )
In [108]:
# as an example, here is the model summary with 0 weeks of data gained (same format as before)
performance_df_out_4_nsteps_i[3]
Out[108]:
| | model | mae | rmse | mape |
|---|---|---|---|---|
| 0 | (0,1,1)(0,1,0)(52)_nsteps_4 | 353.171879 | 408.842223 | 0.018768 |
| 1 | (0,1,1)(0,1,0)(52)_bh_nsteps_4 | 191.792166 | 223.918597 | 0.010198 |
| 2 | (0,1,1)(0,1,0)(52)_bh_scale_nsteps_4 | 200.388121 | 243.450806 | 0.010673 |
| 3 | (0,1,0)(0,1,0)(52)_nsteps_4 | 545.775417 | 641.017678 | 0.029175 |
| 4 | num_lags_1_nsteps_4 | 396.441206 | 529.114598 | 0.021258 |
| 5 | num_lags_3_nsteps_4 | 455.118347 | 538.225582 | 0.024047 |
| 6 | LY_growth_nsteps_4 | 849.226149 | 1023.395675 | 0.045192 |
| 7 | 2_Model_Ensemble | 307.091645 | 433.848036 | 0.016471 |
| 8 | prophet_nsteps_4 | 120.931137 | 142.919318 | 0.006422 |
In [109]:
# create a df of the mape values for each additional week gained
mape_df = error_df(
    model_performance_dfs=list(reversed(performance_df_out_4_nsteps_i)),
    number_of_known_weeks=[0, 1, 2, 3],
    metric="mape",
)
In [110]:
# create a df of the rmse values for each additional week gained
rmse_df = error_df(
    model_performance_dfs=list(reversed(performance_df_out_4_nsteps_i)),
    number_of_known_weeks=[0, 1, 2, 3],
    metric="rmse",
)
In [111]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(
    title="Weekly Models MAPE Reduction", xlabel="number of known weeks", ylabel="MAPE"
)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax.xaxis.set_major_locator(mtick.MultipleLocator(1))
col_names = mape_df.columns.values.tolist()
col_names.remove("number of known weeks")
for col in col_names:
    plt.plot(mape_df["number of known weeks"], mape_df[col], label=col)
plt.legend()
plt.show()
In [112]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(
    title="Weekly Models RMSE Reduction", xlabel="number of known weeks", ylabel="RMSE"
)
ax.xaxis.set_major_locator(mtick.MultipleLocator(1))
col_names = rmse_df.columns.values.tolist()
col_names.remove("number of known weeks")
for col in col_names:
    plt.plot(rmse_df["number of known weeks"], rmse_df[col], label=col)
plt.legend()
plt.show()
A look at the Christmas period by week
In [113]:
# take the weekly SARIMA model out 13 steps. Note this may not be the best weekly model for every series!
weekly_sarima_nsteps_13 = rolling_forecast_SARIMA(
    ts_data_weekly,
    window_end_date="2022-10-30",
    model_params=[0, 1, 1, 0, 1, 0, 52],
    bh_adj=False,
    bh_scale=False,
    nsteps=13,
)
100 % done
In [114]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(title=series_name + " Forecasts by Week", xlabel="Date", ylabel=series_name)
ax = plt.plot(
    weekly_sarima_nsteps_13[0].iloc[:, 0].tail(20),
    weekly_sarima_nsteps_13[0].iloc[:, 1].tail(20),
)
ax = plt.plot(
    weekly_sarima_nsteps_13[1].iloc[:, 0], weekly_sarima_nsteps_13[1].iloc[:, 1]
)
plt.fill_between(
    weekly_sarima_nsteps_13[3].iloc[:, 0],
    weekly_sarima_nsteps_13[3].iloc[:, 1],
    weekly_sarima_nsteps_13[3].iloc[:, 2],
    color="r",
    alpha=0.1,
)
plt.show()
Which is the best 1-month-out model?
In [115]:
# use the weekly SARIMA model to make monthly forecasts (4 weeks ~ 1 month)
weekly_SARIMA_nsteps_4 = rolling_forecast_SARIMA(
    ts_data_weekly,
    window_end_date=weekly_measure_from,
    model_params=[0, 1, 1, 0, 1, 0, 52],
    bh_adj=False,
    bh_scale=False,
    nsteps=4,
)
100 % done
In [116]:
# to aggregate weekly forecasts into monthly ones, we need to know how to distribute the weeks
# which overlap two months into each of those months correctly
distr = weekly_distribution(app_data, series_name)
/Users/administrator/Documents/Milans_Forecasting_Repo/src/Model Selection/../Functions/aggregate_functions.py:35: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  dummy_data = dummy_data.groupby(
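weekly_distribution comes from aggregate_functions.py; as a hedged sketch of the idea, it is assumed to use the daily history to work out what fraction of each week's total falls in each calendar month, so that a week straddling a month boundary can later be split pro-rata. The column names and output shape below are illustrative assumptions, not the repo implementation.

# Hypothetical sketch only -- not the repo implementation.
import pandas as pd

def weekly_distribution_sketch(daily_df, series_name):
    df = daily_df[["Daily", series_name]].copy()
    df["Daily"] = pd.to_datetime(df["Daily"], dayfirst=True)
    df["week"] = df["Daily"].dt.to_period("W")
    df["month"] = df["Daily"].dt.to_period("M")
    # share of each week's total that lands in each month
    by_week_month = df.groupby(["week", "month"])[series_name].sum()
    return by_week_month / by_week_month.groupby(level="week").transform("sum")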
In [117]:
# aggregate the weekly forecasts into monthly ones
start = weekly_SARIMA_nsteps_4[0].iloc[:, 2].first_valid_index()
end = len(weekly_SARIMA_nsteps_4[0])
weekly_SARIMA_nsteps_4_trimmed = (
    weekly_SARIMA_nsteps_4[0].iloc[start:end].drop(columns=series_name, axis=1)
)
monthly_forecasts = weekly_to_monthly_summary(weekly_SARIMA_nsteps_4_trimmed, distr)
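weekly_to_monthly_summary is also a repo helper; a hedged sketch of the aggregation it is assumed to perform, using the per-week shares from the distribution sketch above (input column names are illustrative):

# Hypothetical sketch only -- not the repo implementation.
import pandas as pd

def weekly_to_monthly_sketch(weekly_forecasts, distr):
    # weekly_forecasts: DataFrame with a "week" Period column and a "forecast" column
    # distr: Series indexed by (week, month) giving each week's share in that month
    pieces = []
    for (week, month), share in distr.items():
        match = weekly_forecasts.loc[weekly_forecasts["week"] == week, "forecast"]
        if not match.empty:
            pieces.append({"month": month, "value": match.iloc[0] * share})
    return pd.DataFrame(pieces).groupby("month", as_index=False)["value"].sum()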
In [118]:
# see the performance by month
monthly_forecasts_df = resample(preprocess(app_data, series_name), "M").merge(
    monthly_forecasts, how="inner"
)
pm1 = performance_metrics(monthly_forecasts_df, performance_lag=40, summarise=False)
In [119]:
# resample the data as monthly
ts_data_monthly = resample(preprocess(app_data, series_name), "M")
In [120]:
# run some other monthly models as a point of comparison and get their respective performances by month
monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date=monthly_measure_from,
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=1,
)
pm2 = performance_metrics(monthly_SARIMA[0], performance_lag=7, summarise=False)
monthly_lagged_av_1 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=1,
    nsteps=1,
)
pm3 = performance_metrics(monthly_lagged_av_1[0], performance_lag=7, summarise=False)
monthly_lagged_av_3 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=3,
    nsteps=1,
)
pm4 = performance_metrics(monthly_lagged_av_3[0], performance_lag=7, summarise=False)
monthly_simple_growth = rolling_forecast_LY_perc_inc(
    time_series=ts_data_monthly, window_end_date=monthly_measure_from, nsteps=1
)
pm5 = performance_metrics(monthly_simple_growth[0], performance_lag=7, summarise=False)
monthly_ensemble = rolling_forecast_ensemble([monthly_SARIMA, monthly_lagged_av_1])
pm6 = performance_metrics(monthly_ensemble[0], performance_lag=7, summarise=False)
100 % done
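rolling_forecast_ensemble (from ensemble.py) combines the outputs of the models it is given. As a minimal sketch, it is assumed here to take a simple average of the point forecasts, with each model output shaped like (forecast_df, ..., model_name, ...) as used elsewhere in this notebook; the real helper may differ.

# Hypothetical sketch of a two-model average -- not the repo implementation.
def ensemble_sketch(model_outputs, date_col="Daily"):
    merged = model_outputs[0][0][[date_col]].copy()
    names = []
    for output in model_outputs:
        forecast_df, name = output[0], output[2]
        merged[name] = forecast_df[name].values
        names.append(name)
    # simple (unweighted) average of the component forecasts
    merged["2_Model_Ensemble"] = merged[names].mean(axis=1)
    return merged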
In [121]:
# read in the equivalent Prophet outputs. As before, the initialisation params (e.g. series_name,
# weekly_measure_from = '2022-03-27', etc.) also need to be set in the Prophet script before
# running it to produce these outputs.
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_monthly_prophet_forecasts"
)
infile = open(file_path, "rb")
pm_prophet = pickle.load(infile)
infile.close()
In [122]:
performance_merger([pm1, pm2, pm3, pm4, pm5, pm_prophet, pm6], "perc error")
Out[122]:
| | Daily | (0,1,1)(0,1,0)(52)_nsteps_4 | (0,1,0)(0,1,0)(12)_nsteps_1 | num_lags_1_nsteps_1 | num_lags_3_nsteps_1 | LY_growth_nsteps_1 | prophet_nsteps_4 | 2_Model_Ensemble |
|---|---|---|---|---|---|---|---|---|
| 0 | 2022-04-30 | 0.004359 | 0.013114 | 0.016558 | 0.120906 | 0.009585 | 0.006213 | 0.014836 |
| 1 | 2022-05-31 | 0.008269 | 0.024531 | 0.054608 | 0.103234 | 0.005714 | 0.001580 | 0.039570 |
| 2 | 2022-06-30 | 0.000893 | 0.042438 | 0.079931 | 0.046250 | 0.005164 | 0.009493 | 0.061184 |
| 3 | 2022-07-31 | 0.013536 | 0.055924 | 0.027824 | 0.062389 | 0.138324 | 0.025953 | 0.014050 |
| 4 | 2022-08-31 | 0.044651 | 0.013458 | 0.011424 | 0.057883 | 0.016108 | 0.007904 | 0.012441 |
| 5 | 2022-09-30 | 0.000153 | 0.031160 | 0.064928 | 0.083028 | 0.011586 | 0.002777 | 0.048044 |
| 6 | 2022-10-31 | 0.109748 | 0.031588 | 0.077994 | 0.034346 | 0.028391 | 0.121028 | 0.054791 |
Which is the best 3-month-out model?
In [123]:
# do exactly the same as before but now forecasting 3 months ahead instead of 1
weekly_SARIMA_nsteps_12 = rolling_forecast_SARIMA(
    ts_data_weekly,
    window_end_date=weekly_measure_from,
    model_params=[0, 1, 1, 0, 1, 0, 52],
    bh_adj=False,
    bh_scale=False,
    nsteps=12,
)
start = weekly_SARIMA_nsteps_12[0].iloc[:, 2].first_valid_index()
end = len(weekly_SARIMA_nsteps_12[0])
weekly_SARIMA_nsteps_12_trimmed = (
    weekly_SARIMA_nsteps_12[0].iloc[start:end].drop(columns=series_name, axis=1)
)
monthly_forecasts = weekly_to_monthly_summary(weekly_SARIMA_nsteps_12_trimmed, distr)
monthly_forecasts_df = resample(preprocess(app_data, series_name), "M").merge(
    monthly_forecasts, how="inner"
)
pm1 = performance_metrics(monthly_forecasts_df, performance_lag=40, summarise=False)
_3_monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date=monthly_measure_from,
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=3,
)
pm2 = performance_metrics(_3_monthly_SARIMA[0], performance_lag=7, summarise=False)
_3_monthly_lagged_av_1 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=1,
    nsteps=3,
)
pm3 = performance_metrics(_3_monthly_lagged_av_1[0], performance_lag=7, summarise=False)
_3_monthly_lagged_av_3 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=3,
    nsteps=3,
)
pm4 = performance_metrics(_3_monthly_lagged_av_3[0], performance_lag=7, summarise=False)
_3_monthly_simple_growth = rolling_forecast_LY_perc_inc(
    time_series=ts_data_monthly, window_end_date=monthly_measure_from, nsteps=3
)
pm5 = performance_metrics(
    _3_monthly_simple_growth[0], performance_lag=7, summarise=False
)
_3_monthly_ensemble = rolling_forecast_ensemble(
    [_3_monthly_SARIMA, _3_monthly_lagged_av_1]
)
pm6 = performance_metrics(_3_monthly_ensemble[0], performance_lag=7, summarise=False)
100 % done
In [124]:
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_3_monthly_prophet_forecasts"
)
infile = open(file_path, "rb")
pm_prophet_3 = pickle.load(infile)
infile.close()
In [125]:
performance_merger([pm1, pm2, pm3, pm4, pm5, pm_prophet_3, pm6], "perc error")
Out[125]:
| | Daily | (0,1,1)(0,1,0)(52)_nsteps_12 | (0,1,0)(0,1,0)(12)_nsteps_3 | num_lags_1_nsteps_3 | num_lags_3_nsteps_3 | LY_growth_nsteps_3 | prophet_nsteps_12 | 2_Model_Ensemble |
|---|---|---|---|---|---|---|---|---|
| 0 | 2022-04-30 | 0.008070 | 0.013114 | 0.016558 | 0.120906 | 0.009585 | 0.007719 | 0.014836 |
| 1 | 2022-05-31 | 0.001587 | 0.012133 | 0.038955 | 0.141335 | 0.015353 | 0.002542 | 0.025544 |
| 2 | 2022-06-30 | 0.003918 | 0.029335 | 0.037863 | 0.045773 | 0.020596 | 0.008637 | 0.033599 |
| 3 | 2022-07-31 | 0.041859 | 0.055924 | 0.027824 | 0.062389 | 0.138324 | 0.031473 | 0.014050 |
| 4 | 2022-08-31 | 0.005589 | 0.043104 | 0.039566 | 0.078917 | 0.124444 | 0.004540 | 0.001769 |
| 5 | 2022-09-30 | 0.013250 | 0.014743 | 0.107063 | 0.133442 | 0.134589 | 0.004093 | 0.046160 |
| 6 | 2022-10-31 | 0.116619 | 0.031588 | 0.077994 | 0.034346 | 0.028391 | 0.121416 | 0.054791 |
A 3-month-out forecast
In [126]:
# take the monthly SARIMA model out 3 steps. Note this may not be the best monthly model for every series!
_3_monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date="2022-10-31",
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=3,
)
100 % done
In [127]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(title=series_name + " Forecasts by Month", xlabel="Date", ylabel=series_name)
ax = plt.plot(
    _3_monthly_SARIMA[0].iloc[:, 0].tail(20), _3_monthly_SARIMA[0].iloc[:, 1].tail(20)
)
ax = plt.plot(_3_monthly_SARIMA[1].iloc[:, 0], _3_monthly_SARIMA[1].iloc[:, 1])
plt.fill_between(
    _3_monthly_SARIMA[3].iloc[:, 0],
    _3_monthly_SARIMA[3].iloc[:, 1],
    _3_monthly_SARIMA[3].iloc[:, 2],
    color="r",
    alpha=0.1,
)
plt.show()
Long-term Forecasts
In [128]:
registered_users = read_file(user_data)
In [137]:
registered_users.head(5)  # registered users data must have a date format of dd/mm/yyyy. The headers must also be the same as below.
Out[137]:
| | Date | TotalRegisteredUsers |
|---|---|---|
| 0 | 31/12/2018 | 3280851.064 |
| 1 | 31/01/2019 | 4148936.170 |
| 2 | 28/02/2019 | 5004255.319 |
| 3 | 31/03/2019 | 5846808.511 |
| 4 | 30/04/2019 | 6676595.745 |
In [129]:
# get the dampened uptake data
uptake_data = dampen(registered_users, n=10)
/Users/administrator/opt/miniconda3/envs/forecast-lab/lib/python3.9/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
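dampen comes from dampner.py (the RuntimeWarning above is raised while it runs). Its exact method isn't shown in this notebook; as a hedged illustration of the general idea, one could fit a slowing-growth curve (here logarithmic, purely as an assumption) to the TotalRegisteredUsers history and extrapolate it, giving an uptake trend that flattens over time.

# Hypothetical illustration only -- the real dampen() may use a different curve and API.
import numpy as np
import pandas as pd

def dampen_sketch(users_df, n=10):
    df = users_df.copy()
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    t = np.arange(1, len(df) + 1)
    # fit users ~ a*ln(t) + b, a curve whose growth slows as t increases
    a, b = np.polyfit(np.log(t), df["TotalRegisteredUsers"], 1)
    future_t = np.arange(len(df) + 1, len(df) + n + 1)
    future_dates = pd.date_range(df["Date"].iloc[-1], periods=n + 1, freq="M")[1:]
    return pd.DataFrame({"Date": future_dates, "projected_users": a * np.log(future_t) + b})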
In [130]:
uptake_data = uptake_data.drop(columns=["TotalRegisteredUsers"])
In [131]:
# take the monthly SARIMA model out 25 steps. Note this may not be the best monthly model for every series!
_24_monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date="2022-10-31",
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=25,
)
100 % done
In [132]:
# dampen the forecasts to account for the fact that app uptake will slow over time
trend_adj_forecast = trend_adjuster(
    _24_monthly_SARIMA[0], _24_monthly_SARIMA[1], uptake_data
)
In [133]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(title=series_name + " Forecasts by Month", xlabel="Date", ylabel=series_name)
ax = plt.plot(
    _24_monthly_SARIMA[0].iloc[:, 0].tail(20), _24_monthly_SARIMA[0].iloc[:, 1].tail(20)
)
ax = plt.plot(_24_monthly_SARIMA[1].iloc[:, 0], _24_monthly_SARIMA[1].iloc[:, 1])
ax = plt.plot(trend_adj_forecast.iloc[:, 0], trend_adj_forecast.iloc[:, 1])
plt.fill_between(
    _24_monthly_SARIMA[3].iloc[:, 0],
    _24_monthly_SARIMA[3].iloc[:, 1],
    _24_monthly_SARIMA[3].iloc[:, 2],
    color="r",
    alpha=0.1,
)
plt.show()
In [139]:
# write all the above code to a .py file, but not this particular cell
!jupyter nbconvert --to script pipeline_dummy.ipynb
with open("pipeline_dummy.py", "r") as f:
    lines = f.readlines()
with open("pipeline_dummy.py", "w") as f:
    for line in lines:
        if "nbconvert --to script" in line:
            break
        else:
            f.write(line)
[NbConvertApp] Converting notebook pipeline_dummy.ipynb to script
[NbConvertApp] Writing 19043 bytes to pipeline_dummy.py