Pipeline
In [85]:
import os, sys, importlib
In [86]:
# need this so we can import from different folders
sys.path.append("../Functions")
In [87]:
sys.path.append("../Models")
In [88]:
from preprocessing_functions import *
from performance_functions import *
from aggregate_functions import *
from dampner import dampen, trend_adjuster
In [89]:
from SARIMAX import rolling_forecast_SARIMA
from simple_growth import rolling_forecast_LY_perc_inc
from lagged_average import rolling_forecast_lagged_average
from holt_winters import rolling_forecast_holt_winters
from ensemble import rolling_forecast_ensemble
In [90]:
import pickle
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'
# suppress SettingWithCopyWarning. In future, use .loc instead of .iloc when editing DataFrames
In [91]:
# the series we want to run the pipeline over
series_name = "dummy_series"
# the data files
data = "dummy_data.csv"
user_data = "TotalRegisteredUsers_dummy.csv"
In [92]:
app_data = read_file(data)
In [135]:
app_data.head(5)  # time series data must have a date format of dd/mm/yyyy
Out[135]:
| | Daily | dummy_series |
|---|---|---|
| 0 | 01/01/2021 | 46.323724 |
| 1 | 02/01/2021 | 9.898171 |
| 2 | 03/01/2021 | 10.763911 |
| 3 | 04/01/2021 | 319.980215 |
| 4 | 05/01/2021 | 276.882993 |
In [93]:
# preprocess and resample into weekly data
ts_data = preprocess(app_data, series_name)
ts_data_weekly = resample(ts_data, "W")
ts_data_weekly = ts_data_weekly.iloc[:-1]  # drop the last (possibly incomplete) week
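preprocess and resample are repo helpers (from preprocessing_functions), so their exact behaviour isn't shown here. As a rough sketch of what this step is assumed to do with the daily dd/mm/yyyy data (the column names and the weekly-sum choice are assumptions, not the repo implementation):

# Hypothetical sketch only -- not the repo implementation.
import pandas as pd

def preprocess_sketch(df, series_name):
    # parse dd/mm/yyyy dates and keep just the date + series columns
    out = df[["Daily", series_name]].copy()
    out["Daily"] = pd.to_datetime(out["Daily"], dayfirst=True)
    return out.sort_values("Daily").reset_index(drop=True)

def resample_sketch(df, freq="W"):
    # sum the daily values into week-ending (Sunday) buckets
    return df.set_index("Daily").resample(freq).sum().reset_index()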
In [94]:
# where we want to start assessing model performance from
weekly_measure_from = "2022-03-27"
monthly_measure_from = "2022-03-31"
Which is the best 1-week-out model?
In [95]:
# run the models for 1, 2, 3 and 4 week out forecasts and add the outputs to a list
weekly_model_list_nstep_4 = []
weekly_model_list_nstep_3 = []
weekly_model_list_nstep_2 = []
weekly_model_list_nstep_1 = []
weekly_model_list_nstep_i = [
    weekly_model_list_nstep_1,
    weekly_model_list_nstep_2,
    weekly_model_list_nstep_3,
    weekly_model_list_nstep_4,
]
In [96]:
for i in range(4):
    nsteps = i + 1
    SARIMA = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 1, 0, 1, 0, 52],
        bh_adj=False,
        bh_scale=False,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA)
    SARIMA_bh = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 1, 0, 1, 0, 52],
        bh_adj=True,
        bh_scale=False,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA_bh)
    SARIMA_bh_adj = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 1, 0, 1, 0, 52],
        bh_adj=True,
        bh_scale=True,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA_bh_adj)
    SARIMA_basic = rolling_forecast_SARIMA(
        ts_data_weekly,
        window_end_date=weekly_measure_from,
        model_params=[0, 1, 0, 0, 1, 0, 52],
        bh_adj=False,
        bh_scale=False,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(SARIMA_basic)
    lagged_av_1 = rolling_forecast_lagged_average(
        time_series=ts_data_weekly,
        window_end_date=weekly_measure_from,
        lag=1,
        num_lags=1,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(lagged_av_1)
    lagged_av_3 = rolling_forecast_lagged_average(
        time_series=ts_data_weekly,
        window_end_date=weekly_measure_from,
        lag=1,
        num_lags=3,
        nsteps=nsteps,
    )
    weekly_model_list_nstep_i[i].append(lagged_av_3)
    simple_growth = rolling_forecast_LY_perc_inc(
        time_series=ts_data_weekly, window_end_date=weekly_measure_from, nsteps=nsteps
    )
    weekly_model_list_nstep_i[i].append(simple_growth)
    ensemble = rolling_forecast_ensemble([SARIMA, lagged_av_1])
    weekly_model_list_nstep_i[i].append(ensemble)
100 % done
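For reference, the lagged-average baselines above (num_lags_1, num_lags_3) come from lagged_average.py; this is a minimal sketch of the idea, assuming the forecast is simply the mean of the last num_lags observed values spaced lag periods apart (so num_lags=1 just repeats the previous value). It is an illustration, not the repo implementation.

# Hypothetical illustration of the lagged-average idea -- not the repo implementation.
def lagged_average_sketch(values, lag=1, num_lags=3):
    # values: list or Series of historical observations, most recent last
    recent = [values[-lag * k] for k in range(1, num_lags + 1)]
    return sum(recent) / num_lags

# e.g. lagged_average_sketch([100, 110, 120], lag=1, num_lags=3) -> 110.0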
In [97]:
number_of_weekly_models = len(weekly_model_list_nstep_i[0])
In [98]:
# read in the equivalent Prophet outputs. Note: the initialisation params (e.g. series_name,
# weekly_measure_from = '2022-03-27', etc.) also need to be set in the Prophet script before
# running it to produce these outputs.
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_weekly_prophet_nsteps_1"
)
infile = open(file_path, "rb")
weekly_prophet = pickle.load(infile)
infile.close()
weekly_model_list_nstep_i[0].append(weekly_prophet)
In [99]:
# see which is the best 1 week out model
performance_df = pd.DataFrame()
for model in weekly_model_list_nstep_i[0]:
    performance_df = pd.concat(
        [performance_df, performance_metrics(model[0], performance_lag=12)], axis=0
    )
performance_df = performance_df.reset_index().drop("index", axis=1)
performance_df
Out[99]:
| | model | mae | rmse | mape |
|---|---|---|---|---|
| 0 | (0,1,1)(0,1,0)(52)_nsteps_1 | 124.586537 | 146.311864 | 0.027603 |
| 1 | (0,1,1)(0,1,0)(52)_bh_nsteps_1 | 98.696309 | 124.022797 | 0.021930 |
| 2 | (0,1,1)(0,1,0)(52)_bh_scale_nsteps_1 | 99.229810 | 130.201062 | 0.022068 |
| 3 | (0,1,0)(0,1,0)(52)_nsteps_1 | 134.268798 | 159.653457 | 0.029609 |
| 4 | num_lags_1_nsteps_1 | 89.483896 | 100.889996 | 0.019522 |
| 5 | num_lags_3_nsteps_1 | 88.406553 | 111.969744 | 0.019290 |
| 6 | LY_growth_nsteps_1 | 280.278605 | 314.921135 | 0.061548 |
| 7 | 2_Model_Ensemble | 92.376607 | 106.278228 | 0.020344 |
| 8 | prophet_nsteps_1 | 78.369906 | 96.137857 | 0.017295 |
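The mae, rmse and mape columns are the standard error metrics (MAPE is reported as a fraction, so 0.0276 is roughly 2.76%). performance_metrics is a repo helper whose exact windowing isn't shown here; this is a minimal sketch of the calculations themselves, given aligned actuals y and forecasts yhat.

import numpy as np

def error_metrics_sketch(y, yhat):
    # y: actuals, yhat: forecasts, aligned arrays of equal length
    y, yhat = np.asarray(y, dtype=float), np.asarray(yhat, dtype=float)
    err = yhat - y
    return {
        "mae": np.mean(np.abs(err)),
        "rmse": np.sqrt(np.mean(err ** 2)),
        "mape": np.mean(np.abs(err) / np.abs(y)),
    }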
Historic performance for some of the weekly models
In [100]:
# get all the historic forecasts into a big dataframe
validation_df = ts_data_weekly.copy()
for model in weekly_model_list_nstep_i[0]:
    validation_df[model[2]] = model[0][model[2]]
In [101]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(
    title=series_name + " Weekly Model Performance", xlabel="Date", ylabel=series_name
)
ax1 = plt.plot(validation_df.iloc[:, 0], validation_df[series_name], color="blue")
ax4 = plt.plot(
    validation_df.iloc[:, 0], validation_df["num_lags_3_nsteps_1"], color="orange"
)
ax5 = plt.plot(
    validation_df.iloc[:, 0], validation_df["(0,1,1)(0,1,0)(52)_nsteps_1"], color="green"
)
plt.show()
4-week-out forecast performance change with each week gained
In [102]:
# see how a 4 week out forecast improves with 0, 1, 2 and 3 weeks of data gained.
# weekly_model_list_out_4_nstep_4 means we're forecasting nsteps = 4 ahead, so this is where the 0-weeks-gained results go.
# weekly_model_list_out_4_nstep_3 means we're forecasting nsteps = 3 ahead, so this is where the 1-week-gained results go.
# etc...
weekly_model_list_out_4_nstep_4 = []
weekly_model_list_out_4_nstep_3 = []
weekly_model_list_out_4_nstep_2 = []
weekly_model_list_out_4_nstep_1 = []
weekly_model_list_out_4_nstep_i = [
    weekly_model_list_out_4_nstep_1,
    weekly_model_list_out_4_nstep_2,
    weekly_model_list_out_4_nstep_3,
    weekly_model_list_out_4_nstep_4,
]
In [103]:
for i in range(4):
    for j in range(number_of_weekly_models):
        nsteps = i + 1
        weekly_model_list_out_4_nstep_i[i].append(
            n_weeks_forecast(
                weekly_model_list_nstep_i[i][j][0], nsteps=nsteps, n_week_sum=4
            )
        )
In [104]:
# read in the equivalent Prophet outputs. As before, the initialisation params (e.g. series_name,
# weekly_measure_from = '2022-03-27', etc.) also need to be set in the Prophet script before
# running it to produce these outputs.
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_weekly_prophet_out_4_nsteps_i"
)
infile = open(file_path, "rb")
weekly_prophet_out_4_nsteps_i = pickle.load(infile)
infile.close()
In [105]:
for i in range(len(weekly_model_list_out_4_nstep_i)):
    weekly_model_list_out_4_nstep_i[i].append(weekly_prophet_out_4_nsteps_i[i])
In [106]:
# for each of the weekly_model_list_out_4_nstep_i, create a model performance dataframe
performance_df_out_4_nsteps_1 = pd.DataFrame()
performance_df_out_4_nsteps_2 = pd.DataFrame()
performance_df_out_4_nsteps_3 = pd.DataFrame()
performance_df_out_4_nsteps_4 = pd.DataFrame()
performance_df_out_4_nsteps_i = [
    performance_df_out_4_nsteps_1,
    performance_df_out_4_nsteps_2,
    performance_df_out_4_nsteps_3,
    performance_df_out_4_nsteps_4,
]
In [107]:
for i in range(len(performance_df_out_4_nsteps_i)):
    for model in weekly_model_list_out_4_nstep_i[i]:
        performance_df_out_4_nsteps_i[i] = pd.concat(
            [
                performance_df_out_4_nsteps_i[i],
                performance_metrics(model, performance_lag=12),
            ],
            axis=0,
        )
    performance_df_out_4_nsteps_i[i] = (
        performance_df_out_4_nsteps_i[i].reset_index().drop("index", axis=1)
    )
In [108]:
# as an example, here is the model summary with 0 weeks of data gained (same format as before)
performance_df_out_4_nsteps_i[3]
Out[108]:
| | model | mae | rmse | mape |
|---|---|---|---|---|
| 0 | (0,1,1)(0,1,0)(52)_nsteps_4 | 353.171879 | 408.842223 | 0.018768 |
| 1 | (0,1,1)(0,1,0)(52)_bh_nsteps_4 | 191.792166 | 223.918597 | 0.010198 |
| 2 | (0,1,1)(0,1,0)(52)_bh_scale_nsteps_4 | 200.388121 | 243.450806 | 0.010673 |
| 3 | (0,1,0)(0,1,0)(52)_nsteps_4 | 545.775417 | 641.017678 | 0.029175 |
| 4 | num_lags_1_nsteps_4 | 396.441206 | 529.114598 | 0.021258 |
| 5 | num_lags_3_nsteps_4 | 455.118347 | 538.225582 | 0.024047 |
| 6 | LY_growth_nsteps_4 | 849.226149 | 1023.395675 | 0.045192 |
| 7 | 2_Model_Ensemble | 307.091645 | 433.848036 | 0.016471 |
| 8 | prophet_nsteps_4 | 120.931137 | 142.919318 | 0.006422 |
In [109]:
# create a df of the mape values for each additional week gained
mape_df = error_df(
    model_performance_dfs=list(reversed(performance_df_out_4_nsteps_i)),
    number_of_known_weeks=[0, 1, 2, 3],
    metric="mape",
)
In [110]:
# create a df of the rmse values for each additional week gained
rmse_df = error_df(
    model_performance_dfs=list(reversed(performance_df_out_4_nsteps_i)),
    number_of_known_weeks=[0, 1, 2, 3],
    metric="rmse",
)
In [111]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(
    title="Weekly Models MAPE Reduction", xlabel="number of known weeks", ylabel="MAPE"
)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
ax.xaxis.set_major_locator(mtick.MultipleLocator(1))
col_names = mape_df.columns.values.tolist()
col_names.remove("number of known weeks")
for col in col_names:
    plt.plot(mape_df["number of known weeks"], mape_df[col], label=col)
plt.legend()
plt.show()
In [112]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(
    title="Weekly Models RMSE Reduction", xlabel="number of known weeks", ylabel="RMSE"
)
ax.xaxis.set_major_locator(mtick.MultipleLocator(1))
col_names = rmse_df.columns.values.tolist()
col_names.remove("number of known weeks")
for col in col_names:
    plt.plot(rmse_df["number of known weeks"], rmse_df[col], label=col)
plt.legend()
plt.show()
A look at the Christmas period by week
In [113]:
# take the weekly SARIMA model out 13 steps. Note this may not be the best weekly model for every series!
weekly_sarima_nsteps_13 = rolling_forecast_SARIMA(
    ts_data_weekly,
    window_end_date="2022-10-30",
    model_params=[0, 1, 1, 0, 1, 0, 52],
    bh_adj=False,
    bh_scale=False,
    nsteps=13,
)
100 % done
In [114]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(title=series_name + " Forecasts by Week", xlabel="Date", ylabel=series_name)
ax = plt.plot(
    weekly_sarima_nsteps_13[0].iloc[:, 0].tail(20),
    weekly_sarima_nsteps_13[0].iloc[:, 1].tail(20),
)
ax = plt.plot(
    weekly_sarima_nsteps_13[1].iloc[:, 0], weekly_sarima_nsteps_13[1].iloc[:, 1]
)
plt.fill_between(
    weekly_sarima_nsteps_13[3].iloc[:, 0],
    weekly_sarima_nsteps_13[3].iloc[:, 1],
    weekly_sarima_nsteps_13[3].iloc[:, 2],
    color="r",
    alpha=0.1,
)
plt.show()
Which is the best 1-month-out model?
In [115]:
# use the weekly SARIMA model to make monthly forecasts (4 weeks ~ 1 month)
weekly_SARIMA_nsteps_4 = rolling_forecast_SARIMA(
    ts_data_weekly,
    window_end_date=weekly_measure_from,
    model_params=[0, 1, 1, 0, 1, 0, 52],
    bh_adj=False,
    bh_scale=False,
    nsteps=4,
)
100 % done
In [116]:
# to aggregate weekly forecasts into monthly ones, we need to know how to distribute the weeks
# which overlap two months into each of those months correctly
distr = weekly_distribution(app_data, series_name)
/Users/administrator/Documents/Milans_Forecasting_Repo/src/Model Selection/../Functions/aggregate_functions.py:35: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
  dummy_data = dummy_data.groupby(
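weekly_distribution comes from aggregate_functions.py; as a hedged sketch of the idea, it is assumed to use the daily history to work out what fraction of each week's total falls in each calendar month, so that a week straddling a month boundary can later be split pro-rata. The column names and output shape below are illustrative assumptions, not the repo implementation.

# Hypothetical sketch only -- not the repo implementation.
import pandas as pd

def weekly_distribution_sketch(daily_df, series_name):
    df = daily_df[["Daily", series_name]].copy()
    df["Daily"] = pd.to_datetime(df["Daily"], dayfirst=True)
    df["week"] = df["Daily"].dt.to_period("W")
    df["month"] = df["Daily"].dt.to_period("M")
    # share of each week's total that lands in each month
    by_week_month = df.groupby(["week", "month"])[series_name].sum()
    return by_week_month / by_week_month.groupby(level="week").transform("sum")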
In [117]:
# aggregate the weekly forecasts into monthly ones
start = weekly_SARIMA_nsteps_4[0].iloc[:, 2].first_valid_index()
end = len(weekly_SARIMA_nsteps_4[0])
weekly_SARIMA_nsteps_4_trimmed = (
    weekly_SARIMA_nsteps_4[0].iloc[start:end].drop(columns=series_name, axis=1)
)
monthly_forecasts = weekly_to_monthly_summary(weekly_SARIMA_nsteps_4_trimmed, distr)
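weekly_to_monthly_summary is also a repo helper; a hedged sketch of the aggregation it is assumed to perform, using the per-week shares from the distribution sketch above (input column names are illustrative):

# Hypothetical sketch only -- not the repo implementation.
import pandas as pd

def weekly_to_monthly_sketch(weekly_forecasts, distr):
    # weekly_forecasts: DataFrame with a "week" Period column and a "forecast" column
    # distr: Series indexed by (week, month) giving each week's share in that month
    pieces = []
    for (week, month), share in distr.items():
        match = weekly_forecasts.loc[weekly_forecasts["week"] == week, "forecast"]
        if not match.empty:
            pieces.append({"month": month, "value": match.iloc[0] * share})
    return pd.DataFrame(pieces).groupby("month", as_index=False)["value"].sum()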
In [118]:
# see the performance by month
monthly_forecasts_df = resample(preprocess(app_data, series_name), "M").merge(
    monthly_forecasts, how="inner"
)
pm1 = performance_metrics(monthly_forecasts_df, performance_lag=40, summarise=False)
In [119]:
# resample the data as monthly
ts_data_monthly = resample(preprocess(app_data, series_name), "M")
In [120]:
# run some other monthly models as a point of comparison and get their respective performances by month
monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date=monthly_measure_from,
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=1,
)
pm2 = performance_metrics(monthly_SARIMA[0], performance_lag=7, summarise=False)
monthly_lagged_av_1 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=1,
    nsteps=1,
)
pm3 = performance_metrics(monthly_lagged_av_1[0], performance_lag=7, summarise=False)
monthly_lagged_av_3 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=3,
    nsteps=1,
)
pm4 = performance_metrics(monthly_lagged_av_3[0], performance_lag=7, summarise=False)
monthly_simple_growth = rolling_forecast_LY_perc_inc(
    time_series=ts_data_monthly, window_end_date=monthly_measure_from, nsteps=1
)
pm5 = performance_metrics(monthly_simple_growth[0], performance_lag=7, summarise=False)
monthly_ensemble = rolling_forecast_ensemble([monthly_SARIMA, monthly_lagged_av_1])
pm6 = performance_metrics(monthly_ensemble[0], performance_lag=7, summarise=False)
100 % done
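rolling_forecast_ensemble (from ensemble.py) combines the outputs of the models it is given. As a minimal sketch, it is assumed here to take a simple average of the point forecasts, with each model output shaped like (forecast_df, ..., model_name, ...) as used elsewhere in this notebook; the real helper may differ.

# Hypothetical sketch of a two-model average -- not the repo implementation.
def ensemble_sketch(model_outputs, date_col="Daily"):
    merged = model_outputs[0][0][[date_col]].copy()
    names = []
    for output in model_outputs:
        forecast_df, name = output[0], output[2]
        merged[name] = forecast_df[name].values
        names.append(name)
    # simple (unweighted) average of the component forecasts
    merged["2_Model_Ensemble"] = merged[names].mean(axis=1)
    return merged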
In [121]:
# read in the equivalent Prophet outputs. As before, the initialisation params (e.g. series_name,
# weekly_measure_from = '2022-03-27', etc.) also need to be set in the Prophet script before
# running it to produce these outputs.
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_monthly_prophet_forecasts"
)
infile = open(file_path, "rb")
pm_prophet = pickle.load(infile)
infile.close()
In [122]:
performance_merger([pm1, pm2, pm3, pm4, pm5, pm_prophet, pm6], "perc error")
Out[122]:
| | Daily | (0,1,1)(0,1,0)(52)_nsteps_4 | (0,1,0)(0,1,0)(12)_nsteps_1 | num_lags_1_nsteps_1 | num_lags_3_nsteps_1 | LY_growth_nsteps_1 | prophet_nsteps_4 | 2_Model_Ensemble |
|---|---|---|---|---|---|---|---|---|
| 0 | 2022-04-30 | 0.004359 | 0.013114 | 0.016558 | 0.120906 | 0.009585 | 0.006213 | 0.014836 |
| 1 | 2022-05-31 | 0.008269 | 0.024531 | 0.054608 | 0.103234 | 0.005714 | 0.001580 | 0.039570 |
| 2 | 2022-06-30 | 0.000893 | 0.042438 | 0.079931 | 0.046250 | 0.005164 | 0.009493 | 0.061184 |
| 3 | 2022-07-31 | 0.013536 | 0.055924 | 0.027824 | 0.062389 | 0.138324 | 0.025953 | 0.014050 |
| 4 | 2022-08-31 | 0.044651 | 0.013458 | 0.011424 | 0.057883 | 0.016108 | 0.007904 | 0.012441 |
| 5 | 2022-09-30 | 0.000153 | 0.031160 | 0.064928 | 0.083028 | 0.011586 | 0.002777 | 0.048044 |
| 6 | 2022-10-31 | 0.109748 | 0.031588 | 0.077994 | 0.034346 | 0.028391 | 0.121028 | 0.054791 |
Which is the best 3-month-out model?
In [123]:
# do exactly the same as before but now forecasting 3 months ahead instead of 1
weekly_SARIMA_nsteps_12 = rolling_forecast_SARIMA(
    ts_data_weekly,
    window_end_date=weekly_measure_from,
    model_params=[0, 1, 1, 0, 1, 0, 52],
    bh_adj=False,
    bh_scale=False,
    nsteps=12,
)
start = weekly_SARIMA_nsteps_12[0].iloc[:, 2].first_valid_index()
end = len(weekly_SARIMA_nsteps_12[0])
weekly_SARIMA_nsteps_12_trimmed = (
    weekly_SARIMA_nsteps_12[0].iloc[start:end].drop(columns=series_name, axis=1)
)
monthly_forecasts = weekly_to_monthly_summary(weekly_SARIMA_nsteps_12_trimmed, distr)
monthly_forecasts_df = resample(preprocess(app_data, series_name), "M").merge(
    monthly_forecasts, how="inner"
)
pm1 = performance_metrics(monthly_forecasts_df, performance_lag=40, summarise=False)
_3_monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date=monthly_measure_from,
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=3,
)
pm2 = performance_metrics(_3_monthly_SARIMA[0], performance_lag=7, summarise=False)
_3_monthly_lagged_av_1 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=1,
    nsteps=3,
)
pm3 = performance_metrics(_3_monthly_lagged_av_1[0], performance_lag=7, summarise=False)
_3_monthly_lagged_av_3 = rolling_forecast_lagged_average(
    time_series=ts_data_monthly,
    window_end_date=monthly_measure_from,
    lag=1,
    num_lags=3,
    nsteps=3,
)
pm4 = performance_metrics(_3_monthly_lagged_av_3[0], performance_lag=7, summarise=False)
_3_monthly_simple_growth = rolling_forecast_LY_perc_inc(
    time_series=ts_data_monthly, window_end_date=monthly_measure_from, nsteps=3
)
pm5 = performance_metrics(
    _3_monthly_simple_growth[0], performance_lag=7, summarise=False
)
_3_monthly_ensemble = rolling_forecast_ensemble(
    [_3_monthly_SARIMA, _3_monthly_lagged_av_1]
)
pm6 = performance_metrics(_3_monthly_ensemble[0], performance_lag=7, summarise=False)
100 % done
In [124]:
file_path = (
    os.path.split(os.getcwd())[0]
    + "/Prophet Scripts/"
    + series_name
    + "_3_monthly_prophet_forecasts"
)
infile = open(file_path, "rb")
pm_prophet_3 = pickle.load(infile)
infile.close()
In [125]:
performance_merger([pm1, pm2, pm3, pm4, pm5, pm_prophet_3, pm6], "perc error")
Out[125]:
| | Daily | (0,1,1)(0,1,0)(52)_nsteps_12 | (0,1,0)(0,1,0)(12)_nsteps_3 | num_lags_1_nsteps_3 | num_lags_3_nsteps_3 | LY_growth_nsteps_3 | prophet_nsteps_12 | 2_Model_Ensemble |
|---|---|---|---|---|---|---|---|---|
| 0 | 2022-04-30 | 0.008070 | 0.013114 | 0.016558 | 0.120906 | 0.009585 | 0.007719 | 0.014836 |
| 1 | 2022-05-31 | 0.001587 | 0.012133 | 0.038955 | 0.141335 | 0.015353 | 0.002542 | 0.025544 |
| 2 | 2022-06-30 | 0.003918 | 0.029335 | 0.037863 | 0.045773 | 0.020596 | 0.008637 | 0.033599 |
| 3 | 2022-07-31 | 0.041859 | 0.055924 | 0.027824 | 0.062389 | 0.138324 | 0.031473 | 0.014050 |
| 4 | 2022-08-31 | 0.005589 | 0.043104 | 0.039566 | 0.078917 | 0.124444 | 0.004540 | 0.001769 |
| 5 | 2022-09-30 | 0.013250 | 0.014743 | 0.107063 | 0.133442 | 0.134589 | 0.004093 | 0.046160 |
| 6 | 2022-10-31 | 0.116619 | 0.031588 | 0.077994 | 0.034346 | 0.028391 | 0.121416 | 0.054791 |
A 3-month-out forecast
In [126]:
# take the monthly SARIMA model out 3 steps. Note this may not be the best monthly model for every series!
_3_monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date="2022-10-31",
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=3,
)
100 % done
In [127]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(title=series_name + " Forecasts by Month", xlabel="Date", ylabel=series_name)
ax = plt.plot(
    _3_monthly_SARIMA[0].iloc[:, 0].tail(20), _3_monthly_SARIMA[0].iloc[:, 1].tail(20)
)
ax = plt.plot(_3_monthly_SARIMA[1].iloc[:, 0], _3_monthly_SARIMA[1].iloc[:, 1])
plt.fill_between(
    _3_monthly_SARIMA[3].iloc[:, 0],
    _3_monthly_SARIMA[3].iloc[:, 1],
    _3_monthly_SARIMA[3].iloc[:, 2],
    color="r",
    alpha=0.1,
)
plt.show()
Long-term Forecasts
In [128]:
registered_users = read_file(user_data)
In [137]:
registered_users.head(5)  # registered users data must have a date format of dd/mm/yyyy. The headers must also be the same as below.
Out[137]:
| | Date | TotalRegisteredUsers |
|---|---|---|
| 0 | 31/12/2018 | 3280851.064 |
| 1 | 31/01/2019 | 4148936.170 |
| 2 | 28/02/2019 | 5004255.319 |
| 3 | 31/03/2019 | 5846808.511 |
| 4 | 30/04/2019 | 6676595.745 |
In [129]:
# get the dampened uptake data
uptake_data = dampen(registered_users, n=10)
/Users/administrator/opt/miniconda3/envs/forecast-lab/lib/python3.9/site-packages/pandas/core/arraylike.py:402: RuntimeWarning: invalid value encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)
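dampen comes from dampner.py (the RuntimeWarning above is raised while it runs). Its exact method isn't shown in this notebook; as a hedged illustration of the general idea, one could fit a slowing-growth curve (here logarithmic, purely as an assumption) to the TotalRegisteredUsers history and extrapolate it, giving an uptake trend that flattens over time.

# Hypothetical illustration only -- the real dampen() may use a different curve and API.
import numpy as np
import pandas as pd

def dampen_sketch(users_df, n=10):
    df = users_df.copy()
    df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
    t = np.arange(1, len(df) + 1)
    # fit users ~ a*ln(t) + b, a curve whose growth slows as t increases
    a, b = np.polyfit(np.log(t), df["TotalRegisteredUsers"], 1)
    future_t = np.arange(len(df) + 1, len(df) + n + 1)
    future_dates = pd.date_range(df["Date"].iloc[-1], periods=n + 1, freq="M")[1:]
    return pd.DataFrame({"Date": future_dates, "projected_users": a * np.log(future_t) + b})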
In [130]:
uptake_data = uptake_data.drop(columns=["TotalRegisteredUsers"])
In [131]:
# take the monthly SARIMA model out 25 steps. Note this may not be the best monthly model for every series!
_24_monthly_SARIMA = rolling_forecast_SARIMA(
    ts_data_monthly,
    window_end_date="2022-10-31",
    model_params=[0, 1, 0, 0, 1, 0, 12],
    bh_adj=False,
    bh_scale=False,
    nsteps=25,
)
100 % done
In [132]:
# dampen the forecasts to account for the fact that app uptake will slow over time
trend_adj_forecast = trend_adjuster(
    _24_monthly_SARIMA[0], _24_monthly_SARIMA[1], uptake_data
)
In [133]:
fig, ax = plt.subplots(figsize=(12, 5))
ax.set(title=series_name + " Forecasts by Month", xlabel="Date", ylabel=series_name)
ax = plt.plot(
    _24_monthly_SARIMA[0].iloc[:, 0].tail(20), _24_monthly_SARIMA[0].iloc[:, 1].tail(20)
)
ax = plt.plot(_24_monthly_SARIMA[1].iloc[:, 0], _24_monthly_SARIMA[1].iloc[:, 1])
ax = plt.plot(trend_adj_forecast.iloc[:, 0], trend_adj_forecast.iloc[:, 1])
plt.fill_between(
    _24_monthly_SARIMA[3].iloc[:, 0],
    _24_monthly_SARIMA[3].iloc[:, 1],
    _24_monthly_SARIMA[3].iloc[:, 2],
    color="r",
    alpha=0.1,
)
plt.show()
In [139]:
# write all the above code to a .py file, but not this particular cell
!jupyter nbconvert --to script pipeline_dummy.ipynb
with open("pipeline_dummy.py", "r") as f:
    lines = f.readlines()
with open("pipeline_dummy.py", "w") as f:
    for line in lines:
        if "nbconvert --to script" in line:
            break
        else:
            f.write(line)
[NbConvertApp] Converting notebook pipeline_dummy.ipynb to script
[NbConvertApp] Writing 19043 bytes to pipeline_dummy.py