/ TIMESERIES

Time Series Analysis - ARIMA

ARIMA Theory

import pandas as pd
import numpy as np
%matplotlib inline

df1 = pd.read_csv('Data/airline_passengers.csv', index_col='Month', parse_dates=True)
df1.index.freq = 'MS'

df2 = pd.read_csv('Data/DailyTotalFemaleBirths.csv', index_col='Date', parse_dates=True)
df2.index.freq = 'D'
from pmdarima import auto_arima
import warnings
warnings.filterwarnings('ignore')
stepwise_fit = auto_arima(df2['Births'], start_p=0, start_q=0, max_p=6, max_q=3, seasonal=False, trace=True)
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=2650.760, Time=0.01 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=2565.234, Time=0.04 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=2463.584, Time=0.07 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=2648.768, Time=0.01 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=2460.154, Time=0.13 sec
 ARIMA(2,1,1)(0,0,0)[0] intercept   : AIC=2461.271, Time=0.18 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.34 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=2460.722, Time=0.14 sec
 ARIMA(2,1,0)(0,0,0)[0] intercept   : AIC=2536.154, Time=0.09 sec
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=2462.864, Time=0.44 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=2459.074, Time=0.05 sec
 ARIMA(0,1,1)(0,0,0)[0]             : AIC=2462.221, Time=0.02 sec
 ARIMA(1,1,0)(0,0,0)[0]             : AIC=2563.261, Time=0.02 sec
 ARIMA(2,1,1)(0,0,0)[0]             : AIC=2460.367, Time=0.08 sec
 ARIMA(1,1,2)(0,0,0)[0]             : AIC=2460.427, Time=0.13 sec
 ARIMA(0,1,2)(0,0,0)[0]             : AIC=2459.571, Time=0.04 sec
 ARIMA(2,1,0)(0,0,0)[0]             : AIC=2534.205, Time=0.03 sec
 ARIMA(2,1,2)(0,0,0)[0]             : AIC=2462.366, Time=0.21 sec

Best model:  ARIMA(1,1,1)(0,0,0)[0]          
Total fit time: 2.041 seconds
stepwise_fit.summary()
SARIMAX Results
Dep. Variable: y No. Observations: 365
Model: SARIMAX(1, 1, 1) Log Likelihood -1226.537
Date: Fri, 20 Nov 2020 AIC 2459.074
Time: 14:27:52 BIC 2470.766
Sample: 0 HQIC 2463.721
- 365
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.1252 0.060 2.097 0.036 0.008 0.242
ma.L1 -0.9624 0.017 -56.429 0.000 -0.996 -0.929
sigma2 49.1512 3.250 15.122 0.000 42.781 55.522
Ljung-Box (Q): 37.21 Jarque-Bera (JB): 25.33
Prob(Q): 0.60 Prob(JB): 0.00
Heteroskedasticity (H): 0.96 Skew: 0.57
Prob(H) (two-sided): 0.81 Kurtosis: 3.60



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

stepwise_fit = auto_arima(df1['Thousands of Passengers'], start_p=0, start_q=0, max_p=4,max_q=4, seasonal=True, trace=True, m=12)
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(1,1,1)[12]             : AIC=1032.128, Time=0.14 sec
 ARIMA(0,1,0)(0,1,0)[12]             : AIC=1031.508, Time=0.01 sec
 ARIMA(1,1,0)(1,1,0)[12]             : AIC=1020.393, Time=0.06 sec
 ARIMA(0,1,1)(0,1,1)[12]             : AIC=1021.003, Time=0.10 sec
 ARIMA(1,1,0)(0,1,0)[12]             : AIC=1020.393, Time=0.02 sec
 ARIMA(1,1,0)(2,1,0)[12]             : AIC=1019.239, Time=0.19 sec
 ARIMA(1,1,0)(2,1,1)[12]             : AIC=inf, Time=1.14 sec
 ARIMA(1,1,0)(1,1,1)[12]             : AIC=1020.493, Time=0.21 sec
 ARIMA(0,1,0)(2,1,0)[12]             : AIC=1032.120, Time=0.12 sec
 ARIMA(2,1,0)(2,1,0)[12]             : AIC=1021.120, Time=0.23 sec
 ARIMA(1,1,1)(2,1,0)[12]             : AIC=1021.032, Time=0.32 sec
 ARIMA(0,1,1)(2,1,0)[12]             : AIC=1019.178, Time=0.18 sec
 ARIMA(0,1,1)(1,1,0)[12]             : AIC=1020.425, Time=0.07 sec
 ARIMA(0,1,1)(2,1,1)[12]             : AIC=inf, Time=1.11 sec
 ARIMA(0,1,1)(1,1,1)[12]             : AIC=1020.327, Time=0.25 sec
 ARIMA(0,1,2)(2,1,0)[12]             : AIC=1021.148, Time=0.20 sec
 ARIMA(1,1,2)(2,1,0)[12]             : AIC=1022.805, Time=0.38 sec
 ARIMA(0,1,1)(2,1,0)[12] intercept   : AIC=1021.017, Time=0.34 sec

Best model:  ARIMA(0,1,1)(2,1,0)[12]          
Total fit time: 5.096 seconds
stepwise_fit.summary()
SARIMAX Results
Dep. Variable: y No. Observations: 144
Model: SARIMAX(0, 1, 1)x(2, 1, [], 12) Log Likelihood -505.589
Date: Fri, 20 Nov 2020 AIC 1019.178
Time: 14:27:58 BIC 1030.679
Sample: 0 HQIC 1023.851
- 144
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ma.L1 -0.3634 0.074 -4.945 0.000 -0.508 -0.219
ar.S.L12 -0.1239 0.090 -1.372 0.170 -0.301 0.053
ar.S.L24 0.1911 0.107 1.783 0.075 -0.019 0.401
sigma2 130.4480 15.527 8.402 0.000 100.016 160.880
Ljung-Box (Q): 53.71 Jarque-Bera (JB): 4.59
Prob(Q): 0.07 Prob(JB): 0.10
Heteroskedasticity (H): 2.70 Skew: 0.15
Prob(H) (two-sided): 0.00 Kurtosis: 3.87



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

ARMA and ARIMA

import pandas as pd
import numpy as np
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.arima_model import ARMA, ARIMA, ARMAResults, ARIMAResults
from statsmodels.tsa.statespace.sarimax import SARIMAX, SARIMAXResults
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima.arima import auto_arima
df1 = pd.read_csv('Data/DailyTotalFemaleBirths.csv', index_col='Date',parse_dates=True)
df1.index.freq = 'D'
df1 = df1[:120]
df2 = pd.read_csv('Data/TradeInventories.csv',index_col='Date',parse_dates=True)
df2.index.freq='MS'

ARMA

df1['Births'].plot(figsize=(12,5))
<AxesSubplot:xlabel='Date'>

png

from statsmodels.tsa.stattools import adfuller

def adf_test(series, title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(), autolag = 'AIC')
    
    labels = ['ADF test statistic', 'p-value', '# lags used', '# obervations']
    out = pd.Series(result[0:4], index = labels)
    
    for key, val in result[4].items():
        out[f'critical value ({key})'] = val
        
    print(out.to_string())
    
    if result[1] <= 0.05:
        print('Strong evidence against the null hypothesis')
        print('Reject the null hypothesis')
        print('Data has no unit root and is stationary')
    else:
        print('Weak evidence against the null hypothesis')
        print('Fail to reject the null hypothesis')
        print('Data has a unit root and is non-stationary')
adf_test(df1['Births'])
Augmented Dickey-Fuller Test: 
ADF test statistic     -9.855384e+00
p-value                 4.373545e-17
# lags used             0.000000e+00
# obervations           1.190000e+02
critical value (1%)    -3.486535e+00
critical value (5%)    -2.886151e+00
critical value (10%)   -2.579896e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
auto_arima(df1['Births'], seasonal=False, trace=True).summary()
Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=inf, Time=0.14 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=1230.607, Time=0.00 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=896.926, Time=0.01 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=1121.103, Time=0.02 sec
 ARIMA(2,0,0)(0,0,0)[0]             : AIC=inf, Time=0.03 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=inf, Time=0.10 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(2,0,1)(0,0,0)[0]             : AIC=inf, Time=0.14 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=824.647, Time=0.02 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=823.489, Time=0.00 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=824.747, Time=0.03 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=826.399, Time=0.15 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0] intercept
Total fit time: 0.662 seconds
SARIMAX Results
Dep. Variable: y No. Observations: 120
Model: SARIMAX Log Likelihood -409.745
Date: Mon, 23 Nov 2020 AIC 823.489
Time: 13:58:39 BIC 829.064
Sample: 0 HQIC 825.753
- 120
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 39.7833 0.687 57.896 0.000 38.437 41.130
sigma2 54.1197 8.319 6.506 0.000 37.815 70.424
Ljung-Box (Q): 44.41 Jarque-Bera (JB): 2.69
Prob(Q): 0.29 Prob(JB): 0.26
Heteroskedasticity (H): 0.80 Skew: 0.26
Prob(H) (two-sided): 0.48 Kurtosis: 2.48



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

train = df1.iloc[:90]
test = df1.iloc[90:]
model = ARMA(train['Births'], order=(2,2))
results = model.fit()
results.summary()
ARMA Model Results
Dep. Variable: Births No. Observations: 90
Model: ARMA(2, 2) Log Likelihood -307.905
Method: css-mle S.D. of innovations 7.405
Date: Mon, 23 Nov 2020 AIC 627.809
Time: 13:58:39 BIC 642.808
Sample: 01-01-1959 HQIC 633.858
- 03-31-1959
coef std err z P>|z| [0.025 0.975]
const 39.7549 0.912 43.607 0.000 37.968 41.542
ar.L1.Births -0.1850 1.087 -0.170 0.865 -2.315 1.945
ar.L2.Births 0.4352 0.644 0.675 0.500 -0.828 1.698
ma.L1.Births 0.2777 1.097 0.253 0.800 -1.872 2.427
ma.L2.Births -0.3999 0.679 -0.589 0.556 -1.730 0.930
Roots
Real Imaginary Modulus Frequency
AR.1 -1.3181 +0.0000j 1.3181 0.5000
AR.2 1.7433 +0.0000j 1.7433 0.0000
MA.1 -1.2718 +0.0000j 1.2718 0.5000
MA.2 1.9662 +0.0000j 1.9662 0.0000
start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start, end).rename('ARMA (2,2) Predictions')
test['Births'].plot(figsize = (12,8), legend=True)
predictions.plot(legend=True)
<AxesSubplot:xlabel='Date'>

png

test.mean()
Births    39.833333
dtype: float64
predictions.mean()
39.777432092745876
df2.plot(figsize=(12,8))
<AxesSubplot:xlabel='Date'>

png

from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df2['Inventories'], model='add')
result.plot();

png

auto_arima(df2['Inventories'],seasonal=False, trace=True).summary()
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=5373.961, Time=0.31 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=5348.037, Time=0.04 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=5399.843, Time=0.03 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=5350.241, Time=0.03 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=5409.217, Time=0.01 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=5378.835, Time=0.14 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0] intercept
Total fit time: 0.562 seconds
SARIMAX Results
Dep. Variable: y No. Observations: 264
Model: SARIMAX(0, 1, 0) Log Likelihood -2672.018
Date: Mon, 23 Nov 2020 AIC 5348.037
Time: 13:58:42 BIC 5355.181
Sample: 0 HQIC 5350.908
- 264
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 3258.3802 470.991 6.918 0.000 2335.255 4181.506
sigma2 3.91e+07 2.95e+06 13.250 0.000 3.33e+07 4.49e+07
Ljung-Box (Q): 455.75 Jarque-Bera (JB): 100.74
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.86 Skew: -1.15
Prob(H) (two-sided): 0.48 Kurtosis: 4.98



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

from statsmodels.tsa.statespace.tools import diff
df2['Diff_1'] = diff(df2['Inventories'], k_diff=1)
adf_test(df2['Diff_1'])
Augmented Dickey-Fuller Test: 
ADF test statistic       -3.412249
p-value                   0.010548
# lags used               4.000000
# obervations           258.000000
critical value (1%)      -3.455953
critical value (5%)      -2.872809
critical value (10%)     -2.572775
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary
df2
Inventories Diff_1
Date
1997-01-01 1301161 NaN
1997-02-01 1307080 5919.0
1997-03-01 1303978 -3102.0
1997-04-01 1319740 15762.0
1997-05-01 1327294 7554.0
... ... ...
2018-08-01 2127170 7552.0
2018-09-01 2134172 7002.0
2018-10-01 2144639 10467.0
2018-11-01 2143001 -1638.0
2018-12-01 2158115 15114.0

264 rows × 2 columns

plot_acf(df2['Inventories'], lags=40);

png

plot_pacf(df2['Inventories'], lags=40);

png

stepwise_fit = auto_arima(df2['Inventories'],start_p=0, start_q=0,
                          max_p=2, max_q=2, seasonal=False, trace=True)

stepwise_fit.summary()
Performing stepwise search to minimize aic
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=5348.037, Time=0.01 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=5399.843, Time=0.04 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=5350.241, Time=0.03 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=5409.217, Time=0.01 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=5378.835, Time=0.14 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0] intercept
Total fit time: 0.230 seconds
SARIMAX Results
Dep. Variable: y No. Observations: 264
Model: SARIMAX(0, 1, 0) Log Likelihood -2672.018
Date: Fri, 20 Nov 2020 AIC 5348.037
Time: 14:28:12 BIC 5355.181
Sample: 0 HQIC 5350.908
- 264
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 3258.3802 470.991 6.918 0.000 2335.255 4181.506
sigma2 3.91e+07 2.95e+06 13.250 0.000 3.33e+07 4.49e+07
Ljung-Box (Q): 455.75 Jarque-Bera (JB): 100.74
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.86 Skew: -1.15
Prob(H) (two-sided): 0.48 Kurtosis: 4.98



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

train = df2.iloc[:252]
test = df2.iloc[252:]
train
Inventories Diff_1
Date
1997-01-01 1301161 NaN
1997-02-01 1307080 5919.0
1997-03-01 1303978 -3102.0
1997-04-01 1319740 15762.0
1997-05-01 1327294 7554.0
... ... ...
2017-08-01 2097163 11650.0
2017-09-01 2097753 590.0
2017-10-01 2095167 -2586.0
2017-11-01 2099137 3970.0
2017-12-01 2103751 4614.0

252 rows × 2 columns

model = ARIMA(train['Inventories'], order=(1,1,1))
results = model.fit()
results.summary()
ARIMA Model Results
Dep. Variable: D.Inventories No. Observations: 251
Model: ARIMA(1, 1, 1) Log Likelihood -2486.395
Method: css-mle S.D. of innovations 4845.028
Date: Thu, 15 Oct 2020 AIC 4980.790
Time: 13:01:13 BIC 4994.892
Sample: 02-01-1997 HQIC 4986.465
- 12-01-2017
coef std err z P>|z| [0.025 0.975]
const 3197.5697 1344.869 2.378 0.017 561.674 5833.465
ar.L1.D.Inventories 0.9026 0.039 23.010 0.000 0.826 0.979
ma.L1.D.Inventories -0.5581 0.079 -7.048 0.000 -0.713 -0.403
Roots
Real Imaginary Modulus Frequency
AR.1 1.1080 +0.0000j 1.1080 0.0000
MA.1 1.7918 +0.0000j 1.7918 0.0000
start = len(train)
end = len(train) + len(test) - 1

predictions = results.predict(start, end, typ='levels').rename('ARIMA(1,1,1) Predictions')
test['Inventories'].plot(figsize = (12,8), legend=True)
predictions.plot(legend=True)
<AxesSubplot:xlabel='Date'>

png

from statsmodels.tools.eval_measures import rmse
error = rmse(test['Inventories'], predictions)
error
7789.5973836593785
test['Inventories'].mean()
2125075.6666666665
predictions.mean()
2125465.272049339

FORECAST INTO UNKNOWN FUTURE

model = ARIMA(df2['Inventories'], order=(1,1,1))
results = model.fit()
fcast = results.predict(start =len(df2),end=len(df2)+11, typ='levels').rename('ARIMA (1,1,1) FORECAST')
df2['Inventories'].plot(figsize=(12,8), legend=True)
fcast.plot(legend=True)
<AxesSubplot:xlabel='Date'>

png

SARIMA

import pandas as pd
import numpy as np
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
df = pd.read_csv('Data/co2_mm_mlo.csv')
df.head()
year month decimal_date average interpolated
0 1958 3 1958.208 315.71 315.71
1 1958 4 1958.292 317.45 317.45
2 1958 5 1958.375 317.50 317.50
3 1958 6 1958.458 NaN 317.10
4 1958 7 1958.542 315.86 315.86
# dict(year=df['year'], month=df['month'], day=1)

df['date'] = pd.to_datetime({'year':df['year'], 'month':df['month'],'day':1})
df.head()
year month decimal_date average interpolated date
0 1958 3 1958.208 315.71 315.71 1958-03-01
1 1958 4 1958.292 317.45 317.45 1958-04-01
2 1958 5 1958.375 317.50 317.50 1958-05-01
3 1958 6 1958.458 NaN 317.10 1958-06-01
4 1958 7 1958.542 315.86 315.86 1958-07-01
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   year          729 non-null    int64         
 1   month         729 non-null    int64         
 2   decimal_date  729 non-null    float64       
 3   average       722 non-null    float64       
 4   interpolated  729 non-null    float64       
 5   date          729 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(2)
memory usage: 34.3 KB
df = df.set_index('date')
df.head()
year month decimal_date average interpolated
date
1958-03-01 1958 3 1958.208 315.71 315.71
1958-04-01 1958 4 1958.292 317.45 317.45
1958-05-01 1958 5 1958.375 317.50 317.50
1958-06-01 1958 6 1958.458 NaN 317.10
1958-07-01 1958 7 1958.542 315.86 315.86
df.index.freq = 'MS'
df.head()
year month decimal_date average interpolated
date
1958-03-01 1958 3 1958.208 315.71 315.71
1958-04-01 1958 4 1958.292 317.45 317.45
1958-05-01 1958 5 1958.375 317.50 317.50
1958-06-01 1958 6 1958.458 NaN 317.10
1958-07-01 1958 7 1958.542 315.86 315.86
df['interpolated'].plot(figsize=(12,8))
<AxesSubplot:xlabel='date'>

png

result = seasonal_decompose(df['interpolated'], model='add')
result.plot();

png

result.seasonal.plot(figsize=(12,8))
<AxesSubplot:xlabel='date'>

png

auto_arima(df['interpolated'],seasonal=True,m=12).summary()
SARIMAX Results
Dep. Variable: y No. Observations: 729
Model: SARIMAX(2, 1, 1)x(1, 0, 1, 12) Log Likelihood -206.304
Date: Thu, 15 Oct 2020 AIC 424.609
Time: 13:24:28 BIC 452.151
Sample: 0 HQIC 435.236
- 729
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.3594 0.070 5.106 0.000 0.221 0.497
ar.L2 0.0895 0.030 2.979 0.003 0.031 0.148
ma.L1 -0.7122 0.061 -11.698 0.000 -0.832 -0.593
ar.S.L12 0.9996 0.000 2369.802 0.000 0.999 1.000
ma.S.L12 -0.8653 0.022 -39.952 0.000 -0.908 -0.823
sigma2 0.0957 0.005 20.453 0.000 0.087 0.105
Ljung-Box (Q): 43.95 Jarque-Bera (JB): 4.32
Prob(Q): 0.31 Prob(JB): 0.12
Heteroskedasticity (H): 1.12 Skew: -0.00
Prob(H) (two-sided): 0.36 Kurtosis: 3.38



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

len(df)
729
train = df.iloc[:717]
test = df.iloc[717:]
model = SARIMAX(train['interpolated'], order=(2,1,1),seasonal_order=(1,0,1,12))
results = model.fit()
results.summary()
SARIMAX Results
Dep. Variable: interpolated No. Observations: 717
Model: SARIMAX(2, 1, 1)x(1, 0, 1, 12) Log Likelihood -201.889
Date: Thu, 15 Oct 2020 AIC 415.778
Time: 13:28:51 BIC 443.220
Sample: 03-01-1958 HQIC 426.375
- 11-01-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.3458 0.083 4.154 0.000 0.183 0.509
ar.L2 0.0840 0.039 2.158 0.031 0.008 0.160
ma.L1 -0.7032 0.075 -9.432 0.000 -0.849 -0.557
ar.S.L12 0.9996 0.000 2879.070 0.000 0.999 1.000
ma.S.L12 -0.8653 0.023 -38.111 0.000 -0.910 -0.821
sigma2 0.0953 0.005 20.391 0.000 0.086 0.104
Ljung-Box (Q): 44.35 Jarque-Bera (JB): 4.68
Prob(Q): 0.29 Prob(JB): 0.10
Heteroskedasticity (H): 1.14 Skew: 0.01
Prob(H) (two-sided): 0.31 Kurtosis: 3.39



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start, end, typ='levels').rename('SARIMA Predictions')
test['interpolated'].plot(figsize=(12,8), legend=True)
predictions.plot(legend=True)
<AxesSubplot:xlabel='date'>

png

from statsmodels.tools.eval_measures import rmse
error = rmse(test['interpolated'], predictions)
error
0.3581857245313432
test['interpolated'].mean()
408.3333333333333

FORECAST INTO UNKWON FUTURE

model = SARIMAX(df['interpolated'], order=(2,1,1),seasonal_order=(1,0,1,12))
results = model.fit()
fcast = results.predict(len(df), len(df)+11, typ='levels').rename('SARIMA FORECAST')
df['interpolated'].plot(figsize=(12,8), legend=True)
fcast.plot(legend=True)
<AxesSubplot:xlabel='date'>

png

SARIMAX

import pandas as pd
import numpy as np
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
df=pd.read_csv('Data/RestaurantVisitors.csv',index_col='date',parse_dates=True)
df.index.freq = 'D'
df.head()
weekday holiday holiday_name rest1 rest2 rest3 rest4 total
date
2016-01-01 Friday 1 New Year's Day 65.0 25.0 67.0 139.0 296.0
2016-01-02 Saturday 0 na 24.0 39.0 43.0 85.0 191.0
2016-01-03 Sunday 0 na 24.0 31.0 66.0 81.0 202.0
2016-01-04 Monday 0 na 23.0 18.0 32.0 32.0 105.0
2016-01-05 Tuesday 0 na 2.0 15.0 38.0 43.0 98.0
df.tail()
weekday holiday holiday_name rest1 rest2 rest3 rest4 total
date
2017-05-27 Saturday 0 na NaN NaN NaN NaN NaN
2017-05-28 Sunday 0 na NaN NaN NaN NaN NaN
2017-05-29 Monday 1 Memorial Day NaN NaN NaN NaN NaN
2017-05-30 Tuesday 0 na NaN NaN NaN NaN NaN
2017-05-31 Wednesday 0 na NaN NaN NaN NaN NaN
df1 = df.dropna()
df1.tail()
weekday holiday holiday_name rest1 rest2 rest3 rest4 total
date
2017-04-18 Tuesday 0 na 30.0 30.0 13.0 18.0 91.0
2017-04-19 Wednesday 0 na 20.0 11.0 30.0 18.0 79.0
2017-04-20 Thursday 0 na 22.0 3.0 19.0 46.0 90.0
2017-04-21 Friday 0 na 38.0 53.0 36.0 38.0 165.0
2017-04-22 Saturday 0 na 97.0 20.0 50.0 59.0 226.0
df1.columns
Index(['weekday', 'holiday', 'holiday_name', 'rest1', 'rest2', 'rest3',
       'rest4', 'total'],
      dtype='object')
cols = ['rest1', 'rest2', 'rest3',
       'rest4', 'total']
for column in cols:
    df1[column] = df1[column].astype(int)
df1.head()
weekday holiday holiday_name rest1 rest2 rest3 rest4 total
date
2016-01-01 Friday 1 New Year's Day 65 25 67 139 296
2016-01-02 Saturday 0 na 24 39 43 85 191
2016-01-03 Sunday 0 na 24 31 66 81 202
2016-01-04 Monday 0 na 23 18 32 32 105
2016-01-05 Tuesday 0 na 2 15 38 43 98
df1['total'].plot(figsize=(16,5))
<AxesSubplot:xlabel='date'>

png

df1.query('holiday==1').index
DatetimeIndex(['2016-01-01', '2016-01-18', '2016-02-02', '2016-02-14',
               '2016-02-15', '2016-03-17', '2016-03-25', '2016-03-27',
               '2016-03-28', '2016-05-05', '2016-05-08', '2016-05-30',
               '2016-06-19', '2016-07-04', '2016-09-05', '2016-10-10',
               '2016-10-31', '2016-11-11', '2016-11-24', '2016-11-25',
               '2016-12-24', '2016-12-25', '2016-12-31', '2017-01-01',
               '2017-01-16', '2017-02-02', '2017-02-14', '2017-02-20',
               '2017-03-17', '2017-04-14', '2017-04-16', '2017-04-17'],
              dtype='datetime64[ns]', name='date', freq=None)
df1[df1['holiday']==1].index
DatetimeIndex(['2016-01-01', '2016-01-18', '2016-02-02', '2016-02-14',
               '2016-02-15', '2016-03-17', '2016-03-25', '2016-03-27',
               '2016-03-28', '2016-05-05', '2016-05-08', '2016-05-30',
               '2016-06-19', '2016-07-04', '2016-09-05', '2016-10-10',
               '2016-10-31', '2016-11-11', '2016-11-24', '2016-11-25',
               '2016-12-24', '2016-12-25', '2016-12-31', '2017-01-01',
               '2017-01-16', '2017-02-02', '2017-02-14', '2017-02-20',
               '2017-03-17', '2017-04-14', '2017-04-16', '2017-04-17'],
              dtype='datetime64[ns]', name='date', freq=None)
ax = df1['total'].plot(figsize=(16,5))

for day in df1.query('holiday==1').index:
    ax.axvline(x=day, color='k',alpha=.8);

png

from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df1['total'])
result.plot();

png

result.seasonal.plot(figsize=(18,5))
<AxesSubplot:xlabel='date'>

png

len(df1)
478
train = df1.iloc[:436]
test = df1.iloc[436:]
from pmdarima import auto_arima
auto_arima(df1['total'], seasonal=True, m=7, trace=True).summary()
Performing stepwise search to minimize aic


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(2,0,2)(1,0,1)[7] intercept   : AIC=inf, Time=1.72 sec
 ARIMA(0,0,0)(0,0,0)[7] intercept   : AIC=5269.484, Time=0.02 sec
 ARIMA(1,0,0)(1,0,0)[7] intercept   : AIC=4916.749, Time=0.46 sec
 ARIMA(0,0,1)(0,0,1)[7] intercept   : AIC=5049.644, Time=0.25 sec
 ARIMA(0,0,0)(0,0,0)[7]             : AIC=6126.084, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[7] intercept   : AIC=5200.790, Time=0.06 sec
 ARIMA(1,0,0)(2,0,0)[7] intercept   : AIC=4845.442, Time=1.05 sec
 ARIMA(1,0,0)(2,0,1)[7] intercept   : AIC=inf, Time=nan sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(1,0,1)[7] intercept   : AIC=4805.859, Time=0.63 sec
 ARIMA(1,0,0)(0,0,1)[7] intercept   : AIC=5058.642, Time=0.37 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(1,0,2)[7] intercept   : AIC=4948.132, Time=1.39 sec
 ARIMA(1,0,0)(0,0,2)[7] intercept   : AIC=4982.776, Time=0.64 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(2,0,2)[7] intercept   : AIC=inf, Time=1.78 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(0,0,0)(1,0,1)[7] intercept   : AIC=4780.356, Time=0.66 sec
 ARIMA(0,0,0)(0,0,1)[7] intercept   : AIC=5093.130, Time=0.12 sec
 ARIMA(0,0,0)(1,0,0)[7] intercept   : AIC=4926.360, Time=0.31 sec
 ARIMA(0,0,0)(2,0,1)[7] intercept   : AIC=inf, Time=nan sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(0,0,0)(1,0,2)[7] intercept   : AIC=4887.689, Time=1.07 sec
 ARIMA(0,0,0)(0,0,2)[7] intercept   : AIC=5010.582, Time=0.50 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(0,0,0)(2,0,0)[7] intercept   : AIC=4859.657, Time=1.26 sec
 ARIMA(0,0,0)(2,0,2)[7] intercept   : AIC=inf, Time=nan sec
 ARIMA(0,0,1)(1,0,1)[7] intercept   : AIC=inf, Time=0.80 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,1)(1,0,1)[7] intercept   : AIC=inf, Time=0.86 sec
 ARIMA(0,0,0)(1,0,1)[7]             : AIC=inf, Time=0.23 sec

Best model:  ARIMA(0,0,0)(1,0,1)[7] intercept
Total fit time: 17.200 seconds
SARIMAX Results
Dep. Variable: y No. Observations: 478
Model: SARIMAX(1, 0, [1], 7) Log Likelihood -2386.178
Date: Thu, 15 Oct 2020 AIC 4780.356
Time: 19:17:46 BIC 4797.035
Sample: 0 HQIC 4786.913
- 478
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 5.6944 1.886 3.020 0.003 1.998 9.391
ar.S.L7 0.9571 0.014 67.726 0.000 0.929 0.985
ma.S.L7 -0.7553 0.050 -15.001 0.000 -0.854 -0.657
sigma2 1244.1294 74.922 16.606 0.000 1097.285 1390.973
Ljung-Box (Q): 74.35 Jarque-Bera (JB): 61.36
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.85 Skew: 0.75
Prob(H) (two-sided): 0.30 Kurtosis: 3.92



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

from statsmodels.tsa.statespace.sarimax import SARIMAX
# ValueError: non-invertible starting MA parameters found

model = SARIMAX(train['total'], order=(1,0,0), seasonal_order=(2,0,0,7),
               enforce_invertibility=False)
model_c = SARIMAX(train['total'], order=(0,0,0), seasonal_order=(1,0,1,7),
               enforce_invertibility=False)
results = model.fit()
results_c = model_c.fit()
results.summary()
SARIMAX Results
Dep. Variable: total No. Observations: 436
Model: SARIMAX(1, 0, 0)x(2, 0, 0, 7) Log Likelihood -2224.701
Date: Thu, 15 Oct 2020 AIC 4457.403
Time: 19:17:47 BIC 4473.713
Sample: 01-01-2016 HQIC 4463.840
- 03-11-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.L1 0.2212 0.047 4.711 0.000 0.129 0.313
ar.S.L7 0.5063 0.036 14.187 0.000 0.436 0.576
ar.S.L14 0.4574 0.037 12.379 0.000 0.385 0.530
sigma2 1520.2899 82.277 18.478 0.000 1359.029 1681.550
Ljung-Box (Q): 83.96 Jarque-Bera (JB): 29.23
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.86 Skew: 0.34
Prob(H) (two-sided): 0.37 Kurtosis: 4.07



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

results_c.summary()
SARIMAX Results
Dep. Variable: total No. Observations: 436
Model: SARIMAX(1, 0, [1], 7) Log Likelihood -2165.369
Date: Thu, 15 Oct 2020 AIC 4336.738
Time: 19:17:47 BIC 4348.970
Sample: 01-01-2016 HQIC 4341.565
- 03-11-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
ar.S.L7 0.9999 9.57e-05 1.04e+04 0.000 1.000 1.000
ma.S.L7 -0.9384 0.024 -39.206 0.000 -0.985 -0.891
sigma2 1111.7950 58.740 18.927 0.000 996.667 1226.923
Ljung-Box (Q): 67.58 Jarque-Bera (JB): 83.57
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 0.96 Skew: 0.72
Prob(H) (two-sided): 0.81 Kurtosis: 4.59



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start, end).rename('SARIMA Model')
predictions_c = results_c.predict(start,end).rename('SARIMA Model_c')
ax = test['total'].plot(figsize=(15,8), legend=True)
predictions.plot(legend=True)
predictions_c.plot(legend=True)

for day in test.query('holiday==1').index:
    ax.axvline(x=day, color='k',alpha=.9);

png

from statsmodels.tools.eval_measures import rmse
print(rmse(test['total'], predictions))
print(rmse(test['total'], predictions_c))
print(test['total'].mean())
print(predictions.mean())
print(predictions_c.mean())
41.26315495065604
31.912579096986363
134.26190476190476
120.72598706500457
133.08493968389868
# df1[['holiday']]
auto_arima(df1['total'], exogenous=df1[['holiday']],seasonal=True,m=7, trace=True).summary()
Performing stepwise search to minimize aic


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(2,0,2)(1,0,1)[7] intercept   : AIC=inf, Time=1.47 sec
 ARIMA(0,0,0)(0,0,0)[7] intercept   : AIC=5235.582, Time=0.07 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(1,0,0)[7] intercept   : AIC=4804.111, Time=0.70 sec
 ARIMA(0,0,1)(0,0,1)[7] intercept   : AIC=4969.638, Time=0.51 sec
 ARIMA(0,0,0)(0,0,0)[7]             : AIC=6068.575, Time=0.07 sec
 ARIMA(1,0,0)(0,0,0)[7] intercept   : AIC=5171.193, Time=0.20 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(2,0,0)[7] intercept   : AIC=inf, Time=1.39 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(1,0,1)[7] intercept   : AIC=4728.053, Time=0.66 sec
 ARIMA(1,0,0)(0,0,1)[7] intercept   : AIC=4990.373, Time=0.48 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(2,0,1)[7] intercept   : AIC=inf, Time=1.41 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(1,0,2)[7] intercept   : AIC=4812.844, Time=1.24 sec
 ARIMA(1,0,0)(0,0,2)[7] intercept   : AIC=4887.554, Time=1.04 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,0)(2,0,2)[7] intercept   : AIC=inf, Time=1.65 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(0,0,0)(1,0,1)[7] intercept   : AIC=4829.768, Time=0.62 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(2,0,0)(1,0,1)[7] intercept   : AIC=5148.817, Time=0.97 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(1,0,1)(1,0,1)[7] intercept   : AIC=inf, Time=0.90 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(0,0,1)(1,0,1)[7] intercept   : AIC=4782.008, Time=0.79 sec


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)


 ARIMA(2,0,1)(1,0,1)[7] intercept   : AIC=inf, Time=1.02 sec
 ARIMA(1,0,0)(1,0,1)[7]             : AIC=inf, Time=0.57 sec

Best model:  ARIMA(1,0,0)(1,0,1)[7] intercept
Total fit time: 15.776 seconds


C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
SARIMAX Results
Dep. Variable: y No. Observations: 478
Model: SARIMAX(1, 0, 0)x(1, 0, [1], 7) Log Likelihood -2358.026
Date: Thu, 15 Oct 2020 AIC 4728.053
Time: 19:18:35 BIC 4753.070
Sample: 01-01-2016 HQIC 4737.888
- 04-22-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
intercept 19.0587 2.907 6.557 0.000 13.362 24.755
holiday 52.1120 3.981 13.089 0.000 44.309 59.915
ar.L1 0.1580 0.042 3.768 0.000 0.076 0.240
ar.S.L7 0.8269 0.024 34.166 0.000 0.779 0.874
ma.S.L7 -0.3437 0.056 -6.180 0.000 -0.453 -0.235
sigma2 971.3466 61.369 15.828 0.000 851.066 1091.628
Ljung-Box (Q): 109.04 Jarque-Bera (JB): 7.00
Prob(Q): 0.00 Prob(JB): 0.03
Heteroskedasticity (H): 0.91 Skew: 0.29
Prob(H) (two-sided): 0.55 Kurtosis: 3.13



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

Train out SARIMAX

model = SARIMAX(train['total'],exog=train[['holiday']],order=(1,0,1),
               seasonal_order=(1,0,1,7), enforce_invertibility=False)
model_c = SARIMAX(train['total'], exog=train[['holiday']], order=(1,0,0),
                 seasonal_order=(1,0,1,7), enforce_invertibility=False)
result = model.fit()
result_c = model_c.fit()
C:\Users\ilvna\.conda\envs\tf-2.3\lib\site-packages\statsmodels\base\model.py:568: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)
result.summary()
SARIMAX Results
Dep. Variable: total No. Observations: 436
Model: SARIMAX(1, 0, 1)x(1, 0, 1, 7) Log Likelihood -2175.956
Date: Thu, 15 Oct 2020 AIC 4363.911
Time: 19:19:25 BIC 4388.377
Sample: 01-01-2016 HQIC 4373.567
- 03-11-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
holiday 119.2207 3.207 37.177 0.000 112.935 125.506
ar.L1 0.9997 0.000 2763.806 0.000 0.999 1.000
ma.L1 -1.7406 0.089 -19.564 0.000 -1.915 -1.566
ar.S.L7 0.9997 0.001 1799.530 0.000 0.999 1.001
ma.S.L7 -1.0307 0.025 -41.893 0.000 -1.079 -0.982
sigma2 289.5149 20.075 14.422 0.000 250.169 328.861
Ljung-Box (Q): 77.56 Jarque-Bera (JB): 99.68
Prob(Q): 0.00 Prob(JB): 0.00
Heteroskedasticity (H): 1.05 Skew: -0.54
Prob(H) (two-sided): 0.77 Kurtosis: 5.08



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

result_c.summary()
SARIMAX Results
Dep. Variable: total No. Observations: 436
Model: SARIMAX(1, 0, 0)x(1, 0, [1], 7) Log Likelihood -2089.208
Date: Thu, 15 Oct 2020 AIC 4188.417
Time: 19:19:25 BIC 4208.805
Sample: 01-01-2016 HQIC 4196.463
- 03-11-2017
Covariance Type: opg
coef std err z P>|z| [0.025 0.975]
holiday 68.9355 3.773 18.271 0.000 61.541 76.330
ar.L1 0.2101 0.044 4.763 0.000 0.124 0.297
ar.S.L7 1.0000 5.78e-05 1.73e+04 0.000 1.000 1.000
ma.S.L7 -0.9581 0.022 -43.532 0.000 -1.001 -0.915
sigma2 779.3163 44.867 17.369 0.000 691.378 867.254
Ljung-Box (Q): 36.17 Jarque-Bera (JB): 20.47
Prob(Q): 0.64 Prob(JB): 0.00
Heteroskedasticity (H): 0.98 Skew: 0.22
Prob(H) (two-sided): 0.88 Kurtosis: 3.96



Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

start = len(train)
end = len(train) + len(test) - 1
predictions = result.predict(start, end, exog=test[['holiday']]).rename('SARIMAX with Exog')
predictions_c = result_c.predict(start, end, exog=test[['holiday']]).rename('SARIMAX with Exog c')
ax = test['total'].plot(figsize=(12,8), legend=True)
predictions.plot(legend=True)
predictions_c.plot(legend=True)


for day in test.query('holiday==1').index:
    ax.axvline(x=day, color='k',alpha=.9);

png

print(rmse(test['total'], predictions))
print(rmse(test['total'], predictions_c))
print(test['total'].mean())
print(predictions.mean())
print(predictions_c.mean())
49.92655128660368
22.92975136294346
134.26190476190476
176.6705414607891
135.4431981814327

FORECAST

model = SARIMAX(df1['total'],exog=df1[['holiday']], order=(1,0,1),
               seasonal_order=(1,0,1,7), enforce_invertibility=False)
model_c = SARIMAX(df1['total'], exog=df1[['holiday']], order=(1,0,0),
                 seasonal_order=(1,0,1,7), enforce_invertibility=False)
results = model.fit()
results_c = model_c.fit()
exog_forecast = df[478:][['holiday']]
fcast = results.predict(len(df1),len(df1)+38, exog = exog_forecast).rename('FINAL SARIMAX FORECAST')
fcast_c = results_c.predict(len(df1), len(df1)+38, exog = exog_forecast).rename('FINAL SARIMAX FORECAST_C')
ax = df1['total'].loc['2017-01-01':].plot(figsize=(15,8), legend=True)
fcast.plot(legend=True)
fcast_c.plot(legend=True)

for day in df.query('holiday==1').index:
    ax.axvline(x=day, color='k',alpha=.9);

png

from statsmodels.tools.eval_measures import rmse
rmse(fcast, fcast_c)
0.8548258765517648