/ EXERCISE

Time Series of Price Anomaly Detection with LSTM

Time Series of Price Anomaly Detection with LSTM

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed
from tensorflow.keras.layers import Input
from tensorflow.keras.callbacks import EarlyStopping


import warnings
warnings.filterwarnings('ignore')

Dataset

# Load the JNJ price history, keep only the date and closing price, and parse
# the date strings into datetimes.
df = (
    pd.read_csv('JNJ.csv')[['Date', 'Close']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Quick look at the raw closing-price series (plotted against the frame index).
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['Close'])
plt.show()

png

Preprocessing

# Chronological split. Label-based .loc slicing is inclusive, so index labels up
# to and including 11940 go to training and everything after goes to test.
# .copy() gives each split its own data: the column assignments below would
# otherwise write into slices of `df`, which pandas flags with
# SettingWithCopyWarning (silenced by the warnings filter at the top).
train, test = df.loc[:11940].copy(), df.loc[11941:].copy()

# Fit the scaler on the training split only, then apply that same transform to
# both splits so the test data is scaled with training statistics.
scaler = StandardScaler().fit(train[['Close']])
train['Close'] = scaler.transform(train[['Close']])
test['Close'] = scaler.transform(test[['Close']])
# Number of consecutive rows in each model input window.
TIME_STEPS = 30


def create_sequences(X, y, time_steps=TIME_STEPS):
    """Cut a series into overlapping fixed-length windows.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X``; its paired
    target is the element of ``y`` at position ``i + time_steps`` (the value
    immediately after the window).

    Parameters
    ----------
    X : pandas object supporting ``.iloc`` slicing and ``.values``
        Source of the windows (e.g. a one-column DataFrame).
    y : pandas object supporting ``.iloc`` positional lookup
        Source of the per-window target values.
    time_steps : int
        Window length. Defaults to ``TIME_STEPS``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(windows, targets)``. One entry per window,
        ``len(X) - time_steps`` in total; both arrays are empty when ``X``
        has no more than ``time_steps`` rows.
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
# Window both splits the same way: each sample is TIME_STEPS consecutive scaled
# closes, paired with the close that follows the window.
(X_train, y_train), (X_test, y_test) = (
    create_sequences(split[['Close']], split['Close']) for split in (train, test)
)
print(train.shape)
(11941, 2)
# Shape of the windowed training data.
print(np.shape(X_train))
(11911, 30, 1)

Model

# Window length and feature count: together they define the model input shape.
_, n_timesteps, n_features = X_train.shape
print(n_timesteps)
print(n_features)
30
1
# LSTM encoder-decoder whose output has the same (timesteps, features) shape as
# its input window.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
inputs = Input(shape=(n_timesteps, n_features))

# Encoder: compress the whole window into one 128-dimensional vector.
encoded = LSTM(128)(inputs)
encoded = Dropout(rate=0.2)(encoded)

# Feed that vector to the decoder once per timestep.
repeated = RepeatVector(n_timesteps)(encoded)

# Decoder: unroll back into a sequence. A second LSTM(128) + Dropout(0.2) pair
# could be stacked here for more capacity; it is left out of this model.
decoded = LSTM(128, return_sequences=True)(repeated)
decoded = Dropout(rate=0.2)(decoded)

# Map every decoded timestep back to the original feature dimension.
outputs = TimeDistributed(Dense(n_features))(decoded)

model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='mse')
model.summary()
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 30, 1)]           0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               66560     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
repeat_vector (RepeatVector) (None, 30, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 128)           131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 128)           0         
_________________________________________________________________
time_distributed (TimeDistri (None, 30, 1)             129       
=================================================================
Total params: 198,273
Trainable params: 198,273
Non-trainable params: 0
_________________________________________________________________
# Stop once validation loss has failed to improve for 3 consecutive epochs, and
# roll the model back to the weights of its best epoch. Without
# restore_best_weights the model keeps the weights from the last (worse) epoch.
es = EarlyStopping(monitor='val_loss', patience=3, mode='min',
                   restore_best_weights=True)

# Train as an autoencoder: the target is the input window itself. The network
# emits a (timesteps, features) sequence and is scored afterwards by
# |model.predict(X) - X|, so it has to be optimised for reconstruction.
# y_train (one next-step value per window) does not match the output shape and
# is not the quantity being thresholded.
# shuffle=False keeps the windows in chronological order; validation_split
# holds out the trailing 20% of the training windows.
hist = model.fit(X_train, X_train, epochs=100, batch_size=32,
                validation_split=0.2,
                callbacks=[es], shuffle=False)
Epoch 1/100
298/298 [==============================] - 2s 8ms/step - loss: 0.0130 - val_loss: 0.1263
Epoch 2/100
298/298 [==============================] - 2s 6ms/step - loss: 0.0077 - val_loss: 0.0452
Epoch 3/100
298/298 [==============================] - 2s 6ms/step - loss: 0.0041 - val_loss: 0.0080
Epoch 4/100
298/298 [==============================] - 2s 6ms/step - loss: 0.0029 - val_loss: 0.0098
Epoch 5/100
298/298 [==============================] - 2s 6ms/step - loss: 0.0022 - val_loss: 0.0576
Epoch 6/100
298/298 [==============================] - 2s 6ms/step - loss: 0.0028 - val_loss: 0.0241
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Training loss'),
                                 ('val_loss', 'Validation loss')):
    ax.plot(hist.history[history_key], label=curve_label)
ax.legend()
plt.show()

png

Anomalies

# Reconstruction error on the training windows: absolute difference between the
# model output and its input, averaged over the time axis of each window.
X_train_pred = model.predict(X_train, verbose=0)
train_mae_loss = np.abs(X_train_pred - X_train).mean(axis=1)

# Distribution of the per-window training error.
fig, ax = plt.subplots()
ax.hist(train_mae_loss, bins=50)
ax.set_xlabel('Train MAE loss')
ax.set_ylabel('Number of Samples')
plt.show()

png

# Anomaly cut-off: the largest reconstruction error seen on any training window.
threshold = train_mae_loss.max()
print(f'Reconstruction error threshold: {threshold}')
Reconstruction error threshold: 0.55719959960999
# Same reconstruction-error measure, computed on the test windows.
X_test_pred = model.predict(X_test, verbose=0)
test_mae_loss = np.abs(X_test_pred - X_test).mean(axis=1)

# Distribution of the per-window test error.
fig, ax = plt.subplots()
ax.hist(test_mae_loss, bins=50)
ax.set_xlabel('Test MAE loss')
ax.set_ylabel('Number of samples')
plt.show()

png

# Per-window score table. The first TIME_STEPS test rows are skipped because
# create_sequences yields len(test) - TIME_STEPS windows, so the remaining rows
# line up one-to-one with the entries of test_mae_loss.
# .copy() makes this an independent frame, so adding columns never writes into
# a slice of `test`; the (scaled) 'Close' column is carried over by the copy.
test_score_df = test[TIME_STEPS:].copy()

# test_mae_loss comes out as (n_windows, 1) for this single-feature model;
# flatten it so the column receives a plain 1-D array.
test_score_df['loss'] = np.ravel(test_mae_loss)
test_score_df['threshold'] = threshold

# A window is anomalous when its reconstruction error exceeds the threshold.
test_score_df['anomaly'] = test_score_df['loss'] > test_score_df['threshold']
# Test reconstruction error over time against the constant anomaly threshold.
fig, ax = plt.subplots(figsize=(12, 8))
for column, curve_label in (('loss', 'Test loss'), ('threshold', 'Threshold')):
    ax.plot(test_score_df['Date'], test_score_df[column], label=curve_label)
ax.legend()
plt.show()

png

# Keep only the rows flagged as anomalous and report how many there are.
is_anomaly = test_score_df['anomaly']
anomalies = test_score_df[is_anomaly]
anomalies.shape
(1970, 5)
# Closing price in original units with the flagged anomalies overlaid.
# scaler was fitted on a 2-D (n_samples, 1) frame, and inverse_transform expects
# the same layout: select with [['Close']] (a DataFrame) rather than ['Close']
# (a 1-D Series, which current scikit-learn rejects), then flatten for plotting.
plt.figure(figsize=(12,8))
plt.plot(test_score_df['Date'],
         scaler.inverse_transform(test_score_df[['Close']]).ravel(),
         label = 'Close price')

# Skip the overlay when nothing was flagged: inverse_transform raises on an
# empty input.
if not anomalies.empty:
    plt.scatter(anomalies['Date'],
                scaler.inverse_transform(anomalies[['Close']]).ravel(),
                label = 'Anomaly', c='r')
plt.legend()
plt.show()

png