Browse Source

time series models implemented

Nikolaos Alexopoulos 7 years ago
parent
commit
e532be28c3
3 changed files with 108 additions and 17 deletions
  1. 3 1
      apt-sec.py
  2. 88 0
      lstm_reg.py
  3. 17 16
      machine_learning.py

+ 3 - 1
apt-sec.py

@@ -20,6 +20,7 @@ from dateutil import parser
 import plotly.plotly as py
 import plotly.graph_objs as go
 import machine_learning as ml
+import lstm_reg as lstm
 
 logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
 ## Increase the recursion limit by much to allow bs to parse large files
@@ -748,7 +749,8 @@ if action == 'update':
 #    save_sha1lists()
     save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month)
     save_state(state)
-    ml.predict(src2month)
+#    ml.predict(src2month)
+    lstm.predict(src2month)
 elif action == 'status':
     load_DBs or exit(1)
     #handle errors more gracefully

+ 88 - 0
lstm_reg.py

@@ -0,0 +1,88 @@
+import numpy
+import matplotlib.pyplot as plt
+import pandas
+import math
+from keras.models import Sequential
+from keras.layers import Dense
+from keras.layers import LSTM
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+
+numpy.random.seed(7)
+
+# convert an array of values into a dataset matrix
+def create_dataset(dataset, look_back=1):
+	"""Build supervised-learning pairs (X, y) from a 1-D series.
+
+	Each X window is dataset[i : i+look_back-12] (width look_back-12)
+	and the matching target is dataset[i + look_back], i.e. there is a
+	12-step gap between the end of the window and the target value.
+	NOTE(review): this matches input_dim=look_back-12 in the caller's
+	model, but differs from the usual tutorial window of full width
+	look_back — confirm the 12-steps-ahead offset is intentional.
+	Returns a pair of numpy arrays (dataX, dataY).
+	"""
+	dataX, dataY = [], []
+	# stop look_back+1 short of the end so dataset[i + look_back] exists
+	for i in range(len(dataset)-look_back-1):
+		a = dataset[i:(i+look_back-12)]
+		dataX.append(a)
+		dataY.append(dataset[i + look_back])
+	return numpy.array(dataX), numpy.array(dataY)
+
+def predict(src2month):
+    """Fit an LSTM regressor on the smoothed monthly series of one package
+    and plot train/test predictions against the (scaled-back) series.
+
+    src2month -- mapping from source-package name to its per-month
+    vulnerability-count series (must contain 'linux' and 'openjdk';
+    presumably a numpy array per package -- TODO confirm against caller).
+    Returns None; shows a matplotlib window as a side effect.
+    """
+
+    pkg_num = len(src2month)  # NOTE(review): unused
+    # NOTE(review): training_num is computed but never used — leftover?
+    training_num = len(src2month['linux'])-12
+    # Hard-coded to a single package for now.
+    dataset = src2month['openjdk']
+    past = dataset[:len(dataset)-12]  # NOTE(review): unused after assignment
+    # Smooth with a 12-month rolling mean; the first 12 entries are NaN,
+    # so drop them.  NOTE(review): pandas.rolling_mean was removed in
+    # pandas 0.18+ — modern pandas needs Series(x).rolling(12).mean().
+    dataset = pandas.rolling_mean(dataset, window=12)
+    dataset = dataset[12:]
+
+    # normalize the dataset
+    # NOTE(review): fit_transform on a 1-D array is rejected by newer
+    # sklearn (expects 2-D); relies on an old sklearn behaviour.
+    scaler = MinMaxScaler(feature_range=(0, 1))
+    dataset = scaler.fit_transform(dataset)
+
+    # 80/20 chronological split (no shuffling — this is a time series).
+    train_size = int(len(dataset) * 0.80)
+    test_size = len(dataset) - train_size
+    train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
+    print(len(train), len(test))
+
+    # reshape into X=t and Y=t+1
+    # create_dataset yields windows of width look_back-12 = 24 samples.
+    look_back = 36
+    trainX, trainY = create_dataset(train, look_back)
+    testX, testY = create_dataset(test, look_back)
+
+    # debug output
+    print(dataset)
+    print(testX, testY)
+
+    # reshape input to be [samples, time steps, features]
+    # (one time step with look_back-12 features, matching input_dim below)
+    trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
+    testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
+
+    # create and fit the LSTM network
+    # NOTE(review): input_dim= and nb_epoch= are Keras 1.x spellings
+    # (Keras 2 uses input_shape= and epochs=).
+    model = Sequential()
+    model.add(LSTM(4, input_dim=look_back-12))
+    model.add(Dense(1))
+    model.compile(loss='mean_squared_error', optimizer='adam')
+    model.fit(trainX, trainY, nb_epoch=100, batch_size=1, verbose=2)
+
+
+    # make predictions
+    trainPredict = model.predict(trainX)
+    testPredict = model.predict(testX)
+    print(type(testPredict))
+    # invert predictions
+    # (undo the MinMax scaling so scores/plots are in original units;
+    # trainY/testY are wrapped in a list to make them 2-D for the scaler)
+    trainPredict = scaler.inverse_transform(trainPredict)
+    trainY = scaler.inverse_transform([trainY])
+    testPredict = scaler.inverse_transform(testPredict)
+    testY = scaler.inverse_transform([testY])
+    # calculate root mean squared error
+    trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
+    print('Train Score: %.2f RMSE' % (trainScore))
+    testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
+    print('Test Score: %.2f RMSE' % (testScore))
+
+
+    # shift train predictions for plotting
+    # (offset by look_back so predictions line up with their targets)
+    trainPredictPlot = numpy.empty_like(dataset)
+    trainPredictPlot[:] = numpy.nan
+    trainPredictPlot[look_back:len(trainPredict)+look_back] = trainPredict[:, 0]
+    # shift test predictions for plotting
+    # NOTE(review): the (look_back*2)+1 offset follows the standard
+    # keras-LSTM tutorial; verify it still aligns given the modified
+    # look_back-12 window in create_dataset.
+    testPredictPlot = numpy.empty_like(dataset)
+    testPredictPlot[:] = numpy.nan
+    testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1] = testPredict[:, 0]
+    # plot baseline and predictions
+    plt.plot(scaler.inverse_transform(dataset))
+    plt.plot(trainPredictPlot)
+    plt.plot(testPredictPlot)
+    plt.show()

+ 17 - 16
machine_learning.py

@@ -115,26 +115,24 @@ def predict(src2month):
 
 
     #Plot ACF: 
-    plt.subplot(121) 
-    plt.plot(lag_acf)
-    plt.axhline(y=0,linestyle='--',color='gray')
-    plt.axhline(y=-1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
-    plt.axhline(y=1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
-    plt.title('Autocorrelation Function')
+#    plt.subplot(121) 
+#    plt.plot(lag_acf)
+#    plt.axhline(y=0,linestyle='--',color='gray')
+#    plt.axhline(y=-1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
+#    plt.axhline(y=1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
+#    plt.title('Autocorrelation Function')
 
     #Plot PACF:
-    plt.subplot(122)
-    plt.plot(lag_pacf)
-    plt.axhline(y=0,linestyle='--',color='gray')
-    plt.axhline(y=-1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
-    plt.axhline(y=1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
-    plt.title('Partial Autocorrelation Function')
-    plt.tight_layout()
+#    plt.subplot(122)
+#    plt.plot(lag_pacf)
+#    plt.axhline(y=0,linestyle='--',color='gray')
+#    plt.axhline(y=-1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
+#    plt.axhline(y=1.96/np.sqrt(len(df['seasonal_first_difference'])),linestyle='--',color='gray')
+#    plt.title('Partial Autocorrelation Function')
+#    plt.tight_layout()
 
 
-    plt.show()
-
-    mod = sm.tsa.statespace.sarimax.SARIMAX(past, trend='n', order=(2,1,1), seasonal_order=(1,1,1,12))
+    mod = sm.tsa.statespace.sarimax.SARIMAX(past, trend='n', order=(0,1,0), seasonal_order=(2,1,1,12))
     results = mod.fit()
     print(results.summary())
 
@@ -143,10 +141,13 @@ def predict(src2month):
     df['forecast'] = results.predict(start = len(past) + 1, end = len(past) + 102, dynamic= True)
     pred = np.concatenate((np.zeros(180), df['forecast']))
     
+    fitted = results.predict(start = 24, end = len(past) + 102, dynamic= True)
+    
     fig = plt.figure(figsize=(12,8))
     fig = plt.plot(data_rol, color='blue')
     pred = np.concatenate((np.zeros(12), pred))
     fig = plt.plot(pred, color='green')
+    fig = plt.plot(fitted, color='red')
     print(len(data), len(past), len(pred))
     reality = sum(data[193:205])
     average = sum(data[181:193])