|
@@ -0,0 +1,773 @@
|
|
|
+import numpy as np
|
|
|
+import matplotlib.pyplot as plt
|
|
|
+import pandas
|
|
|
+import math
|
|
|
+import sys
|
|
|
+from keras import backend as K
|
|
|
+from keras.models import Sequential
|
|
|
+from keras.layers import LSTM
|
|
|
+from keras.layers import GRU
|
|
|
+from keras import optimizers as opt
|
|
|
+from keras.layers import Dense
|
|
|
+from keras.layers import Activation, Dropout
|
|
|
+from keras.models import load_model
|
|
|
+from sklearn.preprocessing import MinMaxScaler
|
|
|
+from sklearn.metrics import mean_squared_error
|
|
|
+import copy
|
|
|
+import random
|
|
|
+import arimaPred as arim
|
|
|
+import paper_plots as carlosplt
|
|
|
+
|
|
|
+
|
|
|
# Fix the NumPy RNG seed so experiments are reproducible across runs.
np.random.seed(7)
# Tolerance used for float equality checks (see test_model's sanity check).
EPSILON = 10 ** -12
|
|
|
+
|
|
|
def moving_average(a, n):
    """Smooth *a* with a trailing moving average of window length *n*.

    Positions 0..n-2 (where a full window is not yet available) are set
    to zero. Uses the cumulative-sum trick (code originally from Jaime).
    """
    cumulative = np.cumsum(a, dtype=float)
    cumulative[n:] = cumulative[n:] - cumulative[:-n]
    padded = np.concatenate((np.zeros(n - 1), cumulative[n - 1:]))
    return padded / n
|
|
|
+
|
|
|
+
|
|
|
# Convert a pair of aligned series into supervised-learning samples.
def create_dataset(original_dataset, dataset, num_steps, look_back=1, feat_num=1, extra_features=None):
    """Build (X, Y) training arrays from a time series.

    Each sample X[i] is a window of `look_back` consecutive values of
    `original_dataset` (optionally concatenated with `extra_features` when
    feat_num > 1); the target Y[i] is the mean of the `num_steps` values
    that follow the window.

    BUG FIX: the default for `extra_features` was the mutable literal `[]`
    (shared across calls); it is now None, resolved to a fresh list inside.
    Behavior for all existing callers is unchanged.

    Returns a tuple (np.array dataX, np.array dataY).
    """
    if extra_features is None:
        extra_features = []
    dataX, dataY = [], []
    print('Making training length :', len(original_dataset), ' ; ', len(dataset))
    for i in range(len(dataset) - look_back - num_steps + 1):
        window = []
        for j in range(i, i + look_back):
            if feat_num > 1:
                # extra_features is expected to be an ndarray here (tolist).
                window.append([original_dataset[j]] + extra_features.tolist())
            else:
                window.append([original_dataset[j]])
        dataX.append(window)
        # Target: average of the next num_steps values after the window.
        total = sum(original_dataset[i + look_back + j] for j in range(num_steps))
        dataY.append(total / num_steps)
    return np.array(dataX), np.array(dataY)
|
|
|
+
|
|
|
def test_model(pkg_name, original_dataset, dataset, scaler, num_steps, smoothing, batch_num, look_back, test_size, mixed, feat_num=1, extra_features=[]):
    ## mixed is a boolean. When True, the model trained on all the packages is used. When false, each package has its own model.
    # Load a previously trained LSTM from ./models, predict over the series
    # and return the inverse-scaled predictions, left-padded with zeros so
    # the returned list has the same length as `dataset`.

    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_features)
    # Free any previous Keras graph/session before loading a model.
    K.clear_session()

    if mixed:
        new_model = load_model('./models/all_packages'+str(num_steps) + '.h5')
    else:
        new_model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')

    new_model.reset_states()
    print(len(totalX))
    # Only the last batch_num samples are fed: the stateful model was built
    # with a fixed batch size, so the input batch must match it exactly.
    totalPredict = new_model.predict(totalX[len(totalX)-batch_num:], batch_size = batch_num)
    del new_model

    #scaler = scalers[pkg_name]
    # Map predictions back to the original (unscaled) value range.
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    # Left-pad with zeros so predictions align index-for-index with dataset.
    total_LSTM = [0] * (len(dataset)-len(totalPredict)) + totalPredict.tolist()
    #print(len(total_LSTM))
    #print(total_LSTM)

    #totalY = totalY[18:]
    totalY = totalY.reshape(-1,1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()

    temp = dataset.reshape(-1,1)
    temp = scaler.inverse_transform(temp)
    temp = temp.flatten()

    # Sanity check: the last target and the last data point should agree
    # after inverse transformation; a mismatch hints at an alignment bug.
    if (math.fabs(totalY[-1]-temp[-1])>EPSILON):
        print("Possible fault!!", totalY[-1], temp[-1])

    return total_LSTM[:]
|
|
|
+
|
|
|
def predict_stationary(original, dataset, num_steps):
    """Naive baseline: predict the value observed `num_steps` months ago.

    The first `num_steps` positions get 0 (no history available yet).
    Returns a dict mapping package name -> list of predictions, one per
    month of that package's series. `original` is unused (kept for a
    uniform predictor signature).
    """
    prediction = dict()
    for pkg, series in dataset.items():
        shifted = [0] * min(num_steps, len(series))
        shifted += [series[idx - num_steps] for idx in range(num_steps, len(series))]
        prediction[pkg] = shifted
    return prediction
|
|
|
+
|
|
|
def predict_average(original, dataset, num_steps):
    """Baseline: predict the running mean of the usable history.

    Leading zero months produce 0 predictions and are excluded from the
    history window. Afterwards, position j is predicted as the mean of the
    observations from the first non-zero month up to j - num_steps; if
    fewer than num_steps+1 observations are available, 0 is predicted.
    `original` is unused (uniform predictor signature).
    """
    prediction = dict()
    for pkg, series in dataset.items():
        preds = []
        start = 0        # index of the first non-zero observation
        end = 0          # exclusive end of the usable history
        leading = True   # still inside the leading run of zeros?
        for value in series:
            if value == 0 and leading:
                preds.append(0)
                start += 1
                end = start
            else:
                leading = False
                if len(series[start:end]) <= num_steps:
                    preds.append(0)
                else:
                    history = series[start:end - num_steps]
                    preds.append(sum(history) / len(history))
                end += 1
        prediction[pkg] = preds[:]
    return prediction
|
|
|
+
|
|
|
def predict_Waverage(original, dataset, Lamd, num_steps):
    """Baseline: exponentially weighted average of the usable history.

    Like predict_average, but each observation is weighted by
    exp(-age / Lamd), where age is counted back from the most recent
    usable month. Leading zero months yield 0 predictions; so do
    positions with fewer than num_steps+1 usable observations.
    `original` is unused (uniform predictor signature).
    """
    prediction = dict()
    for pkg, series in dataset.items():
        preds = []
        start = 0        # index of the first non-zero observation
        end = 0          # exclusive end of the usable history
        leading = True   # still inside the leading run of zeros?
        for value in series:
            if value == 0 and leading:
                preds.append(0)
                start += 1
                end = start
            else:
                leading = False
                if len(series[start:end]) <= num_steps:
                    preds.append(0)
                else:
                    history = series[start:end - num_steps]
                    weighted_sum = 0
                    weight_total = 0
                    for pos, obs in enumerate(history):
                        w = math.exp(-(len(history) - pos - 1) / Lamd)
                        weighted_sum += obs * w
                        weight_total += w
                    preds.append(weighted_sum / weight_total if weight_total != 0 else 0)
                end += 1
        prediction[pkg] = preds[:]
    return prediction
|
|
|
+
|
|
|
def train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler):
    # Train a dedicated stateful LSTM for one package, live-plotting the
    # train/validation error every 25 steps, then save it under ./models.

    model.set_weights(Wsave)
    ## Set the initial weights - remove if we want one model for all the packages
    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
    train, test = data[0:train_size], data[train_size:len(data)]
    if (len(original_data) != len(data)):
        # Misaligned inputs: bail out (returns 1 instead of raising).
        return(1)
    print('Training and test size in months: ', len(train), ' ; ', len(test))

    trainX, trainY = create_dataset(train_original, train, num_steps, look_back)
    testX, testY = create_dataset(test_original, test, num_steps, look_back)

    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))

    print(len(trainX), len(testX))
    print(len(trainY), len(testY))

    # NOTE(review): reshape is not in-place; these two lines have no effect
    # because the result is never assigned back.
    trainY.reshape(-1,1)
    testY.reshape(-1,1)

    # Interactive figure used to monitor the error curves while training.
    fig = plt.gcf()
    fig.clf()
    fig.show()
    fig.canvas.draw()

    training_steps = 400*25
    trainerror = []
    valerror=[]

    for j in range (training_steps):
        # One epoch per step, full-batch, no shuffling (stateful LSTM:
        # state is carried within an epoch and reset manually after).
        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
        model.reset_states()
        if(j%25==0):
            # Every 25 steps: record errors and refresh the live plot.
            calc_errors(model, batch_num, original_data, data, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, [])
            plt.plot(trainerror, color='blue')
            plt.plot(valerror, color='red')
            plt.title(pkg_name)
            fig.canvas.draw()
    try:
        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
        #model.save('./models/all_packages'+str(num_steps) + '.h5')
        del model
    except OSError:
        # Fallback filename, e.g. when pkg_name is not filesystem-safe.
        model.save('./models/unknown-' + str(num_steps) + '.h5')
        del model
|
|
|
+
|
|
|
def train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler, extra_features):
    # Run ONE training step of the shared (all-packages) model on this
    # package's data. Unlike train_LSTM, the weights are NOT reset here,
    # so successive calls accumulate training across packages.

    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
    train, test = data[0:train_size], data[train_size:len(data)]
    if (len(original_data) != len(data)):
        # Misaligned inputs: bail out (returns 1 instead of raising).
        return(1)
    print('Training and test size in months: ', len(train), ' ; ', len(test))

    trainX, trainY = create_dataset(train_original, train, num_steps, look_back, feat_num, extra_features)
    testX, testY = create_dataset(test_original, test, num_steps, look_back, feat_num, extra_features)

    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))

    # NOTE(review): no-ops — reshape returns a new array that is discarded.
    trainY.reshape(-1,1)
    testY.reshape(-1,1)

    trainerror = []
    valerror = []

    # Single full-batch epoch; state reset afterwards (stateful LSTM).
    model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
    model.reset_states()

    return 0
|
|
|
+
|
|
|
def calc_errors(model, batch_num, original_dataset, dataset, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, extra_feature):
    """Append the model's current train/validation MSE to the given lists.

    Builds supervised samples from the series, predicts the last batch_num
    of them, inverse-transforms predictions and targets back to the
    original scale, and appends one MSE for the training region (first 50
    points skipped) and one for the last test_size points. Returns 0.
    """
    allX, allY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_feature)

    # Keep only the last batch_num samples so the stateful model's fixed
    # batch size is respected.
    allX = allX[len(allX) - batch_num:]
    allY = allY[len(allY) - batch_num:]

    model.reset_states()
    predicted = model.predict(allX, batch_size=batch_num)

    predicted = scaler.inverse_transform(predicted).flatten()
    actual = scaler.inverse_transform(allY.reshape(-1, 1)).flatten()

    trainerror.append(mean_squared_error(predicted[50:-test_size], actual[50:-test_size]))
    valerror.append(mean_squared_error(predicted[-test_size:], actual[-test_size:]))

    return 0
|
|
|
+
|
|
|
+
|
|
|
def calc_errors_all(model, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features):
    """Aggregate per-package errors of the shared model into RMS values.

    Calls calc_errors for every package (which appends that package's
    train and validation MSE to temporary lists), then combines them as
    sqrt(sum of squares) / number-of-packages and appends one aggregate
    to trainerror and one to valerror. Returns 0.
    """
    train_temp = []
    test_temp = []

    for pkg in data_in:
        scaler = scalers[pkg]
        # BUG FIX: the original applied a meaningless `** 2` to the
        # (discarded) return value of calc_errors; the call is made purely
        # for its side effect of appending to train_temp / test_temp.
        calc_errors(model, batch_num, original_data_in[pkg], data_in[pkg],
                    num_steps, look_back, test_size, pkg, train_temp,
                    test_temp, scaler, feat_num, extra_features[pkg])

    train_sq = sum(err ** 2 for err in train_temp)
    test_sq = sum(err ** 2 for err in test_temp)

    trainerror.append(math.sqrt(train_sq) / len(data_in))
    valerror.append(math.sqrt(test_sq) / len(data_in))

    return 0
|
|
|
+
|
|
|
def predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train):
    # Train (optionally) and evaluate one dedicated LSTM per package.
    # Returns a dict: package -> prediction list aligned with `dataset`.
    # Model parameters
    # Do testing?
    do_test = True
    # Number of different models to train for
    models_num = 5
    # Look back steps
    look_back = 9
    # Number of lstm neurons
    num_neurons = look_back
    num_neurons2 = look_back
    # in case we want to add more features in the future
    feat_num = 1

    data_size = len(dataset[first])
    print(data_size)
    train_size = int(data_size - test_size)
    # Number of supervised samples the training window yields; also used
    # as the fixed batch size of the stateful LSTM.
    batch_num = train_size - num_steps - look_back + 1
    print("batch_num: ", batch_num)

    ## Create the NN with Keras
    model = Sequential()
    #model.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.4, stateful=True, return_sequences=True))
    model.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.5, stateful=True))
    #model.add(LSTM(num_neurons2, activation ='relu', dropout=0.4, stateful=True))
    model.add(Dense(1))
    Adam = opt.Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=Adam)
    # Snapshot of the initial weights so every package starts training from
    # the same point (restored inside train_LSTM).
    Wsave = model.get_weights()

    pred_LSTM = dict()
    data_in = dict()
    original_data_in = dict()
    scalers = dict()

    ## Let's start preparing our data
    for pkg in dataset:
        data = dataset[pkg]
        data = data.reshape(-1,1)
        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
        scalers[pkg].fit(data)
        ## We use scaler for each package seperately
        data = scalers[pkg].transform(data)
        data = data.flatten()
        original_data = original[pkg]
        original_data = original_data.reshape(-1,1)
        # The original series is scaled with the scaler fitted on the
        # smoothed series, so both live on the same scale.
        original_data = scalers[pkg].transform(original_data)
        original_data = original_data.flatten()
        data_in[pkg] = data
        original_data_in[pkg] = original_data

        ## Compute the total number of reported vulnerabilities in case we need it later
        total_sum = sum(original[pkg])

    ## Let's start with the training - if we want to train ofc...
    if do_train:
        ## Just a test to have one LSTM for all packages
        for i in range(1):
            for pkg in dataset:
                ## random selection for mixed training
                ## CHANGE for dedicated models
                #pkg = random.choice(list(dataset.keys()))
                data = data_in[pkg]
                original_data = original_data_in[pkg]
                train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg, scalers[pkg])

    if do_test:
        # Evaluate each package with its own saved model (mixed=False).
        for pkg in dataset:
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            pred_LSTM[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, False)

    return pred_LSTM
|
|
|
+
|
|
|
+
|
|
|
def predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train):
    """Train (optionally) and evaluate ONE shared LSTM over all packages.

    A one-hot package-identity vector is appended to each input sample so
    the shared model can distinguish packages. Returns a dict mapping
    package -> prediction list aligned with `dataset`.
    """
    # Model parameters
    do_test = True
    # Look back steps
    look_back = 9
    feat_num = 1+len(dataset)
    # Number of lstm neurons
    num_neurons = feat_num + 10
    num_neurons2 = look_back
    # in case we want to add more features in the future
    extra_features = dict()
    ## Training steps
    training_steps = 600*25

    ## Use one extra feature to signal package identity
    # BUG FIX: `i` was never incremented, so every package received the
    # identical vector [1, 0, ..., 0] and the identity feature was useless.
    i = 0
    for pkg in dataset:
        extra_features[pkg] = np.asarray([0]*i + [1] + [0]*(len(dataset)-i-1))
        extra_features[pkg].reshape(-1,1)
        i += 1

    data_size = len(dataset[first])
    print(data_size)
    train_size = int(data_size - test_size)
    # Fixed batch size of the stateful model (samples per training window).
    batch_num = train_size - num_steps - look_back + 1
    print("batch_num: ", batch_num)

    ## Create the NN with Keras
    model2 = Sequential()
    #model2.add(Dense(units=num_neurons, activation='relu', batch_input_shape=(batch_num, look_back,feat_num)))
    model2.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.4, stateful=True, return_sequences=True))
    #model2.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.1, stateful=True))
    model2.add(LSTM(num_neurons, activation ='relu', dropout=0.1, stateful=True))
    #model2.add(Dense(num_neurons))
    model2.add(Dense(1))
    Adam = opt.Adam(lr=0.001)
    model2.compile(loss='mean_squared_error', optimizer=Adam)
    Wsave = model2.get_weights()

    pred_LSTM_all = dict()
    data_in = dict()
    original_data_in = dict()
    scalers = dict()

    ## Let's start preparing our data
    for pkg in dataset:
        data = dataset[pkg]
        data = data.reshape(-1,1)
        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
        scalers[pkg].fit(data)
        ## We use scaler for each package seperately
        data = scalers[pkg].transform(data)
        data = data.flatten()
        original_data = original[pkg]
        original_data = original_data.reshape(-1,1)
        original_data = scalers[pkg].transform(original_data)
        original_data = original_data.flatten()
        data_in[pkg] = data
        original_data_in[pkg] = original_data

        ## Compute the total number of reported vulnerabilities in case we need it later
        total_sum = sum(original[pkg])

    ## Let's start with the training - if we want to train ofc...
    if do_train:
        ## Just a test to have one LSTM for all packages
        fig = plt.gcf()
        fig.clf()
        fig.show()
        fig.canvas.draw()
        trainerror = []
        valerror=[]
        for step in range(training_steps):
            ## random selection for mixed training
            pkg = random.choice(list(dataset.keys()))
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model2, Wsave, feat_num, look_back, pkg, scalers[pkg], extra_features[pkg])
            if(step%25==0):
                # Every 25 steps: record aggregate errors and refresh plot.
                calc_errors_all(model2, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features)
                plt.plot(trainerror, color='blue')
                plt.plot(valerror, color='red')
                plt.title('Mixed')
                fig.canvas.draw()
        try:
            model2.save('./models/all_packages'+str(num_steps) + '.h5')
            del model2
        except OSError:
            model2.save('./models/unknown-' + str(num_steps) + '.h5')

    if do_test:
        # Evaluate every package with the shared model (mixed=True).
        for pkg in dataset:
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            pred_LSTM_all[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, True, feat_num, extra_features[pkg])

    return pred_LSTM_all
|
|
|
+
|
|
|
+
|
|
|
def load_dataset_smoothed(src2month, pkglist, smoothing):
    """Return (original, smoothed) monthly series for each listed package.

    Each series is smoothed with a trailing moving average of width
    `smoothing`; the first `smoothing` months are cut from BOTH versions
    so the two stay aligned.
    """
    smoothed = dict()
    original = dict()
    for pkg in pkglist:
        raw = np.asarray(src2month[pkg])
        # Smooth the time-series with a moving average, then drop the
        # leading part where the average window is incomplete.
        averaged = moving_average(raw, n=smoothing)
        smoothed[pkg] = averaged[smoothing:]
        original[pkg] = raw[smoothing:]
    return (original, smoothed)
|
|
|
+
|
|
|
+
|
|
|
def print_all(data, pkglist):
    """Plot each package's series in its own subplot of a 2x5 grid."""
    plt.figure(1)
    for idx, pkg in enumerate(pkglist, start=1):
        plt.subplot(2, 5, idx)
        plt.plot(data[pkg], label=pkg)
        plt.legend()
    plt.show()
|
|
|
+
|
|
|
def print_pred(data, pred):
    """Plot the real series (blue) against the model prediction (red)."""
    for series, colour, tag in ((data, 'blue', 'reality'), (pred, 'red', 'model')):
        plt.plot(series, color=colour, label=tag)
    plt.legend()
    plt.show()
|
|
|
+
|
|
|
def print_all_pred(data, pred, pkglist, test_size):
    # Paper-quality figure: one subplot per package showing the training
    # part (grey), the test part (coral) and the model prediction (blue).
    carlosplt.pre_paper_plot(True)
    plt.figure(1)
    i = 1

    ## Build x axis
    # Quarterly tick labels: a year marker ('03, '05, ...) on the first
    # quarter of odd years, blanks elsewhere.
    quartersx = []

    for y in range(2,19):
        start = 1
        end = 5
        if y == 2:
            start = 2
        elif y == 19:
            # NOTE(review): unreachable — range(2,19) stops at y == 18.
            end = 2
        for j in range(start,end):
            if j==1 and y%2==1:
                quartersx.append('\''+str(y).zfill(2))
            else:
                quartersx.append(' ')

    for pkg in pkglist:
        data_local = data[pkg]
        data_local = data_local.flatten()
        ax = plt.subplot(1,5,i)
        # Split into training and test windows for coloring.
        data_train = data_local[:-test_size]
        data_test = data_local[-test_size:]
        # Shorter display names for some packages.
        if pkg == 'firefox-esr':
            pkg_label = 'firefox'
        elif pkg == 'chromium-browser':
            pkg_label = 'chromium'
        elif pkg == 'openjdk-8':
            pkg_label = 'openjdk'
        else:
            pkg_label = pkg
        x_axis_train = []
        x_axis_test = []
        for j in range(len(data_train)):
            x_axis_train.append(j)
        for j in range(len(data_test)):
            x_axis_test.append(j+len(data_train))
        x_axis_all = x_axis_train + x_axis_test

        # Only the last (5th) subplot carries the legend.
        if i==5:
            train=ax.plot(x_axis_train, data_train, color = 'grey', label='real-tr')
            test=ax.plot(x_axis_test, data_test, color = 'coral', label = 'real-te')
            model=ax.plot(x_axis_all,pred[pkg], color ='blue', label='model')
            ax.legend()
        else:
            ax.plot(x_axis_train, data_train, color = 'grey')
            ax.plot(x_axis_test, data_test, color = 'coral')
            ax.plot(x_axis_all,pred[pkg], color ='blue')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.set_title(pkg_label)
        plt.xticks(np.arange(1,len(pred[pkg]),3.0)+1,quartersx, rotation="vertical")
        i += 1
    carlosplt.post_paper_plot(True,True,True)
    plt.show()
|
|
|
+
|
|
|
def normalize_errors(errors_list, pkglist, dataset, test_size):
    """Scale each package's error by the value range of its test window.

    The divisor is max - min over the last `test_size` points of the
    package's series, clamped below at 0.1 to avoid division by (nearly)
    zero. Mutates `errors_list` in place; `pkglist` is unused.
    """
    for pkg in errors_list:
        window = dataset[pkg][-test_size:]
        span = np.amax(window) - np.amin(window)
        if span < 0.1:
            span = 0.1
        errors_list[pkg] = errors_list[pkg] / span
|
|
|
+
|
|
|
def compute_errors_perpackage(prediction, dataset, test_size):
    """Per-package RMSE over the test window (the last test_size points)."""
    return {
        pkg: math.sqrt(mean_squared_error(series[-test_size:], dataset[pkg][-test_size:]))
        for pkg, series in prediction.items()
    }
|
|
|
+
|
|
|
def compute_errors_train_perpackage(prediction, dataset, test_size):
    """Per-package RMSE over the training window (all but the last test_size points)."""
    return {
        pkg: math.sqrt(mean_squared_error(series[:-test_size], dataset[pkg][:-test_size]))
        for pkg, series in prediction.items()
    }
|
|
|
+
|
|
|
def find_best_Lamd(original, dataset, num_steps, test_size):
    """Grid-search the decay Lamd in [1, 99] per package.

    Every candidate Lamd is scored by the weighted-average predictor's
    RMSE on the training window; the best Lamd is tracked per package and
    the returned predictions use each package's winner.
    """
    errors = dict()

    # Lamd = 1 is the initial incumbent for every package.
    candidate = predict_Waverage(original, dataset, 1, num_steps)
    errors[1] = compute_errors_train_perpackage(candidate, dataset, test_size)
    best_errors = errors[1]
    best_lamdas = {pkg: 1 for pkg in dataset}

    for lamd in range(1, 100):
        candidate = predict_Waverage(original, dataset, lamd, num_steps)
        ## To compute best lamda
        errors[lamd] = compute_errors_train_perpackage(candidate, dataset, test_size)
        for pkg in candidate:
            print(errors[lamd][pkg])
            if errors[lamd][pkg] < best_errors[pkg]:
                best_errors[pkg] = errors[lamd][pkg]
                best_lamdas[pkg] = lamd

    pred_Waverage = dict()
    for pkg in dataset:
        pred_Waverage[pkg] = predict_Waverage(original, dataset, best_lamdas[pkg], num_steps)[pkg]

    print(best_lamdas)

    return pred_Waverage
|
|
|
+
|
|
|
def do_training_errors(predictions_list, errors, dataset, test_size):
    """Fill errors[method] with per-package training-window RMSE.

    Methods whose predictions are missing or malformed are reported and
    skipped instead of aborting the whole run.
    """
    ## First for each package
    for method in predictions_list:
        try:
            errors[method] = compute_errors_train_perpackage(
                predictions_list[method], dataset, test_size)
        # BUG FIX: the original bare `except:` swallowed every exception,
        # including KeyboardInterrupt/SystemExit; catch only the plausible
        # data-shape failures.
        except (KeyError, ValueError, TypeError):
            print('Predictions missing')
|
|
|
+
|
|
|
+
|
|
|
def do_testing_errors(predictions_list, errors, dataset, test_size):
    """Fill errors[method] with per-package test-window RMSE.

    Methods whose predictions are missing or malformed are reported and
    skipped instead of aborting the whole run.
    """
    for method in predictions_list:
        try:
            errors[method] = compute_errors_perpackage(
                predictions_list[method], dataset, test_size)
        # BUG FIX: the original bare `except:` swallowed every exception,
        # including KeyboardInterrupt/SystemExit; catch only the plausible
        # data-shape failures.
        except (KeyError, ValueError, TypeError):
            print('Predictions missing')
|
|
|
+
|
|
|
def calculate_rmse(errors):
    """Combine per-package errors as sqrt(sum of squares) / count.

    NOTE: this is sqrt(sum(e^2)) / N, not the textbook sqrt(sum(e^2) / N);
    kept as-is for consistency with existing results.
    """
    squared_sum = sum(err * err for err in errors.values())
    return math.sqrt(squared_sum) / len(errors)
|
|
|
+
|
|
|
def calculate_mean(errors):
    """Arithmetic mean of the per-package error values."""
    return sum(errors.values()) / len(errors)
|
|
|
+
|
|
|
def print_summary(training_errors, testing_errors):
    """Print a per-method report of train/test rmse and mean aggregates.

    Expects each errors[method] dict to already contain 'rmse' and 'mean'
    keys (added by the caller). Returns 0.
    """
    print('#' * 80)
    print('***** REPORT *****')
    for method in training_errors:
        print(method)
        # Same output order as before: train rmse, test rmse, train mean, test mean.
        for label, table, stat in (
            ('Training', training_errors, 'rmse'),
            ('Testing', testing_errors, 'rmse'),
            ('Training', training_errors, 'mean'),
            ('Testing', testing_errors, 'mean'),
        ):
            print(label + ' Errors ' + stat + ': ', '%.3f' % table[method][stat])
        print('#' * 80)

    return 0
|
|
|
+
|
|
|
def predict(src2month, k):
    # Top-level driver: selects packages, builds smoothed datasets, runs the
    # baseline predictors (stationary / average / weighted average) and
    # optionally the LSTM models, plots predictions and prints error reports.
    # NOTE(review): parameter `k` is unused here — confirm with callers.

    pkglist=[]

    # Keep only packages with a meaningful vulnerability history.
    for pkg in src2month:
        if (sum(src2month[pkg])>50):
            pkglist.append(pkg)

    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'wireshark', 'openjdk-8', 'mysql-transitional', 'php7.0', 'imagemagick', 'tcpdump']
    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'openjdk-8']
    # Hard-coded override of the automatic selection above.
    pkglist = ['icedove']
    first = pkglist[0]
    #pkglist = [first]
    # Number of months in the future
    num_steps = 9
    smoothing = num_steps
    # Test dataset size in months
    test_size = 18
    real_test_size = 18
    do_train = False

    dataset = dict()
    # Cut out end of 2018
    # NOTE: this mutates the caller's src2month entries in place.
    for pkg in pkglist:
        ## This is for training
        if do_train:
            src2month[pkg] = src2month[pkg][:-9-real_test_size]
        ## This is for experiments
        else:
            src2month[pkg] = src2month[pkg][test_size:-9]

    (original,dataset) = load_dataset_smoothed(src2month, pkglist, smoothing)
    # Each point of dataset is the mean of the same point and the previous smoothing-1 of the original
    num_packages = len(pkglist)

    # Print all smoothed time-series
    #print_all(dataset, pkglist)

    ## Make simple predictions (stationary, average, waverage)
    predictions_list = dict()
    predictions_list['stationary'] = predict_stationary(original, dataset, num_steps)
    predictions_list['average'] = predict_average(original, dataset, num_steps)
    predictions_list['Waverage'] = find_best_Lamd(original, dataset, num_steps, test_size)
    #predictions_list['LSTM'] = predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train)
    #predictions_list['LSTM_all'] = predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train)

    #print_all_pred(dataset, predictions_list['LSTM'], pkglist, test_size)
    #pkglist_new=['linux','firefox-esr', 'chromium-browser', 'icedove']
    pkglist_new = pkglist
    print_all_pred(dataset, predictions_list['Waverage'], pkglist_new, test_size)

    training_errors = dict()
    ## Dictionary of training errors e.g. training_errors['LSTM']['linux'] = XXX
    testing_errors = dict()
    ## Same for testing errors

    new_predictions_list = dict()

    ## For which packages to compute the error?
    # Only packages with a larger history (>200 total) enter the report.
    for method in predictions_list:
        new_predictions_list[method] = dict()
        for pkg in predictions_list[method]:
            if (sum(src2month[pkg])>200):
                new_predictions_list[method][pkg] = predictions_list[method][pkg]

    print(new_predictions_list)

    do_training_errors(new_predictions_list, training_errors, dataset, test_size)
    do_testing_errors(new_predictions_list, testing_errors, dataset, test_size)

    ## Now among the packages again rmse. But first we normalize. Choose whether we want this or not

    for method in training_errors:
        normalize_errors(training_errors[method], pkglist, dataset, test_size)
        normalize_errors(testing_errors[method], pkglist, dataset, test_size)

    # Per-package textual report of normalized training/testing errors.
    for pkg in training_errors['average']:
        print('#'*80)
        print(pkg)
        print('Training errors:')
        temp_list = []
        for method in training_errors:
            string = method + ': ' + str(training_errors[method][pkg]) + ' , '
            temp_list.append(string)

        print(temp_list)

        temp_list = []
        for method in training_errors:
            string = method + ': ' + str(testing_errors[method][pkg])
            temp_list.append(string)

        print('Testing errors:')
        print(temp_list)

    ## Now it is time for the rmse among the packages
    for method in testing_errors:
        testing_errors[method]['rmse'] = calculate_rmse(testing_errors[method])
        testing_errors[method]['mean'] = calculate_mean(testing_errors[method])
        training_errors[method]['rmse'] = calculate_rmse(training_errors[method])
        training_errors[method]['mean'] = calculate_mean(training_errors[method])

    print_summary(training_errors, testing_errors)

    return
|