12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773 |
- import numpy as np
- import matplotlib.pyplot as plt
- import pandas
- import math
- import sys
- from keras import backend as K
- from keras.models import Sequential
- from keras.layers import LSTM
- from keras.layers import GRU
- from keras import optimizers as opt
- from keras.layers import Dense
- from keras.layers import Activation, Dropout
- from keras.models import load_model
- from sklearn.preprocessing import MinMaxScaler
- from sklearn.metrics import mean_squared_error
- import copy
- import random
- import arimaPred as arim
- import paper_plots as carlosplt
np.random.seed(7)  # fix the NumPy RNG seed so experiments are reproducible
# Tolerance for the scaler round-trip sanity check in test_model().
EPSILON = 10 ** -12
def moving_average(a, n):
    """Trailing n-point moving average of `a`.

    The first n-1 output entries are zeroed (not enough history yet);
    entry i (i >= n-1) is the mean of a[i-n+1 : i+1].
    (original implementation credited to Jaime)
    """
    csum = np.cumsum(a, dtype=float)
    # Turn the cumulative sum into rolling window sums.
    csum[n:] = csum[n:] - csum[:-n]
    padded = np.concatenate((np.zeros(n - 1), csum[n - 1:]))
    return padded / n
# convert an array of values into a dataset matrix
# NOTE: the returned arrays are SHORTER than the input by look_back+num_steps-1
def create_dataset(original_dataset, dataset, num_steps, look_back=1, feat_num=1, extra_features=None):
    """Build supervised-learning samples from a time series.

    Each sample is `look_back` consecutive values of `original_dataset`
    (each timestep optionally extended with the `extra_features` vector when
    feat_num > 1); each target is the mean of the `num_steps` values that
    follow the window.

    Args:
        original_dataset: series the values are read from.
        dataset: companion series; only its length sets the sample count.
        num_steps: forecast horizon averaged into each target.
        look_back: window length per sample.
        feat_num: feature count; > 1 enables extra_features.
        extra_features: array-like with .tolist(), appended per timestep.

    Returns:
        (dataX, dataY) as numpy arrays.
    """
    # Fix: the original used the mutable default `extra_features=[]`.
    if extra_features is None:
        extra_features = []
    dataX, dataY = [], []
    print('Making training length :', len(original_dataset), ' ; ', len(dataset))
    for i in range(len(dataset) - look_back - num_steps + 1):
        window = []
        for j in range(i, i + look_back):
            if feat_num > 1:
                window.append([original_dataset[j]] + extra_features.tolist())
            else:
                window.append([original_dataset[j]])
        dataX.append(window)
        # Target: mean of the next num_steps values after the window.
        total = 0
        for j in range(num_steps):
            total += original_dataset[i + look_back + j]
        dataY.append(total / num_steps)
    return np.array(dataX), np.array(dataY)
def test_model(pkg_name, original_dataset, dataset, scaler, num_steps, smoothing, batch_num, look_back, test_size, mixed, feat_num=1, extra_features=[]):
    """Load a previously saved LSTM from ./models and predict one package's series.

    Returns a list the same length as `dataset`: leading zeros followed by
    the inverse-scaled predictions for the final `batch_num` windows.
    """
    ## mixed is a boolean. When True, the model trained on all the packages is used. When false, each package has its own model.

    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_features)
    K.clear_session()
    if mixed:
        new_model = load_model('./models/all_packages'+str(num_steps) + '.h5')
    else:
        new_model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')

    new_model.reset_states()
    print(len(totalX))
    # Predict only the last batch_num windows — the stateful model's batch size must match.
    totalPredict = new_model.predict(totalX[len(totalX)-batch_num:], batch_size = batch_num)
    del new_model

    #scaler = scalers[pkg_name]
    # Undo the per-package MinMax scaling on the predictions.
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    # Pad the front with zeros so predictions align with the input series.
    total_LSTM = [0] * (len(dataset)-len(totalPredict)) + totalPredict.tolist()
    #print(len(total_LSTM))
    #print(total_LSTM)
    #totalY = totalY[18:]
    totalY = totalY.reshape(-1,1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    temp = dataset.reshape(-1,1)
    temp = scaler.inverse_transform(temp)
    temp = temp.flatten()

    # Sanity check: last target vs last raw value after inverse scaling —
    # presumably these should agree for the smoothed series; warn if not.
    if (math.fabs(totalY[-1]-temp[-1])>EPSILON):
        print("Possible fault!!", totalY[-1], temp[-1])
    return total_LSTM[:]
def predict_stationary(original, dataset, num_steps):
    """Naive baseline: predict the value observed num_steps months earlier.

    The first num_steps entries of each package's prediction are 0
    (no history available). `original` is accepted for interface parity
    with the other predictors but not used.
    """
    prediction = dict()
    for name, series in dataset.items():
        shifted = [0 if idx < num_steps else series[idx - num_steps]
                   for idx in range(len(series))]
        prediction[name] = shifted
    return prediction
def predict_average(original, dataset, num_steps):
    """Baseline: predict the mean of all history up to num_steps months ago.

    Leading zero months are skipped (predicted as 0) until the first
    non-zero value appears; after that, each month is predicted as the
    average of the series from that first non-zero month up to (but
    excluding) the last num_steps months. Months with too little history
    are predicted as 0. `original` is unused.
    """
    prediction = dict()
    for name in dataset:
        series = dataset[name]
        preds = []
        start = 0      # index of first non-zero month
        cursor = 0     # exclusive end of the usable history
        leading = True
        for value in series:
            if value == 0 and leading:
                preds.append(0)
                start += 1
                cursor = start
            else:
                leading = False
                if len(series[start:cursor]) <= num_steps:
                    # Not enough history to look num_steps back.
                    preds.append(0)
                else:
                    history = series[start:cursor - num_steps]
                    preds.append(sum(history) / len(history))
                cursor += 1
        prediction[name] = preds[:]
    return prediction
def predict_Waverage(original, dataset, Lamd, num_steps):
    """Baseline: exponentially weighted average of the usable history.

    Same windowing as predict_average, but each history point is weighted
    by exp(-(age)/Lamd), so recent months count more. `original` is unused.
    """
    prediction = dict()
    for name in dataset:
        series = dataset[name]
        preds = []
        start = 0      # index of first non-zero month
        cursor = 0     # exclusive end of the usable history
        leading = True
        for value in series:
            if value == 0 and leading:
                preds.append(0)
                start += 1
                cursor = start
            else:
                leading = False
                if len(series[start:cursor]) <= num_steps:
                    preds.append(0)
                else:
                    window = series[start:cursor - num_steps]
                    size = len(window)
                    numerator = 0
                    denominator = 0
                    for pos, sample in enumerate(window):
                        # Most recent point (pos == size-1) gets weight exp(0).
                        weight = math.exp(-(size - pos - 1) / Lamd)
                        numerator += sample * weight
                        denominator += weight
                    preds.append(numerator / denominator if denominator != 0 else 0)
                cursor += 1
        prediction[name] = preds[:]
    return prediction
def train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler):
    """Train a dedicated stateful LSTM for one package and save it to ./models.

    Weights are reset from Wsave first so each package starts from the same
    initialization. Trains for 400*25 single-epoch fits (state reset after
    each), plotting train/validation error every 25 steps. Returns 1 early
    if the two input series differ in length; otherwise returns None.
    """
    model.set_weights(Wsave)
    ## Set the initial weights - remove if we want one model for all the packages
    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
    train, test = data[0:train_size], data[train_size:len(data)]
    if (len(original_data) != len(data)):
        return(1)
    print('Training and test size in months: ', len(train), ' ; ', len(test))
    trainX, trainY = create_dataset(train_original, train, num_steps, look_back)
    testX, testY = create_dataset(test_original, test, num_steps, look_back)

    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
    print(len(trainX), len(testX))
    print(len(trainY), len(testY))

    # NOTE(review): reshape returns a new array; these two results are discarded.
    trainY.reshape(-1,1)
    testY.reshape(-1,1)

    # Live plot of the error curves during training.
    fig = plt.gcf()
    fig.clf()
    fig.show()
    fig.canvas.draw()
    training_steps = 400*25
    trainerror = []
    valerror=[]

    for j in range (training_steps):
        # One epoch per fit so the stateful LSTM can be reset manually between epochs.
        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
        model.reset_states()
        if(j%25==0):
            calc_errors(model, batch_num, original_data, data, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, [])
            plt.plot(trainerror, color='blue')
            plt.plot(valerror, color='red')
            plt.title(pkg_name)
            fig.canvas.draw()
    try:
        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
        #model.save('./models/all_packages'+str(num_steps) + '.h5')
        del model
    except OSError:
        # Package names with odd characters can break the path; fall back.
        model.save('./models/unknown-' + str(num_steps) + '.h5')
        del model
def train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler, extra_features):
    """One training step of the shared (mixed) LSTM on a single package.

    Unlike train_LSTM, weights are NOT reset from Wsave, so successive calls
    keep training the same model. Returns 1 if the two series differ in
    length, otherwise 0 after one fit/state-reset cycle.
    """
    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
    train, test = data[0:train_size], data[train_size:len(data)]
    if (len(original_data) != len(data)):
        return(1)
    print('Training and test size in months: ', len(train), ' ; ', len(test))
    trainX, trainY = create_dataset(train_original, train, num_steps, look_back, feat_num, extra_features)
    # NOTE(review): testX/testY are built but never used in this function.
    testX, testY = create_dataset(test_original, test, num_steps, look_back, feat_num, extra_features)

    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
    # NOTE(review): reshape returns a new array; these two results are discarded.
    trainY.reshape(-1,1)
    testY.reshape(-1,1)

    trainerror = []
    valerror = []

    model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
    model.reset_states()
    return 0
def calc_errors(model, batch_num, original_dataset, dataset, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, extra_feature):
    """Append the model's current train/validation MSE to the given lists.

    Predicts the last batch_num windows of the series, inverse-transforms
    predictions and targets, then splits into a training part (index 50 up
    to -test_size) and a validation part (last test_size). Appends one
    value to each of trainerror/valerror in place. Returns 0.
    """
    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_feature)
    # Keep only the last batch_num samples — must match the stateful batch size.
    totalX = totalX[len(totalX)-batch_num:]
    totalY = totalY[len(totalY)-batch_num:]
    model.reset_states()
    totalPredict = model.predict(totalX, batch_size = batch_num)
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()

    totalY = totalY.reshape(-1,1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()

    # First 50 points of the training window are skipped — presumably a
    # warm-up cutoff; TODO confirm.
    trainerror.append(mean_squared_error(totalPredict[50:-test_size], totalY[50:-test_size]))
    valerror.append(mean_squared_error(totalPredict[-test_size:], totalY[-test_size:]))
    return 0
def calc_errors_all(model, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features):
    """Aggregate per-package errors of the mixed model into one value each.

    Runs calc_errors for every package (which appends one train and one
    validation error per call), then appends sqrt(sum of squares)/N to
    trainerror and valerror in place. Returns 0.
    """
    train_sq_sum = 0
    val_sq_sum = 0
    train_temp = []
    test_temp = []
    for pkg in data_in:
        scaler = scalers[pkg]
        # Fix: the original applied a stray `** 2` to calc_errors' return
        # value (always 0) and discarded it — removed as dead, misleading code.
        calc_errors(model, batch_num, original_data_in[pkg], data_in[pkg], num_steps, look_back, test_size, pkg, train_temp, test_temp, scaler, feat_num, extra_features[pkg])
    for err in train_temp:
        train_sq_sum += err ** 2
    for err in test_temp:
        val_sq_sum += err ** 2
    trainerror.append(math.sqrt(train_sq_sum)/len(data_in))
    valerror.append(math.sqrt(val_sq_sum)/len(data_in))
    return 0
def predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train):
    """Build, optionally train, and evaluate one dedicated LSTM per package.

    Returns a dict pkg -> prediction list (from test_model). `first` names
    the package whose series length sizes the train/test split; all series
    are assumed to share that length.
    """
    # Model parameters
    # Do testing?
    do_test = True
    # Number of different models to train for
    models_num = 5
    # Look back steps
    look_back = 9
    # Number of lstm neurons
    num_neurons = look_back
    num_neurons2 = look_back
    # in case we want to add more features in the future
    feat_num = 1

    data_size = len(dataset[first])
    print(data_size)
    train_size = int(data_size - test_size)
    # Stateful Keras models need a fixed batch size = number of training windows.
    batch_num = train_size - num_steps - look_back + 1
    print("batch_num: ", batch_num)
    ## Create the NN with Keras
    model = Sequential()
    #model.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.4, stateful=True, return_sequences=True))
    model.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.5, stateful=True))
    #model.add(LSTM(num_neurons2, activation ='relu', dropout=0.4, stateful=True))
    model.add(Dense(1))
    Adam = opt.Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=Adam)
    # Snapshot of the initial weights so every package trains from the same start.
    Wsave = model.get_weights()
    pred_LSTM = dict()
    data_in = dict()
    original_data_in = dict()
    scalers = dict()

    ## Let's start preparing our data
    for pkg in dataset:
        data = dataset[pkg]
        data = data.reshape(-1,1)
        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
        scalers[pkg].fit(data)
        ## We use scaler for each package seperately
        data = scalers[pkg].transform(data)
        data = data.flatten()
        original_data = original[pkg]
        original_data = original_data.reshape(-1,1)
        # NOTE(review): the raw series is scaled with the scaler fitted on the
        # smoothed series, so raw values may fall slightly outside [0, 1].
        original_data = scalers[pkg].transform(original_data)
        original_data = original_data.flatten()
        data_in[pkg] = data
        original_data_in[pkg] = original_data
        ## Compute the total number of reported vulnerabilities in case we need it later
        total_sum = sum(original[pkg])
    ## Let's start with the training - if we want to train ofc...
    if do_train:
        ## Just a test to have one LSTM for all packages
        for i in range(1):
            for pkg in dataset:
                ## random selection for mixed training
                ## CHANGE for dedicated models
                #pkg = random.choice(list(dataset.keys()))
                data = data_in[pkg]
                original_data = original_data_in[pkg]
                train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg, scalers[pkg])

    if do_test:
        for pkg in dataset:
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            pred_LSTM[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, False)

    return pred_LSTM
def predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train):
    """Build, optionally train, and evaluate ONE mixed LSTM shared by all packages.

    Package identity is fed to the network as a one-hot feature vector
    appended to every timestep. Returns a dict pkg -> prediction list.
    """
    # Model parameters
    do_test = True
    # Look back steps
    look_back = 9
    # One feature for the series value itself plus a one-hot package id.
    feat_num = 1 + len(dataset)
    # Number of lstm neurons
    num_neurons = feat_num + 10
    num_neurons2 = look_back
    extra_features = dict()
    ## Training steps
    training_steps = 600*25
    ## One-hot vector per package to signal package identity.
    # FIX: the original never incremented the position index, so every
    # package received the identical vector [1, 0, ..., 0]. The discarded
    # no-op `.reshape(-1, 1)` call was removed as well.
    for position, pkg in enumerate(dataset):
        extra_features[pkg] = np.asarray([0]*position + [1] + [0]*(len(dataset)-position-1))

    data_size = len(dataset[first])
    print(data_size)
    train_size = int(data_size - test_size)
    # Stateful Keras models need a fixed batch size = number of training windows.
    batch_num = train_size - num_steps - look_back + 1
    print("batch_num: ", batch_num)
    ## Create the NN with Keras
    model2 = Sequential()
    model2.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.4, stateful=True, return_sequences=True))
    model2.add(LSTM(num_neurons, activation ='relu', dropout=0.1, stateful=True))
    model2.add(Dense(1))
    Adam = opt.Adam(lr=0.001)
    model2.compile(loss='mean_squared_error', optimizer=Adam)
    Wsave = model2.get_weights()
    pred_LSTM_all = dict()
    data_in = dict()
    original_data_in = dict()
    scalers = dict()

    ## Prepare the data: scale each package separately to [0, 1].
    for pkg in dataset:
        data = dataset[pkg].reshape(-1,1)
        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
        scalers[pkg].fit(data)
        data_in[pkg] = scalers[pkg].transform(data).flatten()
        original_data = original[pkg].reshape(-1,1)
        original_data_in[pkg] = scalers[pkg].transform(original_data).flatten()

    ## Train only on demand.
    if do_train:
        # Live plot of the error curves during training.
        fig = plt.gcf()
        fig.clf()
        fig.show()
        fig.canvas.draw()
        trainerror = []
        valerror = []
        for i in range(training_steps):
            ## Random package selection for mixed training.
            pkg = random.choice(list(dataset.keys()))
            train_LSTM_all(original_data_in[pkg], data_in[pkg], num_steps, data_size, train_size, test_size, batch_num, model2, Wsave, feat_num, look_back, pkg, scalers[pkg], extra_features[pkg])
            if i % 25 == 0:
                calc_errors_all(model2, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features)
                plt.plot(trainerror, color='blue')
                plt.plot(valerror, color='red')
                plt.title('Mixed')
                fig.canvas.draw()
        try:
            model2.save('./models/all_packages'+str(num_steps) + '.h5')
            del model2
        except OSError:
            model2.save('./models/unknown-' + str(num_steps) + '.h5')

    if do_test:
        for pkg in dataset:
            pred_LSTM_all[pkg] = test_model(pkg, original_data_in[pkg], data_in[pkg], scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, True, feat_num, extra_features[pkg])
    return pred_LSTM_all
-
def load_dataset_smoothed(src2month, pkglist, smoothing):
    """Return (raw, smoothed) per-package series as numpy arrays.

    The smoothed series is a trailing moving average of width `smoothing`;
    both series have their first `smoothing` entries cut off so the
    moving-average warm-up region is discarded.
    """
    smoothed = dict()
    raw = dict()
    for pkg in pkglist:
        series = np.asarray(src2month[pkg])
        averaged = moving_average(series, n=smoothing)
        ## Cut off leading part
        smoothed[pkg] = averaged[smoothing:]
        raw[pkg] = series[smoothing:]
    return (raw, smoothed)
def print_all(data, pkglist):
    """Plot each package's series in its own panel of a 2x5 subplot grid."""
    plt.figure(1)
    for idx, pkg in enumerate(pkglist, start=1):
        plt.subplot(2, 5, idx)
        plt.plot(data[pkg], label=pkg)
        plt.legend()
    plt.show()
def print_pred(data, pred):
    """Overlay the real series (blue) and the model prediction (red)."""
    for series, color, label in ((data, 'blue', 'reality'),
                                 (pred, 'red', 'model')):
        plt.plot(series, color=color, label=label)
    plt.legend()
    plt.show()
def print_all_pred(data, pred, pkglist, test_size):
    """Paper figure: per package, plot train (grey) / test (coral) reality
    against the model prediction (blue) in a 1x5 subplot row.

    The x axis is labelled with two-digit years on every other year's first
    quarter, covering 2002Q2 onward.
    """
    carlosplt.pre_paper_plot(True)
    plt.figure(1)
    i = 1
    ## Build x axis
    quartersx = []
    for y in range(2,19):
        start = 1
        end = 5
        if y == 2:
            start = 2
        # NOTE(review): y == 19 is unreachable — range(2,19) stops at 18.
        elif y == 19:
            end = 2
        for j in range(start,end):
            # Label only the first quarter of odd years, e.g. "'03".
            if j==1 and y%2==1:
                quartersx.append('\''+str(y).zfill(2))
            else:
                quartersx.append(' ')
    for pkg in pkglist:
        data_local = data[pkg]
        data_local = data_local.flatten()
        ax = plt.subplot(1,5,i)
        data_train = data_local[:-test_size]
        data_test = data_local[-test_size:]
        # Shorter display names for the browsers/JDK.
        if pkg == 'firefox-esr':
            pkg_label = 'firefox'
        elif pkg == 'chromium-browser':
            pkg_label = 'chromium'
        elif pkg == 'openjdk-8':
            pkg_label = 'openjdk'
        else:
            pkg_label = pkg
        x_axis_train = []
        x_axis_test = []
        for j in range(len(data_train)):
            x_axis_train.append(j)
        for j in range(len(data_test)):
            x_axis_test.append(j+len(data_train))
        x_axis_all = x_axis_train + x_axis_test

        # Only the last (5th) panel carries the legend.
        if i==5:
            train=ax.plot(x_axis_train, data_train, color = 'grey', label='real-tr')
            test=ax.plot(x_axis_test, data_test, color = 'coral', label = 'real-te')
            model=ax.plot(x_axis_all,pred[pkg], color ='blue', label='model')
            ax.legend()
        else:
            ax.plot(x_axis_train, data_train, color = 'grey')
            ax.plot(x_axis_test, data_test, color = 'coral')
            ax.plot(x_axis_all,pred[pkg], color ='blue')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.set_title(pkg_label)
        plt.xticks(np.arange(1,len(pred[pkg]),3.0)+1,quartersx, rotation="vertical")
        i += 1
    carlosplt.post_paper_plot(True,True,True)
    plt.show()
def normalize_errors(errors_list, pkglist, dataset, test_size):
    """Scale each package's error by the value range of its test window.

    Mutates errors_list in place. A flat (or nearly flat) window is clamped
    to a range of 0.1 to avoid dividing by ~zero. `pkglist` is accepted for
    interface parity but not used.
    """
    for pkg in errors_list:
        window = dataset[pkg][-test_size:]
        span = np.amax(window) - np.amin(window)
        if span < 0.1:
            span = 0.1
        errors_list[pkg] = errors_list[pkg] / span
def compute_errors_perpackage(prediction, dataset, test_size):
    """Per-package RMSE over the last test_size months (the test window)."""
    return {
        pkg: math.sqrt(mean_squared_error(prediction[pkg][-test_size:],
                                          dataset[pkg][-test_size:]))
        for pkg in prediction
    }
def compute_errors_train_perpackage(prediction, dataset, test_size):
    """Per-package RMSE over everything BEFORE the last test_size months."""
    return {
        pkg: math.sqrt(mean_squared_error(prediction[pkg][:-test_size],
                                          dataset[pkg][:-test_size]))
        for pkg in prediction
    }
def find_best_Lamd(original, dataset, num_steps, test_size):
    """Grid-search the weighted-average decay Lamd in [1, 99] per package.

    Selection is by training-window error; returns the per-package
    predictions rebuilt with each package's winning Lamd.
    """
    pred_Waverage_temp = dict()
    pred_Waverage = dict()
    errors = dict()
    best_errors = dict()
    best_lamdas = dict()
    dataset_temp = dict()
    # Baseline: Lamd = 1.
    pred_Waverage = predict_Waverage(original, dataset, 1, num_steps)
    errors[1] = compute_errors_train_perpackage(pred_Waverage, dataset, test_size)
    best_errors = errors[1]

    for pkg in dataset:
        best_lamdas[pkg] = 1

    # NOTE(review): Lamd=1 is re-evaluated on the first iteration — redundant
    # but harmless.
    for Lamd in range(1,100):
        pred_Waverage_temp = predict_Waverage(original, dataset, Lamd, num_steps)
        ## To compute best lamda
        errors[Lamd] = compute_errors_train_perpackage(pred_Waverage_temp, dataset, test_size)
        for pkg in pred_Waverage_temp:
            print(errors[Lamd][pkg])
            if errors[Lamd][pkg] < best_errors[pkg]:
                best_errors[pkg] = errors[Lamd][pkg]
                best_lamdas[pkg] = Lamd
    # Rebuild the final predictions with each package's winning Lamd.
    for pkg in dataset:
        pred_Waverage[pkg] = predict_Waverage(original, dataset, best_lamdas[pkg], num_steps)[pkg]
    print(best_lamdas)
    return pred_Waverage
def do_training_errors(predictions_list, errors, dataset, test_size):
    """Fill errors[method] with per-package training-window errors.

    Mutates `errors` in place; methods whose predictions are missing or
    malformed are skipped with a message.
    """
    ## First for each package
    for method in predictions_list:
        try:
            errors[method] = compute_errors_train_perpackage(predictions_list[method], dataset, test_size)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            print('Predictions missing')
def do_testing_errors(predictions_list, errors, dataset, test_size):
    """Fill errors[method] with per-package test-window errors.

    Mutates `errors` in place; methods whose predictions are missing or
    malformed are skipped with a message.
    """
    for method in predictions_list:
        try:
            errors[method] = compute_errors_perpackage(predictions_list[method], dataset, test_size)
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate.
            print('Predictions missing')
def calculate_rmse(errors):
    """Square root of the summed squared per-package errors, divided by
    the number of packages."""
    squared_total = sum(err * err for err in errors.values())
    return math.sqrt(squared_total) / len(errors)
def calculate_mean(errors):
    """Arithmetic mean of the per-package errors."""
    return sum(errors.values()) / len(errors)
def print_summary(training_errors, testing_errors):
    """Print a per-method rmse/mean report for train and test errors.

    Expects each errors[method] dict to carry 'rmse' and 'mean' keys.
    Always returns 0.
    """
    banner = '#' * 80
    print(banner)
    print('***** REPORT *****')
    for method in training_errors:
        print(method)
        for label, table in (('Training', training_errors),
                             ('Testing', testing_errors)):
            print(label + ' Errors rmse: ', format(table[method]['rmse'], '.3f'))
        for label, table in (('Training', training_errors),
                             ('Testing', testing_errors)):
            print(label + ' Errors mean: ', format(table[method]['mean'], '.3f'))
    print(banner)

    return 0
def predict(src2month, k):
    """Driver: build baseline predictions for selected packages and report errors.

    Args:
        src2month: dict mapping package name -> monthly vulnerability counts.
            NOTE: the selected packages' series are truncated IN PLACE below.
        k: unused in this function — TODO confirm the caller's contract.
    """
    pkglist=[]
    for pkg in src2month:
        if (sum(src2month[pkg])>50):
            pkglist.append(pkg)

    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'wireshark', 'openjdk-8', 'mysql-transitional', 'php7.0', 'imagemagick', 'tcpdump']
    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'openjdk-8']
    # NOTE(review): the computed pkglist above is overridden here.
    pkglist = ['icedove']
    first = pkglist[0]
    #pkglist = [first]
    # Number of months in the future
    num_steps = 9
    smoothing = num_steps
    # Test dataset size in months
    test_size = 18
    real_test_size = 18
    do_train = False
    dataset = dict()
    # Cut out end of 2018
    for pkg in pkglist:
        ## This is for training
        if do_train:
            src2month[pkg] = src2month[pkg][:-9-real_test_size]
        ## This is for experiments
        else:
            src2month[pkg] = src2month[pkg][test_size:-9]
    (original,dataset) = load_dataset_smoothed(src2month, pkglist, smoothing)
    # Each point of dataset is the mean of the same point and the previous smoothing-1 of the original
    num_packages = len(pkglist)
    # Print all smoothed time-series
    #print_all(dataset, pkglist)

    ## Make simple predictions (stationary, average, waverage)
    predictions_list = dict()
    predictions_list['stationary'] = predict_stationary(original, dataset, num_steps)
    predictions_list['average'] = predict_average(original, dataset, num_steps)
    predictions_list['Waverage'] = find_best_Lamd(original, dataset, num_steps, test_size)
    #predictions_list['LSTM'] = predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train)
    #predictions_list['LSTM_all'] = predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train)

    #print_all_pred(dataset, predictions_list['LSTM'], pkglist, test_size)
    #pkglist_new=['linux','firefox-esr', 'chromium-browser', 'icedove']
    pkglist_new = pkglist
    print_all_pred(dataset, predictions_list['Waverage'], pkglist_new, test_size)
    training_errors = dict()
    ## Dictionary of training errors e.g. training_errors['LSTM']['linux'] = XXX
    testing_errors = dict()
    ## Same for testing errors
    new_predictions_list = dict()
    ## For which packages to compute the error?
    # Keep only packages with more than 200 total reported vulnerabilities.
    for method in predictions_list:
        new_predictions_list[method] = dict()
        for pkg in predictions_list[method]:
            if (sum(src2month[pkg])>200):
                new_predictions_list[method][pkg] = predictions_list[method][pkg]
    print(new_predictions_list)

    do_training_errors(new_predictions_list, training_errors, dataset, test_size)
    do_testing_errors(new_predictions_list, testing_errors, dataset, test_size)
    ## Now among the packages again rmse. But first we normalize. Choose whether we want this or not
    for method in training_errors:
        normalize_errors(training_errors[method], pkglist, dataset, test_size)
        normalize_errors(testing_errors[method], pkglist, dataset, test_size)
    # Per-package comparison of all methods.
    for pkg in training_errors['average']:
        print('#'*80)
        print(pkg)
        print('Training errors:')
        temp_list = []
        for method in training_errors:
            string = method + ': ' + str(training_errors[method][pkg]) + ' , '
            temp_list.append(string)
        print(temp_list)

        temp_list = []
        for method in training_errors:
            string = method + ': ' + str(testing_errors[method][pkg])
            temp_list.append(string)
        print('Testing errors:')
        print(temp_list)
    ## Now it is time for the rmse among the packages
    for method in testing_errors:
        testing_errors[method]['rmse'] = calculate_rmse(testing_errors[method])
        testing_errors[method]['mean'] = calculate_mean(testing_errors[method])
        training_errors[method]['rmse'] = calculate_rmse(training_errors[method])
        training_errors[method]['mean'] = calculate_mean(training_errors[method])

    print_summary(training_errors, testing_errors)

    return
|