import math

import numpy
import pandas
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)


# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES THE SIZE OF THE INPUT
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1 - num_steps):
        a = []
        for j in range(i, i + look_back):
            # a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        # the target is the mean of the next num_steps raw (unsmoothed) values
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)


## Calculate exponentially weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    w_average = 0
    weights = 0
    if len(raw_av) == 0:
        w_average = 0
    else:
        jj = 0
        for j in raw_av:
            w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            jj += 1
        w_average = w_average / weights
    return w_average


def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1

    ## Model parameters
    do_train = False
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 3
    train_flag = True
    test_flag = True
    lamda_w = 12
    init_test_size = 18

    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 4

    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 5
    print("batch_num:")
    print(batch_num)

    # create the LSTM network (Keras 2 API: 'dropout' replaces the old 'dropout_W')
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num),
                   activation='relu', dropout=0.5, stateful=True))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # keep the initial weights so the model can be reset between packages
    Wsave = model.get_weights()

    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler2 = MinMaxScaler(feature_range=(0, 1))
    # scaler3 = MinMaxScaler(feature_range=(0, 1))

    test_scale = []
    # for i in src2month:
    #     test_scale = numpy.concatenate((test_scale, src2month[i]))
    #     for j in src2month[i]:
    #         test_scale.append(src2month[i][j])

    # test_scale = []
    # for i in src2pop:
    #     test_scale.append(src2pop[i][1])
    # scaler2.fit(test_scale)

    # test_scale = []
    # for i in src2sloccount:
    #     test_scale.append(src2sloccount[i][0])
    # scaler3.fit(test_scale)

    total_trainX = []
    total_trainY = []
    flag = True

    ###############################################################################################
    # for pkg_name in ['chromium-browser']:
    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    for i in range(1):
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            # drop the last 4 (still incomplete) months
            dataset = dataset[:len(dataset) - 4]
            print(dataset.shape)
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            print(dataset.shape)
            dataset = dataset.flatten()
            print(dataset.shape)
            print(len(dataset))

            # if (sum(dataset) > 30):
            # reset (or not) the weights between packages
            model.set_weights(Wsave)
            original_dataset = dataset
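            # The steps below smooth the counts and cut them into supervised
            # windows via create_dataset(). Illustrative sketch with made-up
            # numbers (not real data): with look_back=3 and num_steps=2, a
            # series [2, 0, 1, 3, 1, ...] yields a first input window
            # [[2], [0], [1]] and the target mean([3, 1]) = 2.0, i.e. the
            # model maps a window of history onto the average vulnerability
            # rate over the next num_steps months.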
            # smooth the series with a rolling mean (omit this step for no
            # smoothing); Series.rolling() replaces the removed pandas.rolling_mean()
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]

            # normalize the dataset
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()

            train_size = len(dataset) - init_test_size
            test_size = len(dataset) - train_size
            train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
            train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
            print(len(train), len(test))

            # get metadata
            meta = []
            # try:
            #     pop_vote = src2pop[pkg_name][1]
            # except KeyError:
            #     pop_vote = 0
            # try:
            #     slocs_total = src2sloccount[pkg_name][0]
            # except KeyError:
            #     slocs_total = 0
            # pop_vote = scaler2.transform([[pop_vote]])
            # slocs_total = scaler3.transform([[slocs_total]])
            # meta.append(pop_vote)
            # meta.append(slocs_total)

            # reshape into X=t and Y=t+1
            trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
            testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)

            # reshape input to be [samples, time steps, features]
            trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
            testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
            trainY = trainY.reshape(-1, 1)
            testY = testY.reshape(-1, 1)

            # if len(total_trainX) == 0:
            #     total_trainX = trainX
            #     total_trainY = trainY
            # else:
            #     total_trainX = numpy.concatenate((total_trainX, trainX))
            #     total_trainY = numpy.concatenate((total_trainY, trainY))

            # save to dict for later
            trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
            testXdict[pkg_name], testYdict[pkg_name] = testX, testY

            # fit the LSTM network one epoch at a time so the state can be
            # reset manually between epochs ('epochs' replaces the old 'nb_epoch')
            if do_train:
                for i in range(20000):
                    model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
                    model.reset_states()

                try:
                    model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                except OSError:
                    model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            # else:
            #     try:
            #         model.save('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            #     except OSError:
            #         model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')

    # model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    ###############################################################################################

    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt', 'w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')

    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset) - 4]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()

        # only evaluate packages with a meaningful vulnerability history
        if sum(dataset) > 30 and test_flag:
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            # if (sum(dataset) > 80):
            model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            # else:
            #     model = load_model('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            dataset = dataset[smoothing:]
            model.reset_states()
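            # Keras stateful LSTMs must be fed the exact batch size they were
            # built with (batch_num), so below the test windows are appended
            # after trailing training windows to form one full batch; only the
            # last len(testX) outputs are kept as the actual test forecast.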
            trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            print(trainX.shape, trainY.shape, testX.shape, testY.shape)

            # normalize the dataset
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            print(numpy.shape(trainX), numpy.shape(testX), numpy.shape(dataset))

            # make predictions
            trainPredict = model.predict(trainX, batch_size=batch_num)
            model.reset_states()
            test_dataset = numpy.concatenate((trainX[len(testX):], testX))
            testPredict = model.predict(test_dataset, batch_size=batch_num)
            model.reset_states()

            # invert predictions back to the original scale
            testPredict = testPredict[-len(testX):]
            trainPredict = trainPredict.reshape(-1, 1)
            trainPredict = scaler.inverse_transform(trainPredict)
            trainPredict = trainPredict.flatten()
            trainY = trainY.reshape(-1, 1)
            trainY = scaler.inverse_transform(trainY)
            trainY = trainY.flatten()
            testPredict = testPredict.reshape(-1, 1)
            testPredict = scaler.inverse_transform(testPredict)
            testPredict = testPredict.flatten()
            print('shapes coming')
            print(testY.shape)
            testY = testY.reshape(-1, 1)
            testY = scaler.inverse_transform(testY)
            testY = testY.flatten()
            print(testY.shape)

            ## Calculate plain and weighted averages for comparison
            raw_av = src2month[pkg_name]
            raw_av = raw_av[:len(dataset) - 4]
            averagear = numpy.empty_like(trainY)
            w_averagear = numpy.empty_like(trainY)
            averagear[0] = 0
            w_averagear[0] = 0
            for i in range(1, len(averagear)):
                averagear[i] = sum(raw_av[:i]) / float(i)
                w_averagear[i] = calc_waverage(raw_av[:i], lamda_w)

            averagear_test = numpy.empty_like(testY)
            w_averagear_test = numpy.empty_like(testY)
            for i in range(0, len(averagear_test)):
                averagear_test[i] = sum(raw_av[:len(averagear) + i]) / float(len(averagear) + i)
                w_averagear_test[i] = calc_waverage(raw_av[:len(averagear) + i], lamda_w)

            print(pkg_name)
            # print('average: ' + str(num_steps * average) + '\nweighted average: ' + str(num_steps * w_average))

            ## calculate root mean squared errors
            # LSTM
            trainScore = math.sqrt(mean_squared_error(trainY, trainPredict))
            # print('Train Score: %.2f RMSE' % (trainScore))
            testScore = math.sqrt(mean_squared_error(testY, testPredict))
            # print('Test Score: %.2f RMSE' % (testScore))
            # Average
            trainScore_average = math.sqrt(mean_squared_error(trainY, averagear))
            testScore_average = math.sqrt(mean_squared_error(testY, averagear_test))
            trainScore_w_average = math.sqrt(mean_squared_error(trainY, w_averagear))
            testScore_w_average = math.sqrt(mean_squared_error(testY, w_averagear_test))
            # Imitation (previous value carried forward one step)
            imit_train = numpy.copy(trainY)
            imit_train = imit_train[:-1]
            # print(imit_train)
            trainY_im = trainY[1:]
            # print(trainY_im)
            imit_test = numpy.copy(testY)
            imit_test = imit_test[:-1]
            testY_im = testY[1:]
            trainScore_imit = math.sqrt(mean_squared_error(trainY_im, imit_train))
            testScore_imit = math.sqrt(mean_squared_error(testY_im, imit_test))

            # Calculate the normalizer for the NRMSE
            nmax_train = numpy.amax(trainY[-24:])
            nmin_train = numpy.amin(trainY[-24:])
            nmax_test = numpy.amax(testY)
            nmin_test = numpy.amin(testY)
            nmax = max(nmax_train, nmax_test)
            nmin = min(nmin_train, nmin_test)
            normalizer = nmax - nmin
            print('nmax: ' + str(nmax) + ' , nmin: ' + str(nmin) + ' , normalizer: ' + str(normalizer))

            # plot baseline and predictions
            # print(numpy.shape(trainY), numpy.shape(testY))
            # print(numpy.shape(trainPredict), numpy.shape(testPredict))
            real_values = numpy.concatenate((trainY, testY), axis=0)
            training_fit = numpy.empty_like(real_values)
            training_fit[:] = numpy.nan
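            # training_fit and prediction are NaN-padded arrays aligned with
            # real_values, so matplotlib draws the training fit and the test
            # forecast as separate segments on one shared time axis.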
            training_fit[:len(trainPredict)] = trainPredict
            prediction = numpy.empty_like(real_values)
            prediction[:] = numpy.nan
            prediction[len(trainPredict):] = testPredict

            print('Actual number of vulnerabilities - next ' + str(num_steps) + ' months: ' + str(real_values[-1] * num_steps))
            act = real_values[-1] * num_steps
            print('Predicted number of vulnerabilities - next ' + str(num_steps) + ' months: ' + str(prediction[-1] * num_steps))
            lstm_pred = prediction[-1] * num_steps
            av_pred = averagear_test[-1] * num_steps
            w_av_pred = w_averagear_test[-1] * num_steps
            # print(real_values)
            plt.plot(real_values)
            plt.plot(training_fit)
            plt.plot(prediction)
            plt.show()

            ## Determine the best predictor on the test set
            best = -1
            min_test_error = min(testScore, testScore_average, testScore_w_average)
            if min_test_error == testScore:
                best = 0
            elif min_test_error == testScore_average:
                best = 1
            elif min_test_error == testScore_w_average:
                best = 2

            print("LSTM rmse: " + str(testScore))
            print("Average rmse: " + str(testScore_average))
            print("Weighted average rmse: " + str(testScore_w_average))

            ## Write to file
            # target.write(pkg_name + ', ' + str(prediction[-1] * num_steps) + ', ' + str(real_values[-1] * num_steps) + ', ' + str(nrmse_train) + ', ' + str(nrmse_test) + ', ' + str(num_steps * average) + ', ' + str(num_steps * w_average) + ', ' + str(best) + '\n')
            # if actual != 0:
            #     norm_diff = abs(actual - predicted) / actual
            # else:
            #     norm_diff = actual - predicted
            target2.write(pkg_name + ',' + str(normalizer) + ',' + str(testScore / normalizer) + ',' +
                          str(testScore_average / normalizer) + ',' + str(testScore_w_average / normalizer) + ',' +
                          str(best) + ', ' + str(len(trainY)) + ', ' + str(len(testY)) + ', ' + str(act) + ', ' +
                          str(lstm_pred) + ', ' + str(av_pred) + ', ' + str(w_av_pred) + '\n')

    # flush the per-package results to disk
    target2.close()
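
# A minimal usage sketch (hypothetical: the real caller and data layout are
# not part of this file). It assumes src2month maps a source-package name to
# a numpy array of monthly vulnerability counts; the metadata dicts may stay
# empty because the branches that would use them are commented out above.
# Note: with do_train = False the function expects pre-trained models under
# ./models/, so set do_train = True for the first run.
if __name__ == '__main__':
    rng = numpy.random.RandomState(7)
    src2month = {
        # ~15 years of made-up monthly counts; 'linux' must be present because
        # predict() uses it to size the training window.
        'linux': rng.poisson(lam=5.0, size=180).astype(float),
        'firefox-esr': rng.poisson(lam=3.0, size=180).astype(float),
    }
    predict(src2month, src2sloccount={}, src2pop={}, src2deps={})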