import math

import numpy
import pandas
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)


# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES THE SIZE OF THE INPUT
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - num_steps):
        # each sample is a window of look_back consecutive (smoothed) values
        a = []
        for j in range(i, i + look_back):
            # a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        # the target is the mean of the next num_steps values of the
        # un-smoothed original series
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)


## Calculate weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    # exponentially decaying weights: the most recent month gets weight 1,
    # older months decay with time constant lamda_w
    w_average = 0
    weights = 0
    if len(raw_av) == 0:
        w_average = 0
    else:
        jj = 0
        for j in raw_av:
            w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            jj += 1
        w_average = w_average / weights
    return w_average


def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1

    ## Model parameters
    do_train = False
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 4
    train_flag = True
    test_flag = True
    lamda_w = 12
    init_test_size = 18

    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 3  # fixed: was len(src2month['linux']-3)

    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 3
    print("batch_num:")
    print(batch_num)

    # create the LSTM network; stateful, so the batch size is fixed at build time
    model = Sequential()
    model.add(LSTM(num_neurons,
                   batch_input_shape=(batch_num, look_back, feat_num),
                   activation='relu', dropout=0.5, stateful=True))
    # model.add(LSTM(..., recurrent_dropout=0.4, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=True, unroll=False))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    Wsave = model.get_weights()

    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler2 = MinMaxScaler(feature_range=(0, 1))
    # scaler3 = MinMaxScaler(feature_range=(0, 1))
    test_scale = []
    # for i in src2month:
    #     test_scale = numpy.concatenate((test_scale, src2month[i]))
    #     for j in src2month[i]:
    #         test_scale.append(src2month[i][j])

    # test_scale = []
    # for i in src2pop:
    #     test_scale.append(src2pop[i][1])
    # scaler2.fit(test_scale)

    # test_scale = []
    # for i in src2sloccount:
    #     test_scale.append(src2sloccount[i][0])
    # scaler3.fit(test_scale)

    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    # for pkg_name in ['chromium-browser']:
    for i in range(1):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            dataset = dataset[:len(dataset) - 3]
            print(dataset.shape)
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            print(dataset.shape)
            dataset = dataset.flatten()
            print(dataset.shape)
            print(len(dataset))
            # if (sum(dataset) > 70):
            # reset (or not) between training runs
            model.set_weights(Wsave)
            original_dataset = dataset
            # pandas.rolling_mean was removed from pandas; rolling().mean()
            # is the equivalent. Omit this line to skip the rolling mean.
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
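            # Illustration with made-up numbers (not part of the pipeline):
            # pandas.Series([1, 2, 3, 4]).rolling(window=3).mean() yields
            # [nan, nan, 2.0, 3.0]; the leading nan entries created here are
            # discarded by the dataset[smoothing:] slice just below.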
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]

            # normalize the dataset
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()

            train_size = len(dataset) - init_test_size
            test_size = len(dataset) - train_size
            train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
            train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
            print(len(train), len(test))

            # get metadata
            meta = []
            # try:
            #     pop_vote = src2pop[pkg_name][1]
            # except KeyError:
            #     pop_vote = 0
            # try:
            #     slocs_total = src2sloccount[pkg_name][0]
            # except KeyError:
            #     slocs_total = 0
            # pop_vote = scaler2.transform([[pop_vote]])
            # slocs_total = scaler3.transform([[slocs_total]])
            # meta.append(pop_vote)
            # meta.append(slocs_total)

            # reshape into X=t and Y=t+1
            trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
            testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)

            # reshape input to be [samples, time steps, features]
            trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
            testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
            trainY = trainY.reshape(-1, 1)  # fixed: the reshaped result was discarded before
            testY = testY.reshape(-1, 1)

            # if len(total_trainX) == 0:
            #     total_trainX = trainX
            #     total_trainY = trainY
            # else:
            #     total_trainX = numpy.concatenate((total_trainX, trainX))
            #     total_trainY = numpy.concatenate((total_trainY, trainY))

            # save to dict for later
            trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
            testXdict[pkg_name], testYdict[pkg_name] = testX, testY

            # fit the LSTM network; a stateful LSTM keeps its cell state
            # across batches, so states are reset manually after each epoch
            if do_train:
                for i in range(4000):
                    model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
                    model.reset_states()
                try:
                    model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                except OSError:
                    model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                # else:
                #     try:
                #         model.save('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #     except OSError:
                #         model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')

    # model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    ###################################################################################################

    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt', 'w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')

    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset) - 3]
        original_dataset = dataset
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        original_dataset = original_dataset[smoothing:]
        original_dataset = original_dataset.reshape(-1, 1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()

        if (sum(dataset) > 90 and test_flag):
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            # if (sum(dataset) > 80):
            model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
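            # Sketch of an optional sanity check (an assumption, not part of
            # the original flow): the restored model keeps the fixed
            # batch_input_shape it was built with, so predictions below must
            # use that same batch size, e.g.:
            #   assert model.input_shape == (batch_num, look_back, feat_num)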
            # else:
            #     model = load_model('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            dataset = dataset[smoothing:]

            # normalize the dataset
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()

            model.reset_states()
            # meta is left over from the training loop above (always [] here)
            totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
            # trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            # testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            # print(trainX.shape, trainY.shape, testX.shape, testY.shape)
            # print(numpy.shape(trainX), numpy.shape(testX), numpy.shape(dataset))

            # make predictions
            totalPredict = model.predict(totalX[18:], batch_size=batch_num)
            # model.reset_states()
            # test_dataset = numpy.concatenate((trainX[len(testX):], testX))
            # testPredict = model.predict(test_dataset, batch_size=batch_num)
            trainPredict = totalPredict[:-10]
            evaluatePredict = totalPredict[-10]
            testPredict = totalPredict[-1]
            model.reset_states()

            # invert predictions
            # testPredict = testPredict[-len(testX):]
            trainPredict = trainPredict.reshape(-1, 1)
            trainPredict = scaler.inverse_transform(trainPredict)
            trainPredict = trainPredict.flatten()
            evaluatePredict = evaluatePredict.reshape(-1, 1)
            evaluatePredict = scaler.inverse_transform(evaluatePredict)
            evaluatePredict = evaluatePredict.flatten()
            # trainY = trainY.reshape(-1, 1)
            # trainY = scaler.inverse_transform(trainY)
            # trainY = trainY.flatten()
            testPredict = testPredict.reshape(-1, 1)
            testPredict = scaler.inverse_transform(testPredict)
            testPredict = testPredict.flatten()

            # predicted monthly average times the horizon (num_steps == 9)
            prediction = testPredict[0] * num_steps
            reality = sum(src2month[pkg_name][-13:-4])
            testerror = (reality - prediction) / reality
            print(pkg_name)
            print('prediction: ' + str(prediction))
            # fixed: inverse_transform needs a 2-D input, so wrap the scalar
            print('reality: ' + str(scaler.inverse_transform([[totalY[-1]]])[0][0] * num_steps) + ' = ' + str(reality))
            print('Normalized error: ' + str(testerror))
            print('#' * 20)
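            # Worked example with made-up numbers: a predicted monthly
            # average of 2.0 gives prediction = 2.0 * 9 = 18; if reality is
            # 24, the normalized error is (24 - 18) / 24 = 0.25, i.e. a 25%
            # under-estimate.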
            # print('shapes coming')
            # print(testY.shape)
            # testY = testY.reshape(-1, 1)
            # testY = scaler.inverse_transform(testY)
            # testY = testY.flatten()
            # print(testY.shape)

            ## Calculate average for comparison
            raw_av = src2month[pkg_name]
            # raw_av = raw_av[:len(dataset) - 4]
            # i = 0
            # averagear = numpy.empty_like(trainY)
            # w_averagear = numpy.empty_like(trainY)
            # averagear[0] = 0
            # w_averagear[0] = 0
            # for i in range(1, len(averagear)):
            #     averagear[i] = sum(raw_av[:i]) / float(i)
            #     w_averagear[i] = calc_waverage(raw_av[:i], lamda_w)

            # averagear_test = numpy.empty_like(testY)
            # w_averagear_test = numpy.empty_like(testY)
            # for i in range(0, len(averagear_test)):
            #     averagear_test[i] = sum(raw_av[:len(averagear) + i]) / float(len(averagear) + i)
            #     w_averagear_test[i] = calc_waverage(raw_av[:len(averagear) + i], lamda_w)

            # print('average: ' + str(num_steps * average) + '\nweighted average: ' + str(num_steps * w_average))

            ## calculate root mean squared error
            # LSTM
            # trainScore = math.sqrt(mean_squared_error(trainY[-9:], trainPredict[-9:]))
            # print('Train Score: %.2f RMSE' % (trainScore))
            # testScore = math.sqrt(mean_squared_error(testY, testPredict))
            # print('Test Score: %.2f RMSE' % (testScore))

            # Average
            # trainScore_average = math.sqrt(mean_squared_error(trainY[-9:], averagear[-9:]))
            # testScore_average = math.sqrt(mean_squared_error(testY, averagear_test))
            # trainScore_w_average = math.sqrt(mean_squared_error(trainY[-9:], w_averagear[-9:]))
            # testScore_w_average = math.sqrt(mean_squared_error(testY, w_averagear_test))

            # Imitation
            # imit_train = numpy.copy(trainY)
            # imit_train = imit_train[:-1]
            # print(imit_train)
            # trainY_im = trainY[1:]
            # print(trainY_im)
            # imit_test = numpy.copy(testY)
            # imit_test = imit_test[:-1]
            # testY_im = testY[1:]
            # trainScore_imit = math.sqrt(mean_squared_error(trainY_im, imit_train))
            # testScore_imit = math.sqrt(mean_squared_error(testY_im, imit_test))

            # Calculate nrmse for certainty
            # nmax_train = numpy.amax(trainY[-24:])
            # nmin_train = numpy.amin(trainY[-24:])
            # nmax_test = numpy.amax(testY)
            # nmin_test = numpy.amin(testY)
            # nmax = 0
            # nmin = 0
            # if (nmax_train > nmax_test):
            #     nmax = nmax_train
            # else:
            #     nmax = nmax_test
            # if (nmin_train < nmin_test):
            #     nmin = nmin_train
            # else:
            #     nmin = nmin_test
            # normalizer = nmax - nmin
            # print('nmax: ' + str(nmax) + ' , nmin: ' + str(nmin) + ' , normalizer: ' + str(normalizer))

            # plot baseline and predictions
            # print(numpy.shape(trainY), numpy.shape(testY))
            # print(numpy.shape(trainPredict), numpy.shape(testPredict))
            # real_values = numpy.concatenate((trainY, testY), axis=0)
            # training_fit = numpy.empty_like(real_values)
            # training_fit[:] = numpy.nan
            # training_fit[:len(trainPredict)] = trainPredict
            # prediction = numpy.empty_like(real_values)
            # prediction[:] = numpy.nan
            # prediction[len(trainPredict):] = testPredict
            # print('Actual number of vulnerabilities - next ' + str(num_steps) + ' months: ' + str(real_values[-1] * num_steps))
            # act = real_values[-1] * num_steps
            # print('Predicted number of vulnerabilities - next ' + str(num_steps) + ' months: ' + str(prediction[-1] * num_steps))
            # lstm_pred = prediction[-1] * num_steps
            # av_pred = averagear_test[-1] * num_steps
            # w_av_pred = w_averagear_test[-1] * num_steps
            # print(real_values)
            # plt.plot(real_values)
            # plt.plot(training_fit)
            # plt.plot(prediction)
            # plt.show()

            ## Calculate better predictor
            # best = -1
            # min_test_error = min(testScore, testScore_average, testScore_w_average)
            # if min_test_error == testScore:
            #     best = 0
            # elif min_test_error == testScore_average:
            #     best = 1
            # elif min_test_error == testScore_w_average:
            #     best = 2

            # print("LSTM rmse: " + str(testScore))
            # print("Average rmse: " + str(testScore_average))
            # print("Weighted average rmse: " + str(testScore_w_average))

            ## Write to file
            # target.write(pkg_name + ', ' + str(prediction[-1] * num_steps) + ', ' + str(real_values[-1] * num_steps) + ', ' + str(nrmse_train) + ', ' + str(nrmse_test) + ', ' + str(num_steps * average) + ', ' + str(num_steps * w_average) + ', ' + str(best) + '\n')
            # if (actual != 0):
            #     norm_diff = abs(actual - predicted) / actual
            # else:
            #     norm_diff = actual - predicted
            # target2.write(pkg_name + ',' + str(normalizer) + ',' + str(trainScore / normalizer) + ',' + str(trainScore_average / normalizer) + ',' + str(trainScore_w_average / normalizer) + ',' + str(best) + ', ' + str(len(trainY)) + ', ' + str(len(testY)) + ', ' + str(act) + ', ' + str(lstm_pred) + ', ' + str(av_pred) + ', ' + str(w_av_pred) + '\n')
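# Hypothetical usage sketch (the real dictionaries are built elsewhere in
# this codebase): src2month maps a source-package name to a numpy array of
# monthly vulnerability counts; src2sloccount, src2pop and src2deps are only
# read by code that is currently commented out.
#
#   src2month = {'linux': numpy.random.poisson(5, 240).astype(float)}
#   predict(src2month, src2sloccount={}, src2pop={}, src2deps={})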