- import numpy
- import matplotlib.pyplot as plt
- import pandas
- import math
- from keras.models import Sequential
- from keras.layers import Dense
- from keras.layers import LSTM
- from keras.layers import Activation, Dropout
- from keras.models import load_model
- from sklearn.preprocessing import MinMaxScaler
- from sklearn.metrics import mean_squared_error
- numpy.random.seed(7)
- # convert an array of values into a dataset matrix
- # NOTE: the returned arrays are shorter than the input by look_back + num_steps samples
- def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
- dataX, dataY = [], []
- for i in range(len(dataset)-look_back-num_steps):
- a = []
- for j in range(i, i+look_back):
- #a.append([dataset[j]] + meta)
- a.append([dataset[j]])
- dataX.append(a)
- # target: the mean of the next num_steps raw values after the look_back window
- dataY.append(numpy.mean(original_dataset[i+look_back:i+look_back+num_steps]))
- return numpy.array(dataX), numpy.array(dataY)
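- 
- # A minimal sketch of create_dataset on toy numbers (illustrative only; the
- # _demo_* names are not part of the real pipeline): with look_back=2 and
- # num_steps=3, each sample holds 2 consecutive smoothed values and the target
- # is the mean of the following 3 raw values.
- _demo_raw = numpy.arange(10, dtype=float)
- _demo_X, _demo_Y = create_dataset(_demo_raw, _demo_raw, [], 3, 2)
- print(_demo_X.shape) # (5, 2, 1) = (len - look_back - num_steps, look_back, 1)
- print(_demo_Y[0]) # 3.0 == numpy.mean(_demo_raw[2:5])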
- ## Calculate weighted average for comparison
- def calc_waverage(raw_av, lamda_w):
- # exponentially weighted average: entry jj gets weight exp(-(n - 1 - jj)/lamda_w),
- # so the most recent month has weight 1 and older months decay
- w_average = 0
- weights = 0
- if len(raw_av) == 0:
- w_average = 0
- else:
- for jj, j in enumerate(raw_av):
- w_average += j * math.exp(-(len(raw_av) - jj - 1)/lamda_w)
- weights += math.exp(-(len(raw_av) - jj - 1)/lamda_w)
- 
- w_average = w_average/weights
- return w_average
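- 
- # Small sketch of calc_waverage (toy numbers): with lamda_w=12 the weights on
- # [1.0, 2.0, 4.0] are exp(-2/12), exp(-1/12), exp(0), so the result (~2.42)
- # sits above the plain mean (~2.33), leaning toward the recent 4.0.
- print(calc_waverage([1.0, 2.0, 4.0], 12))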
- def predict(src2month, src2sloccount, src2pop, src2deps):
-
- ## Number of features
- feat_num = 1
-
- ## Model parameters
- do_train = False
- num_steps = 9
- smoothing = num_steps
- num_neurons = 10
- look_back = 4
- train_flag = True
- test_flag = True
- lamda_w = 12
- init_test_size = 18
- pkg_num = len(src2month)
- training_num = len(src2month['linux']) - 3
- trainXdict = dict()
- trainYdict = dict()
- testXdict = dict()
- testYdict = dict()
- train_size = int(len(src2month['linux']) - init_test_size)
- test_size = len(src2month['linux']) - train_size
- batch_num = train_size - num_steps - look_back - smoothing - 3
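- # each package series yields len(series) - look_back - num_steps samples once
- # the rolling-mean trim (smoothing) and the 3 dropped trailing months (assumed
- # incomplete data) are removed; batch_num is that count for the training split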
- print("batch_num:")
- print(batch_num)
-
- # create the LSTM network
- model = Sequential()
- model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.5, stateful=True))
- # model.add(Dense(32, activation='relu'))
- # model.add(Dense(16, activation='relu'))
- model.add(Dense(1))
- model.compile(loss='mean_squared_error', optimizer='adam')
- Wsave = model.get_weights()
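- # stateful=True keeps the LSTM state across batches, which is why the batch
- # size is fixed in batch_input_shape; Wsave stores the initial weights so the
- # network can be reset to the same starting point before each package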
-
- scaler = MinMaxScaler(feature_range=(0, 1))
- #scaler2 = MinMaxScaler(feature_range=(0,1))
- #scaler3 = MinMaxScaler(feature_range=(0,1))
-
- test_scale = []
- # for i in src2month:
- # test_scale = numpy.concatenate((test_scale, src2month[i]))
- # for j in src2month[i]:
- # test_scale.append(src2month[i][j])
-
- #test_scale = []
- #for i in src2pop:
- # test_scale.append(src2pop[i][1])
- #scaler2.fit(test_scale)
- #test_scale = []
-
- #for i in src2sloccount:
- # test_scale.append(src2sloccount[i][0])
-
- #scaler3.fit(test_scale)
- total_trainX = []
- total_trainY = []
- flag = True
- ###################################################################################################
- # for pkg_name in ['chromium-browser']:
- for i in range(1): # runs once; loop kept from experiments that repeated training (assumed)
- # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
- for pkg_name in src2month:
- pkg_num = len(src2month)
- dataset = src2month[pkg_name]
- dataset = dataset[:len(dataset)-3]
- print(dataset.shape)
- dataset = dataset.reshape(-1,1)
- scaler.fit(dataset)
- print(dataset.shape)
- dataset = dataset.flatten()
- print(dataset.shape)
- print(len(dataset))
- #
- if (sum(dataset)>70):
- # reset or not between training
- model.set_weights(Wsave)
- original_dataset = dataset
- dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
- ## drop the leading entries where the rolling mean is undefined (NaN)
- original_dataset = original_dataset[smoothing:]
- dataset = dataset[smoothing:]
- # normalize the dataset
- dataset = dataset.reshape(-1,1)
- dataset = scaler.transform(dataset)
- dataset = dataset.flatten()
- original_dataset = original_dataset.reshape(-1,1)
- original_dataset = scaler.transform(original_dataset)
- original_dataset = original_dataset.flatten()
- train_size = len(dataset) - init_test_size
- test_size = len(dataset) - train_size
- train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
- train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
- print(len(train), len(test))
- # get metadata
- meta = []
- #try:
- # pop_vote = src2pop[pkg_name][1]
- #except KeyError:
- # pop_vote = 0
- #try:
- # slocs_total = src2sloccount[pkg_name][0]
- #except KeyError:
- # slocs_total = 0
-
- #pop_vote = scaler2.transform([[pop_vote]])
- #slocs_total = scaler3.transform([[slocs_total]])
- #meta.append(pop_vote)
- #meta.append(slocs_total)
- # reshape into X=t and Y=t+1
- trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
- testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)
- # reshape input to be [samples, time steps, features]
- trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
- testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
- trainY = trainY.reshape(-1,1)
- testY = testY.reshape(-1,1)
- #if len(total_trainX) == 0:
- # total_trainX = trainX
- # total_trainY = trainY
- #else:
- # total_trainX = numpy.concatenate((total_trainX, trainX))
- # total_trainY = numpy.concatenate((total_trainY, trainY))
- # save to dict for later
- trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
- testXdict[pkg_name], testYdict[pkg_name] = testX, testY
-
- # fit the LSTM network
- if do_train:
- # train one epoch at a time so the LSTM state can be reset between epochs
- for i in range(4000):
- model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
- model.reset_states()
- try:
- model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
- except OSError:
- model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
- #else:
- # try:
- # model.save('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
- # except OSError:
- # model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
- # model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
- # model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
-
- ###################################################################################################
- # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt','w')
- target2 = open('results_paper' + str(num_steps) + '.txt','w')
- # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
- # for pkg_name in ['libpng']:
- for pkg_name in src2month:
-
- dataset = src2month[pkg_name]
- dataset = dataset[:len(dataset)-3]
- original_dataset = dataset
- dataset = dataset.reshape(-1,1)
- scaler.fit(dataset)
- dataset = dataset.flatten()
- original_dataset = original_dataset[smoothing:]
- original_dataset = original_dataset.reshape(-1,1)
- original_dataset = scaler.transform(original_dataset)
- original_dataset = original_dataset.flatten()
- if (sum(dataset)>90 and test_flag):
- dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
-
- #if (sum(dataset)>80):
- model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
- #else:
- # model = load_model('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
- dataset = dataset[smoothing:]
- # normalize the dataset
- dataset = dataset.reshape(-1,1)
- dataset = scaler.transform(dataset)
- dataset = dataset.flatten()
- model.reset_states()
-
- totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
- #trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
- #testX, testY = testXdict[pkg_name], testYdict[pkg_name]
- #print(trainX.shape, trainY.shape, testX.shape, testY.shape)
- #
- #print(numpy.shape(trainX), numpy.shape(testX), numpy.shape(dataset))
- # make predictions
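- # totalX[18:] skips the first init_test_size windows (assumed intent: match
- # the number of windows fed to the stateful model to its fixed batch size)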
- totalPredict = model.predict(totalX[18:], batch_size = batch_num)
- #model.reset_states()
- #test_dataset = numpy.concatenate((trainX[len(testX):], testX))
- #testPredict = model.predict(test_dataset, batch_size = batch_num)
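- # fixed-offset split of the predictions (assumed reading): everything up to
- # the last 10 windows is the training fit, the 10th-from-last window is the
- # evaluation point, and the last window is the test point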
- trainPredict = totalPredict[:-10]
- evaluatePredict = totalPredict[-10]
- testPredict = totalPredict[-1]
- model.reset_states()
- # invert predictions
- #testPredict = testPredict[-len(testX):]
- trainPredict = trainPredict.reshape(-1,1)
- trainPredict = scaler.inverse_transform(trainPredict)
- trainPredict = trainPredict.flatten()
-
- evaluatePredict = evaluatePredict.reshape(-1,1)
- evaluatePredict = scaler.inverse_transform(evaluatePredict)
- evaluatePredict = evaluatePredict.flatten()
-
- #trainY = trainY.reshape(-1,1)
- #trainY = scaler.inverse_transform(trainY)
- #trainY = trainY.flatten()
- testPredict = testPredict.reshape(-1,1)
- testPredict = scaler.inverse_transform(testPredict)
- testPredict = testPredict.flatten()
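- # testPredict[0] is the predicted mean monthly count, so *9 (== num_steps)
- # gives a 9-month total; [-13:-4] selects the matching 9 raw months, offset
- # to line up with the 3 trailing months dropped earlier (assumed alignment)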
- prediction = testPredict[0]*9
- reality = sum(src2month[pkg_name][-13:-4])
- testerror = (reality - prediction)/reality
- print(pkg_name)
- print('prediction: ' + str(prediction))
- print('reality: ' + str(scaler.inverse_transform([[totalY[-1]]])[0][0]*9) + ' = ' + str(reality))
- print('Normalized error: ' + str(testerror))
- print('#' * 20)
- #print('shapes coming')
- #print(testY.shape)
- #testY = testY.reshape(-1,1)
- #testY = scaler.inverse_transform(testY)
- #testY = testY.flatten()
- #print(testY.shape)
-
- ## Calculate average for comparison
- raw_av = src2month[pkg_name]
- #raw_av = raw_av[:len(dataset)-4]
- #i = 0
- #averagear = numpy.empty_like(trainY)
- #w_averagear = numpy.empty_like(trainY)
- #averagear[0] = 0
- #w_averagear[0] = 0
- #for i in range(1,len(averagear)):
- # averagear[i] = sum(raw_av[:i])/float(i)
- # w_averagear[i] = calc_waverage(raw_av[:i], lamda_w)
- #averagear_test = numpy.empty_like(testY)
- #w_averagear_test = numpy.empty_like(testY)
- #for i in range(0, len(averagear_test)):
- # averagear_test[i] = sum(raw_av[:len(averagear) + i])/float(len(averagear) + i)
- # w_averagear_test[i] = calc_waverage(raw_av[:len(averagear) + i], lamda_w)
- #print('average: ' + str(num_steps*average) + '\nweighted average: ' + str(num_steps*w_average))
- ## calculate root mean squared error
- # LSTM
- #trainScore = math.sqrt(mean_squared_error(trainY[-9:], trainPredict[-9:]))
- #print('Train Score: %.2f RMSE' % (trainScore))
- #testScore = math.sqrt(mean_squared_error(testY, testPredict))
- #print('Test Score: %.2f RMSE' % (testScore))
- # Average
- #trainScore_average = math.sqrt(mean_squared_error(trainY[-9:], averagear[-9:]))
- #testScore_average = math.sqrt(mean_squared_error(testY, averagear_test))
- #trainScore_w_average = math.sqrt(mean_squared_error(trainY[-9:], w_averagear[-9:]))
- #testScore_w_average = math.sqrt(mean_squared_error(testY, w_averagear_test))
- # Imitation
- #imit_train = numpy.copy(trainY)
- #imit_train = imit_train[:-1]
- #print(imit_train)
- #trainY_im = trainY[1:]
- #print(trainY_im)
-
- #imit_test = numpy.copy(testY)
- #imit_test = imit_test[:-1]
- #testY_im = testY[1:]
- #trainScore_imit = math.sqrt(mean_squared_error(trainY_im, imit_train))
- #testScore_imit = math.sqrt(mean_squared_error(testY_im, imit_test))
- # Calculate nrmse for certainty
- #nmax_train = numpy.amax(trainY[-24:])
- #nmin_train = numpy.amin(trainY[-24:])
- #nmax_test = numpy.amax(testY)
- #nmin_test = numpy.amin(testY)
- #nmax = 0
- #nmin = 0
- #if(nmax_train > nmax_test):
- # nmax = nmax_train
- #else:
- # nmax = nmax_test
-
- #if(nmin_train < nmin_test):
- # nmin = nmin_train
- #else:
- # nmin = nmin_test
-
- #normalizer = nmax - nmin
- #print('nmax: ' + str(nmax) + ' , nmin: ' + str(nmin) + ' , normalizer: ' + str(normalizer))
- # plot baseline and predictions
- #print(numpy.shape(trainY), numpy.shape(testY))
- #print(numpy.shape(trainPredict), numpy.shape(testPredict))
- #real_values = numpy.concatenate((trainY, testY), axis=0)
- #training_fit = numpy.empty_like(real_values)
- #training_fit[:] = numpy.nan
- #training_fit[:len(trainPredict)] = trainPredict
- #prediction = numpy.empty_like(real_values)
- #prediction[:] = numpy.nan
- #prediction[len(trainPredict):] = testPredict
-
-
- #print('Actual number of vulnerabilities - next ' + str(num_steps) + ' months :' + str(real_values[-1]*num_steps))
- #act = real_values[-1]*num_steps
- #print('Predicted number of vulnerabilities - next ' + str(num_steps) + ' months :' + str(prediction[-1]*num_steps))
- #lstm_pred = prediction[-1]*num_steps
- #av_pred = averagear_test[-1]*num_steps
- #w_av_pred = w_averagear_test[-1]*num_steps
- #print(real_values)
- #plt.plot(real_values)
- #plt.plot(training_fit)
- #plt.plot(prediction)
- #plt.show()
-
- ## Calculate better predictor
- #best = -1
- #min_test_error = min(testScore, testScore_average, testScore_w_average)
- #if min_test_error == testScore:
- # best = 0
- #elif min_test_error == testScore_average:
- # best = 1
- #elif min_test_error == testScore_w_average:
- # best = 2
- #print("LSTM rmse: " + str(testScore))
- #print("Average rmse: " + str(testScore_average))
- #print("Weighted average rmse: " + str(testScore_w_average))
-
- # ## Write to file
- #target.write(pkg_name + ', ' + str(prediction[-1]*num_steps) + ', ' +str(real_values[-1]*num_steps) + ', ' + str(nrmse_train) + ', ' + str(nrmse_test) + ', ' + str(num_steps*average) + ', ' + str(num_steps*w_average) + ', ' + str(best) + '\n')
- # if(actual != 0):
- # norm_diff = abs(actual-predicted)/actual
- # else:
- # norm_diff = actual-predicted
- #target2.write(pkg_name + ',' + str(normalizer) + ',' + str(trainScore/normalizer) + ',' + str(trainScore_average/normalizer) + ',' + str(trainScore_w_average/normalizer) + ',' + str(best) + ', ' + str(len(trainY)) + ', ' + str(len(testY)) + ', ' + str(act) + ', ' + str(lstm_pred) + ', ' + str(av_pred) + ', ' + str(w_av_pred) + '\n')