# NOTE: removed a garbled run of fused line numbers (extraction artifact from a bad paste).
# Standard library.
import math

# Third-party: numerics, plotting, model stack.
import numpy
import pandas
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Activation
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Fix the RNG seed so runs are reproducible.
numpy.random.seed(7)
# convert an array of values into a dataset matrix
def create_dataset(dataset, meta, look_back=1):
    """Slice a 1-D series into (window, next-value) supervised pairs.

    Each sample X holds the `look_back` consecutive values that precede
    its target Y (the value immediately after the window).

    Args:
        dataset: 1-D indexable sequence of numeric values.
        meta: per-package metadata; currently unused (the line that would
            append it per timestep is disabled), kept for interface
            compatibility with callers.
        look_back: number of timesteps per input window.

    Returns:
        (dataX, dataY) as numpy arrays shaped (n_samples, look_back, 1)
        and (n_samples,).

    NOTE(review): the bound `len(dataset) - look_back - 1` skips the last
    usable window (off-by-one inherited from the Keras time-series
    tutorial). Kept as-is because the plotting offsets in predict()
    assume exactly this sample count.
    """
    dataX, dataY = [], []
    for start in range(len(dataset) - look_back - 1):
        # One feature per timestep; `meta` intentionally not appended.
        window = [[dataset[pos]] for pos in range(start, start + look_back)]
        dataX.append(window)
        dataY.append(dataset[start + look_back])
    return numpy.array(dataX), numpy.array(dataY)
def predict(src2month, src2sloccount, src2pop, src2deps):
    """Build/evaluate an LSTM that forecasts monthly vulnerability counts.

    For every package with more than 50 total vulnerabilities, the series
    is smoothed with a rolling mean, scaled, split 90/10 into train/test
    windows, and run through a shared LSTM. For each package the function
    prints train/test RMSE, plots predictions, and writes step-by-step
    forecasts of the last `num_steps` months to two result files.

    Args:
        src2month: dict mapping package name -> per-month vulnerability
            counts (numeric sequence; must contain key 'linux').
        src2sloccount: dict mapping package name -> tuple whose element [0]
            is the total SLOC count.
        src2pop: dict mapping package name -> tuple whose element [1] is
            the popularity vote count.
        src2deps: unused here; kept for interface compatibility.

    Side effects:
        Saves and reloads the Keras model ('all_packages_test<...>.h5'),
        writes 'output5-3-lb<look_back>.txt' and
        'REAL_LSTM_results<look_back>.txt', and shows one matplotlib
        figure per package (blocking).
    """
    ## Number of features per timestep fed to the LSTM.
    feat_num = 1
    ## Number of steps in the future to predict - also affects smoothing.
    num_steps = 6
    pkg_num = len(src2month)
    # Bug fix: was `len(src2month['linux']-8)`, which subtracts 8 from the
    # series element-wise (or raises TypeError for plain lists) instead of
    # shortening its length by 8.
    training_num = len(src2month['linux']) - 8
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()
    look_back = 10  # timesteps of history per sample

    # Create the LSTM network. `dropout_W` is the legacy Keras 1.x spelling
    # of input dropout -- NOTE(review): rename to `dropout` when upgrading.
    model = Sequential()
    model.add(LSTM(32, input_shape=(look_back, feat_num), activation='relu',
                   dropout_W=0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Bug fix: was `model.add(Dense(1), activation='linear')` -- the
    # activation keyword was passed to Sequential.add() (a TypeError)
    # instead of to the Dense layer.
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # One scaler per quantity: monthly counts, popularity votes, SLOC totals.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler2 = MinMaxScaler(feature_range=(0, 1))
    scaler3 = MinMaxScaler(feature_range=(0, 1))

    # Fit the count scaler on every package's series concatenated together.
    # NOTE(review): fitting/transforming 1-D data relies on old
    # scikit-learn behaviour; newer versions require (n_samples, 1) arrays.
    test_scale = []
    for pkg in src2month:
        test_scale = numpy.concatenate((test_scale, src2month[pkg]))
    scaler.fit(test_scale)

    test_scale = []
    for pkg in src2pop:
        test_scale.append(src2pop[pkg][1])
    scaler2.fit(test_scale)

    test_scale = []
    for pkg in src2sloccount:
        test_scale.append(src2sloccount[pkg][0])
    scaler3.fit(test_scale)

    total_trainX = []
    total_trainY = []

    ###################################################################################################
    # Pass 1: build per-package train/test windows and the pooled training set.
    for i in range(0, 1):
        for pkg_name in src2month:
            dataset = src2month[pkg_name]
            # Drop the last 8 months (held out of training/evaluation).
            dataset = dataset[:len(dataset) - 8]
            print(len(dataset))
            # Only packages with a meaningful history are modelled.
            if sum(dataset) > 50:
                # Smooth with a rolling mean. `pandas.rolling_mean` was
                # removed in pandas 0.20; Series.rolling(...).mean() yields
                # identical values (leading window-1 entries are NaN).
                dataset = pandas.Series(dataset).rolling(window=num_steps).mean().values
                ## Omit the leading entries so no NaN from the rolling mean remains.
                dataset = dataset[num_steps:]
                # normalize the dataset
                dataset = scaler.transform(dataset)
                train_size = int(len(dataset) * 0.90)
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))
                # Per-package metadata (popularity + SLOC), scaled; currently
                # collected but not fed into the network (see create_dataset).
                meta = []
                try:
                    pop_vote = src2pop[pkg_name][1]
                except KeyError:
                    pop_vote = 0
                try:
                    slocs_total = src2sloccount[pkg_name][0]
                except KeyError:
                    slocs_total = 0
                pop_vote = scaler2.transform([[pop_vote]])
                slocs_total = scaler3.transform([[slocs_total]])
                meta.append(pop_vote)
                meta.append(slocs_total)
                # reshape into X=t..t+look_back-1 and Y=t+look_back
                trainX, trainY = create_dataset(train, meta, look_back)
                testX, testY = create_dataset(test, meta, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                if len(total_trainX) == 0:
                    total_trainX = trainX
                    total_trainY = trainY
                else:
                    total_trainX = numpy.concatenate((total_trainX, trainX))
                    total_trainY = numpy.concatenate((total_trainY, trainY))
                # Save the per-package splits for the evaluation pass below.
                trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
                testXdict[pkg_name], testYdict[pkg_name] = testX, testY

    ###################################################################################################
    # Fit the LSTM network.
    # NOTE(review): training is disabled -- the model is saved and reloaded
    # untrained, so all predictions below come from random weights.
    # Re-enable the line below to get meaningful results.
    # model.fit(total_trainX, total_trainY, nb_epoch=50, batch_size=1000, verbose=2)
    model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    target = open('output5-3-lb' + str(look_back) + '.txt', 'w')
    target2 = open('REAL_LSTM_results' + str(look_back) + '.txt', 'w')
    try:
        # Pass 2: evaluate per package, plot, and write forecasts.
        for pkg_name in src2month:
            dataset = src2month[pkg_name]
            dataset = dataset[:len(dataset) - 8]
            if sum(dataset) > 50:
                trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
                testX, testY = testXdict[pkg_name], testYdict[pkg_name]
                # Re-derive the smoothed, scaled series exactly as in pass 1.
                dataset = pandas.Series(dataset).rolling(window=num_steps).mean().values
                dataset = dataset[num_steps:]
                # normalize the dataset
                dataset = scaler.transform(dataset)
                # make predictions
                trainPredict = model.predict(trainX)
                testPredict = model.predict(testX)
                # invert predictions back to vulnerability-count units
                trainPredict = scaler.inverse_transform(trainPredict)
                trainY = scaler.inverse_transform([trainY])
                testPredict = scaler.inverse_transform(testPredict)
                testY = scaler.inverse_transform([testY])
                print(str(len(dataset)))
                # calculate root mean squared error
                print('Package: ' + pkg_name)
                trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
                print('Train Score: %.2f RMSE' % (trainScore))
                testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
                print('Test Score: %.2f RMSE' % (testScore))
                # shift train predictions for plotting
                trainPredictPlot = numpy.empty_like(dataset)
                trainPredictPlot[:] = numpy.nan
                trainPredictPlot[look_back:len(trainPredict) + look_back] = trainPredict[:, 0]
                # shift test predictions for plotting
                testPredictPlot = numpy.empty_like(dataset)
                testPredictPlot[:] = numpy.nan
                testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(dataset) - 1] = testPredict[:, 0]
                # plot baseline and predictions (plt.show() blocks per package)
                plt.plot(scaler.inverse_transform(dataset))
                plt.plot(trainPredictPlot)
                plt.plot(testPredictPlot)
                plt.show()
                # save series and predictions to the dump file
                target.write(pkg_name + '\n')
                target.write(str(dataset))
                target.write('\n')
                target.write(str(trainPredict))
                target.write(str(testPredict))
                target.write('\n')

                # load metadata again (same lookup as in the training pass;
                # still unused by the network)
                meta = []
                try:
                    pop_vote = src2pop[pkg_name][1]
                except KeyError:
                    pop_vote = 0
                try:
                    slocs_total = src2sloccount[pkg_name][0]
                except KeyError:
                    slocs_total = 0
                pop_vote = scaler2.transform([[pop_vote]])
                slocs_total = scaler3.transform([[slocs_total]])
                meta.append(pop_vote)
                meta.append(slocs_total)

                # Make real predictions step by step: each prediction is fed
                # back as the newest input for the following month.
                l = len(dataset)
                pred_sofar = []
                for step in range(num_steps):
                    input_list_pre = dataset[l - num_steps - 1 - look_back + step:l - num_steps - 1]
                    input_list = []
                    for dat in input_list_pre:
                        input_list.append([dat])
                    for dat in pred_sofar:
                        input_list.append([dat])
                    pred_input = numpy.array(input_list)
                    pred_input = numpy.reshape(pred_input, (1, pred_input.shape[0], pred_input.shape[1]))
                    print(pred_input)
                    pred = model.predict(pred_input)
                    print("Prediction:")
                    print(scaler.inverse_transform(pred)[0][0])
                    pred_sofar.append(pred[0][0])
                # The series is a num_steps-month rolling mean, so the last
                # value times num_steps estimates the window's total count.
                actual = int(round(num_steps * scaler.inverse_transform(dataset)[len(dataset) - 1]))
                predicted = int(round(num_steps * scaler.inverse_transform(pred_sofar)[(num_steps - 1)]))
                print('Actual Vulnerabilities - last ' + str(num_steps) + ' months ' + pkg_name + ': ' + str(actual))
                print('Predicted Vulnerabilities - last ' + str(num_steps) + ' months ' + pkg_name + ': ' + str(predicted))

                ## Calculate average for comparison: mean monthly count after
                ## the first non-zero month, scaled up to num_steps months.
                raw_av = src2month[pkg_name]
                # Bug fix: was `raw_av[:len(dataset)-8]`, but at this point
                # `dataset` has already been smoothed and truncated; use
                # raw_av's own length to drop the last 8 months, mirroring
                # the slice used in pass 1.
                raw_av = raw_av[:len(raw_av) - 8]
                i = 0
                while i < len(raw_av) and raw_av[i] < 1:
                    i = i + 1
                raw_av = raw_av[i:]
                if len(raw_av) == 0:
                    average = 0
                else:
                    average = num_steps * int(round(sum(raw_av) / float(len(raw_av))))

                ## Write one CSV row per package: name, actual, predicted,
                ## naive average, and normalized difference.
                if actual != 0:
                    norm_diff = abs(actual - predicted) / actual
                else:
                    norm_diff = actual - predicted
                target2.write(pkg_name + ',' + str(actual) + ',' + str(predicted) + ',' + str(average) + ',' + str(norm_diff) + '\n')
    finally:
        # Bug fix: the two result files were opened but never closed.
        target.close()
        target2.close()