import numpy import matplotlib.pyplot as plt import pandas import math from keras.models import Sequential from keras.layers import Dense from keras.layers import LSTM from keras.layers import Activation, Dropout from keras.models import load_model from sklearn.preprocessing import MinMaxScaler from sklearn.metrics import mean_squared_error numpy.random.seed(7) # convert an array of values into a dataset matrix def create_dataset(dataset, meta, look_back=1): dataX, dataY = [], [] for i in range(len(dataset)-look_back-1): a = [] for j in range(i, i+look_back): #a.append([dataset[j]] + meta) a.append([dataset[j]]) dataX.append(a) dataY.append(dataset[i + look_back]) return numpy.array(dataX), numpy.array(dataY) def predict(src2month, src2sloccount, src2pop, src2deps): ## Number of features feat_num = 1 ## Number of steps in the future to predict - also affects smoothing num_steps = 6 pkg_num = len(src2month) training_num = len(src2month['linux']-8) trainXdict = dict() trainYdict = dict() testXdict = dict() testYdict = dict() look_back = 10 # create the LSTM network model = Sequential() model.add(LSTM(32, input_shape = (look_back, feat_num) , activation ='relu', dropout_W=0.5)) # model.add(LSTM(16, return_sequences=True)) # model.add(LSTM(8)) model.add(Dense(64, activation='relu')) model.add(Dropout(0.5)) # model.add(Dense(64, activation='relu')) # model.add(Dropout(0.5)) # model.add(Dense(64, activation='relu')) # model.add(Dropout(0.5)) # model.add(LSTM(4, input_dim=look_back-6, dropout_W = 0.2, dropout_U = 0.1)) model.add(Dense(1), activation='linear') model.compile(loss='mean_squared_error', optimizer='adam') scaler = MinMaxScaler(feature_range=(0, 1)) # create the additional NN # model2 = Sequential() # model2.add scaler2 = MinMaxScaler(feature_range=(0,1)) scaler3 = MinMaxScaler(feature_range=(0,1)) test_scale = [] for i in src2month: test_scale = numpy.concatenate((test_scale, src2month[i])) # for j in src2month[i]: # test_scale.append(src2month[i][j]) scaler.fit(test_scale) test_scale = [] for i in src2pop: test_scale.append(src2pop[i][1]) scaler2.fit(test_scale) test_scale = [] for i in src2sloccount: test_scale.append(src2sloccount[i][0]) scaler3.fit(test_scale) total_trainX = [] total_trainY = [] flag = True ################################################################################################### for i in range(0,1): for pkg_name in src2month: # for pkg_name in ['chromium-browser', 'linux']: # for pkg_name in ['linux']: pkg_num = len(src2month) dataset = src2month[pkg_name] dataset = dataset[:len(dataset)-8] print(len(dataset)) # if sum(dataset)>50: # dataset = pandas.rolling_mean(dataset, window=num_steps) ## ommit 3 for rolling mean dataset = dataset[num_steps:] # normalize the dataset dataset = scaler.transform(dataset) train_size = int(len(dataset) * 0.90) test_size = len(dataset) - train_size train, test = dataset[0:train_size], dataset[train_size:len(dataset)] print(len(train), len(test)) # get metadata meta = [] try: pop_vote = src2pop[pkg_name][1] except KeyError: pop_vote = 0 try: slocs_total = src2sloccount[pkg_name][0] except KeyError: slocs_total = 0 pop_vote = scaler2.transform([[pop_vote]]) slocs_total = scaler3.transform([[slocs_total]]) meta.append(pop_vote) meta.append(slocs_total) # reshape into X=t and Y=t+1 trainX, trainY = create_dataset(train, meta, look_back) testX, testY = create_dataset(test, meta, look_back) # reshape input to be [samples, time steps, features] trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num)) testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num)) if len(total_trainX) == 0: total_trainX = trainX total_trainY = trainY else: total_trainX = numpy.concatenate((total_trainX, trainX)) total_trainY = numpy.concatenate((total_trainY, trainY)) # save to dict for later trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY testXdict[pkg_name], testYdict[pkg_name] = testX, testY # fit the LSTM network #model.fit([trainX], [trainY], nb_epoch=50, batch_size=50, verbose=2) ################################################################################################### # fit the LSTM network # model.fit([total_trainX], [total_trainY], nb_epoch=50, batch_size=1000, verbose=2) model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5') model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5') target = open('output5-3-lb' + str(look_back) + '.txt','w') target2 = open('REAL_LSTM_results' + str(look_back) + '.txt','w') # for pkg_name in ['chromium-browser', 'linux']: for pkg_name in src2month: dataset = src2month[pkg_name] dataset = dataset[:len(dataset)-8] if sum(dataset)>50: trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name] testX, testY = testXdict[pkg_name], testYdict[pkg_name] dataset = pandas.rolling_mean(dataset, window=num_steps) dataset = dataset[num_steps:] # normalize the dataset dataset = scaler.transform(dataset) # Freshen LSTM # train_size = int(len(dataset) * 0.90) # test_size = len(dataset) - train_size # train, test = dataset[0:train_size], dataset[train_size:len(dataset)] # print(len(train), len(test)) # # # reshape into X=t and Y=t+1 # trainX, trainY = create_dataset(train, look_back) # testX, testY = create_dataset(test, look_back) # # # reshape input to be [samples, time steps, features] # trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1])) # testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1])) # # # fit the LSTM network # model.fit(trainX, trainY, nb_epoch=10, batch_size=1, verbose=2) # make predictions trainPredict = model.predict(trainX) testPredict = model.predict(testX) # invert predictions trainPredict = scaler.inverse_transform(trainPredict) trainY = scaler.inverse_transform([trainY]) testPredict = scaler.inverse_transform(testPredict) testY = scaler.inverse_transform([testY]) print(str(len(dataset))) # calculate root mean squared error print('Package: ' + pkg_name) #print(trainY) #print(trainPredict) trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0])) print('Train Score: %.2f RMSE' % (trainScore)) testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0])) print('Test Score: %.2f RMSE' % (testScore)) # shift train predictions for plotting trainPredictPlot = numpy.empty_like(dataset) trainPredictPlot[:] = numpy.nan trainPredictPlot[look_back:len(trainPredict)+look_back] = trainPredict[:, 0] # shift test predictions for plotting testPredictPlot = numpy.empty_like(dataset) testPredictPlot[:] = numpy.nan testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1] = testPredict[:, 0] # plot baseline and predictions plt.plot(scaler.inverse_transform(dataset)) plt.plot(trainPredictPlot) plt.plot(testPredictPlot) plt.show() # output initial predictions #print('Actual number of vulnerabilities - last 3 months ' + pkg_name + ' ' + str(scaler.inverse_transform(dataset)[len(dataset)-2])) #print('Predicted number of vulnerabilities - last 3 months ' + pkg_name + ' ' + str(testPredictPlot[len(testPredictPlot)-2])) # save data to file target.write(pkg_name + '\n') target.write(str(dataset)) target.write('\n') target.write(str(trainPredict)) target.write(str(testPredict)) target.write('\n') # load metadata again meta = [] try: pop_vote = src2pop[pkg_name][1] except KeyError: pop_vote = 0 try: slocs_total = src2sloccount[pkg_name][0] except KeyError: slocs_total = 0 pop_vote = scaler2.transform([[pop_vote]]) slocs_total = scaler3.transform([[slocs_total]]) meta.append(pop_vote) meta.append(slocs_total) # make real predictions step by step l = len(dataset) pred_sofar = [] for step in range(num_steps): input_list_pre = dataset[l-num_steps-1-look_back+step:l-num_steps-1] input_list = [] for dat in input_list_pre: #input_list.append([dat]+meta) input_list.append([dat]) for dat in pred_sofar: #input_list.append([dat]+meta) input_list.append([dat]) pred_input = numpy.array(input_list) pred_input = numpy.reshape(pred_input, (1, pred_input.shape[0], pred_input.shape[1])) print(pred_input) pred = model.predict(pred_input) print("Prediction:") print(scaler.inverse_transform(pred)[0][0]) pred_sofar.append(pred[0][0]) actual = int(round(num_steps * scaler.inverse_transform(dataset)[len(dataset)-1])) predicted = int(round(num_steps * scaler.inverse_transform(pred_sofar)[(num_steps-1)])) print('Actual Vulnerabilities - last ' + str(num_steps) + ' months ' + pkg_name + ': ' + str(actual)) print('Predicted Vulnerabilities - last ' + str(num_steps) + ' months ' + pkg_name + ': ' + str(predicted)) ## Calculate average for comparison raw_av = src2month[pkg_name] raw_av = raw_av[:len(dataset)-8] i = 0 while(i