import math

import numpy
import pandas
import matplotlib.pyplot as plt
from keras import backend as K
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)


# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - num_steps):
        a = []
        for j in range(i, i + look_back):
            # a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        # the target is the mean of the next num_steps values of the raw series
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)


## Calculate weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    # exponentially weighted average: recent months get weight close to 1,
    # older months decay with time constant lamda_w
    w_average = 0
    weights = 0
    if raw_av.size == 0:
        return 0
    jj = 0
    for j in raw_av:
        w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
        weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
        jj += 1
    try:
        w_average = w_average / weights
    except ZeroDivisionError:
        print('Error:', raw_av)
    return w_average


def normalizer(src2month, pkg, smoothing, num_steps):
    time_series = src2month[pkg]
    # pandas.rolling_mean was removed from modern pandas; Series.rolling is
    # the equivalent form
    time_series = pandas.Series(time_series).rolling(window=smoothing).mean().values
    time_series = time_series[smoothing:]
    # skip the leading months with (close to) zero vulnerabilities
    i = 0
    for month in time_series:
        if numpy.isclose(month, 0):
            i += 1
        else:
            break
    try:
        max_value = numpy.amax(time_series[i:-12]) * 9
        min_value = numpy.amin(time_series[i:-12]) * 9
    except ValueError:
        max_value = 0
        min_value = 0
    norm = max_value - min_value
    if norm < 1:
        norm = 1
    return norm


def calcf(fvalues, src2month):
    # split packages into a "top" group (the number_top most vulnerable over
    # 2015-2016) and a "low" group, and assign each group an f-value
    vulns20152016 = dict()
    total_top = 0
    number_top = 20
    total_low = 0
    number_low = 0
    for pkg in src2month:
        vulns20152016[pkg] = sum(src2month[pkg][-36:-12])
    vulnslist = vulns20152016.items()
    newlist = sorted(vulnslist, key=lambda k: k[1], reverse=True)
    cutoff_value = newlist[number_top][1]
    print('Cutoff value: ' + str(cutoff_value))
    for pkg in vulns20152016:
        if vulns20152016[pkg] > cutoff_value:
            total_top += vulns20152016[pkg]
        elif vulns20152016[pkg] > 0:
            total_low += vulns20152016[pkg]
            number_low += 1
    average_top_9month = (total_top / number_top) * 9 / 24
    r_top = 1.05 * (average_top_9month / (9 * 30 * 4))
    average_low_9month = (total_low / number_low) * 9 / 24
    r_low = 1.05 * (average_low_9month / (9 * 30 * 4))
    for pkg in vulns20152016:
        if vulns20152016[pkg] > cutoff_value:
            fvalues[pkg] = 1 - r_top
        else:
            fvalues[pkg] = 1 - r_low
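
# Illustrative sketch, not part of the original pipeline: a toy run of
# create_dataset to document the shapes it produces. The series below is
# hypothetical; in the real pipeline `dataset` is the smoothed+scaled series
# and `original_dataset` the scaled raw series.
def _demo_create_dataset():
    toy = numpy.arange(10, dtype=float)  # hypothetical monthly counts
    X, y = create_dataset(toy, toy, [], num_steps=2, look_back=3)
    # one window of look_back values per sample: (samples, look_back, 1)
    assert X.shape == (5, 3, 1)
    # each target is the mean of the next num_steps raw values
    assert y[0] == (toy[3] + toy[4]) / 2  # == 3.5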
def test_model(pkg_name, src2month, model_file, totalX, totalY, scaler, num_steps,
               smoothing, batch_num, lamda_w, reality_list, prediction_lstm,
               prediction_ave, prediction_wave, prediction_last):
    model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing'
                       + str(smoothing) + str(model_file) + '.h5')
    model.reset_states()
    totalPredict = model.predict(totalX[18:], batch_size=batch_num)
    # model.reset_states()
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    trainPredict = totalPredict[:-10]
    evaluatePredict = totalPredict[-10]
    testPredict = totalPredict[-1]
    model.reset_states()

    # predictions are means over num_steps months; scale to a 9-month total
    evaluation = evaluatePredict * 9
    if evaluation < 0:
        evaluation = 0
    prediction = testPredict * 9
    if prediction < 0:
        prediction = 0
    evaluation_reality = sum(src2month[pkg_name][-21:-12])
    reality = sum(src2month[pkg_name][-12:-3])
    if reality == 0:
        normalizer = 1
    else:
        normalizer = reality
    evaluationerror = evaluation_reality - evaluation
    testerror = (reality - prediction) / normalizer
    print('#' * 80)
    print(pkg_name)
    print('prediction: ' + str(prediction))
    print('reality: ' + str(totalY[-1] * 9) + ' = ' + str(reality))
    print('Normalized error: ' + str(testerror))
    print('Validation error: ' + str(evaluationerror))

    # Plot
    plt.plot(totalY, color='blue')
    plt.plot(totalPredict, color='red')
    plt.show()

    ## Calculate average for comparison
    raw_av = src2month[pkg_name]
    i = 0
    max_value = 0
    min_value = 0
    for month in raw_av:
        if month == 0:
            i += 1
    average = sum(raw_av[i:-13]) / len(raw_av[i:-13])
    average_error = (reality - average) / normalizer
    w_average = calc_waverage(raw_av[i:-13], lamda_w)
    w_average_error = (reality - w_average) / normalizer
    last = sum(raw_av[-22:-13])
    last_error = (reality - last) / normalizer
    print(average * 9)
    print(w_average * 9)
    print(last)
    print('#' * 80)
    reality_list.append(reality)
    prediction_lstm.append(prediction)
    prediction_ave.append(average)
    prediction_wave.append(w_average)
    prediction_last.append(last)
    # if(not numpy.isinf(testerror)):
    #     if(testerror>1):
    #         testerror=1.0
    #     if(average_error>1):
    #         average_error=1.0
    #     if(w_average_error>1):
    #         w_average_error=1.0
    #     if(last_error>1):
    #         last_error=1.0
    #     total_error += numpy.absolute(testerror)
    #     total_ave_error += numpy.absolute(average_error)
    #     total_wave_error += numpy.absolute(w_average_error)
    #     total_last_error += numpy.absolute(last_error)
    #     num_packages += 1
    return (prediction, reality, testerror, evaluationerror, evaluation,
            evaluation_reality)
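
# Illustrative sketch, not part of the original script: the contract of the
# stateful LSTM built in predict() below. With stateful=True the batch size
# is fixed by batch_input_shape, so every fit/predict call must use exactly
# that batch size, and states must be reset manually between passes. All
# sizes here are hypothetical.
def _demo_stateful_lstm(batch_num=4, look_back=3, feat_num=1):
    m = Sequential()
    m.add(LSTM(10, batch_input_shape=(batch_num, look_back, feat_num),
               activation='relu', stateful=True))
    m.add(Dense(1))
    m.compile(loss='mean_squared_error', optimizer='adam')
    X = numpy.zeros((batch_num, look_back, feat_num))
    y = numpy.zeros((batch_num, 1))
    m.fit(X, y, epochs=1, batch_size=batch_num, shuffle=False, verbose=0)
    m.reset_states()  # forget carried-over state before the next pass
    return m.predict(X, batch_size=batch_num)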
def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1

    ## Model parameters
    do_train = False
    do_test = True
    models_num = 5
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 3
    lamda_w = 12
    init_test_size = 18

    pkg_num = len(src2month)
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 2
    print("batch_num:")
    print(batch_num)

    # create the LSTM network
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num),
                   activation='relu', dropout=0.5, stateful=True))
    # model.add(LSTM(..., recurrent_dropout=0.4, implementation=1,
    #                return_sequences=False, return_state=False,
    #                go_backwards=False, stateful=True, unroll=False))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # remember the initial weights so they can be restored per package
    Wsave = model.get_weights()

    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler2 = MinMaxScaler(feature_range=(0, 1))
    # scaler3 = MinMaxScaler(feature_range=(0, 1))
    test_scale = []
    # for i in src2month:
    #     test_scale = numpy.concatenate((test_scale, src2month[i]))
    #     for j in src2month[i]:
    #         test_scale.append(src2month[i][j])
    # test_scale = []
    # for i in src2pop:
    #     test_scale.append(src2pop[i][1])
    # scaler2.fit(test_scale)
    # test_scale = []
    # for i in src2sloccount:
    #     test_scale.append(src2sloccount[i][0])
    # scaler3.fit(test_scale)

    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    # Training: one pass over all packages per model instance
    for i in range(models_num):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            dataset = dataset[:len(dataset) - 2]
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            dataset = dataset.flatten()
            # keep the raw series; it is needed for the targets below
            original_dataset = dataset
            # pandas.rolling_mean was removed from modern pandas
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            total_sum = sum(original_dataset)
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()
            print(dataset.shape)
            print(len(dataset))

            if total_sum > 80:
                # reset or not between training
                model.set_weights(Wsave)
                ## omit for rolling mean
                # normalize the dataset
                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = (original_dataset[0:train_size],
                                                 original_dataset[train_size:len(dataset)])
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))

                # get metadata
                meta = []
                # try:
                #     pop_vote = src2pop[pkg_name][1]
                # except KeyError:
                #     pop_vote = 0
                # try:
                #     slocs_total = src2sloccount[pkg_name][0]
                # except KeyError:
                #     slocs_total = 0
                # pop_vote = scaler2.transform([[pop_vote]])
                # slocs_total = scaler3.transform([[slocs_total]])
                # meta.append(pop_vote)
                # meta.append(slocs_total)

                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train_original, train, meta,
                                                num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta,
                                              num_steps, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                trainY.reshape(-1, 1)
                testY.reshape(-1, 1)

                # fit the LSTM network
                if do_train:
                    for j in range(10000):
                        # nb_epoch in the original Keras 1 API; epochs in Keras 2
                        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX),
                                  verbose=2, shuffle=False)
                        model.reset_states()
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps)
                                   + 'smoothing' + str(smoothing) + str(i) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps)
                                   + 'smoothing' + str(smoothing) + '.h5')
                # else:
                #     try:
                #         model.save('./models/low_together' + '-' + str(num_steps)
                #                    + 'smoothing' + str(smoothing) + '.h5')
                #     except OSError:
                #         model.save('./models/unknown-' + str(num_steps)
                #                    + 'smoothing' + str(smoothing) + '.h5')
    # model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    ###################################################################################################
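    # Illustrative sketch, not part of the original pipeline: the per-package
    # preprocessing used above and below is equivalent to this standalone
    # transform of a hypothetical raw monthly series `ts`:
    #
    #     ts = src2month[pkg_name][:-2]                 # drop trailing months
    #     scaler.fit(ts.reshape(-1, 1))                 # fit on the raw series
    #     smoothed = pandas.Series(ts).rolling(window=smoothing).mean().values
    #     smoothed = smoothed[smoothing:]               # drop the NaN warm-up
    #     scaled = scaler.transform(smoothed.reshape(-1, 1)).flatten()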
    # Evaluation over all packages
    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing'
    #               + str(smoothing) + 'neurons' + str(num_neurons) + '.txt', 'w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')
    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    reality_list = []
    prediction_lstm = []
    prediction_ave = []
    prediction_wave = []
    prediction_last = []
    num_packages = 0
    select_best = False
    fvalues = dict()
    calcf(fvalues, src2month)

    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset) - 2]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        # keep the raw series; it is needed for the targets below
        original_dataset = dataset
        dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
        original_dataset = original_dataset[smoothing:]
        dataset = dataset[smoothing:]
        dataset = dataset.reshape(-1, 1)
        dataset = scaler.transform(dataset)
        dataset = dataset.flatten()
        total_sum = sum(original_dataset)
        original_dataset = original_dataset.reshape(-1, 1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()

        if total_sum > 100 and do_test:
            best_model = 0
            best_error = 100.0
            meta = []  # metadata features are unused here (see training loop)
            totalX, totalY = create_dataset(original_dataset, dataset, meta,
                                            num_steps, look_back)
            if select_best:
                for i in range(5):
                    (prediction, reality, testerror, evaluationerror, evaluation,
                     evaluation_reality) = test_model(pkg_name, src2month, i, totalX,
                                                      totalY, scaler, num_steps,
                                                      smoothing, batch_num, lamda_w,
                                                      reality_list, prediction_lstm,
                                                      prediction_ave, prediction_wave,
                                                      prediction_last)
                    # the source breaks off mid-statement here; keeping the model
                    # with the smallest validation error is the natural completion
                    # given the best_error/best_model initialisation above
                    if numpy.absolute(evaluationerror) < best_error:
                        best_error = numpy.absolute(evaluationerror)
                        best_model = i