import numpy
import matplotlib.pyplot as plt
import pandas
import math

from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)


# Convert an array of values into a dataset matrix.
# ATTENTION: THIS FUNCTION CHANGES THE SIZE OF THE INPUT:
# the output has len(dataset) - look_back - num_steps samples.
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - num_steps):
        a = []
        for j in range(i, i + look_back):
            # a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        # The target is the mean of the next num_steps months of the
        # unsmoothed series.
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)


## Calculate exponentially weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    w_average = 0
    weights = 0
    print(raw_av)
    if raw_av.size == 0:
        return 0
    jj = 0
    for j in raw_av:
        # More recent months receive exponentially larger weights.
        w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
        weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
        jj += 1
    try:
        w_average = w_average / weights
    except ZeroDivisionError:
        print('Error:', raw_av)
    return w_average


def normalizer(src2month, pkg, smoothing, num_steps):
    time_series = numpy.array(src2month[pkg])
    # pandas.rolling_mean() was removed in pandas 0.18; use the
    # equivalent Series.rolling().mean() instead.
    time_series = pandas.Series(time_series).rolling(window=smoothing).mean().values
    time_series = time_series[smoothing:]
    # Skip the leading zero months.
    i = 0
    for month in time_series:
        if numpy.isclose(month, 0):
            i += 1
        else:
            break
    try:
        max_value = numpy.amax(time_series[i:-12]) * 9
        min_value = numpy.amin(time_series[i:-12]) * 9
    except ValueError:
        max_value = 0
        min_value = 0
    norm = max_value - min_value
    if norm < 1:
        norm = 1
    return norm


def calcf(fvalues, src2month):
    vulns20152016 = dict()
    total_top = 0
    number_top = 20
    total_low = 0
    number_low = 0
    for pkg in src2month:
        vulns20152016[pkg] = sum(src2month[pkg][-36:-12])
    vulnslist = vulns20152016.items()
    newlist = sorted(vulnslist, key=lambda k: k[1], reverse=True)
    cutoff_value = newlist[number_top][1]
    print('Cutoff value: ' + str(cutoff_value))
    for pkg in vulns20152016:
        if vulns20152016[pkg] > cutoff_value:
            total_top += vulns20152016[pkg]
        elif vulns20152016[pkg] < cutoff_value and vulns20152016[pkg] > 0:
            total_low += vulns20152016[pkg]
            number_low += 1
    average_top_9month = (total_top / number_top) * 9 / 24
    r_top = 1.05 * (average_top_9month / (9 * 30 * 4))
    average_low_9month = (total_low / number_low) * 9 / 24
    r_low = 1.05 * (average_low_9month / (9 * 30 * 4))
    for pkg in vulns20152016:
        if vulns20152016[pkg] > cutoff_value:
            fvalues[pkg] = 1 - r_top
        else:
            fvalues[pkg] = 1 - r_low
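
# ------------------------------------------------------------------
# Usage sketch (assumption: `src2month` maps each source-package name
# to a chronological list of monthly vulnerability counts, e.g.
# src2month = {'linux': [3, 0, 5, ...], ...}; this shape is implied by
# the slicing above but not documented in the original source):
#
#     fvalues = dict()
#     calcf(fvalues, src2month)
#     # fvalues[pkg] is the "initial expectation" written to the
#     # results file by predict() below.
# ------------------------------------------------------------------
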
def test_model(pkg_name, src2month, model_file, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
               reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last):
    model = load_model(
        './models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(model_file) + '.h5')
    model.reset_states()
    totalPredict = model.predict(totalX[18:], batch_size=batch_num)
    # model.reset_states()
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    trainPredict = totalPredict[:-10]
    evaluatePredict = totalPredict[-10]
    testPredict = totalPredict[-1]
    model.reset_states()

    # Predictions are mean monthly rates; scale to the 9-month horizon.
    evaluation = evaluatePredict * 9
    if evaluation < 0:
        evaluation = 0
    prediction = testPredict * 9
    if prediction < 0:
        prediction = 0
    evaluation_reality = sum(src2month[pkg_name][-21:-12])
    reality = sum(src2month[pkg_name][-12:-3])
    if reality == 0:
        normalizer = 1
    else:
        normalizer = reality
    evaluationerror = evaluation_reality - evaluation
    testerror = (reality - prediction) / normalizer
    print('#' * 80)
    print(pkg_name)
    print('prediction: ' + str(prediction))
    print('reality: ' + str(totalY[-1] * 9) + ' = ' + str(reality))
    print('Normalized error: ' + str(testerror))
    print('Validation error: ' + str(evaluationerror))
    # Plot
    # plt.plot(totalY, color='blue')
    # plt.plot(totalPredict, color='red')
    # plt.show()

    ## Calculate average for comparison
    raw_av = numpy.array(src2month[pkg_name])
    # Skip the leading zero months.
    i = 0
    for month in raw_av:
        if month == 0:
            i += 1
        else:
            break
    average = sum(raw_av[i:-13]) / len(raw_av[i:-13])
    average_error = (reality - average) / normalizer
    w_average = calc_waverage(raw_av[i:-13], lamda_w)
    w_average_error = (reality - w_average) / normalizer
    last = sum(raw_av[-22:-13])
    last_error = (reality - last) / normalizer
    print(average * 9)
    print(w_average * 9)
    print(last)
    print('#' * 80)
    reality_list.append(reality)
    prediction_lstm.append(prediction)
    prediction_ave.append(average)
    prediction_wave.append(w_average)
    prediction_last.append(last)
    return (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality)


def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of input features
    feat_num = 1

    # Hold out the final 12 months entirely.
    for pkg in src2month:
        src2month[pkg] = src2month[pkg][:-12]

    ## Model parameters
    do_train = True
    do_test = True
    models_num = 5
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 3
    lamda_w = 12
    init_test_size = 18

    pkg_num = len(src2month)
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 2
    print("batch_num:")
    print(batch_num)

    # Create the stateful LSTM network.
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu',
                   dropout=0.5, stateful=True))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # Remember the initial weights so every package can start training
    # from the same untrained state.
    Wsave = model.get_weights()

    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler2 = MinMaxScaler(feature_range=(0, 1))
    # scaler3 = MinMaxScaler(feature_range=(0, 1))
    test_scale = []
    # for i in src2pop:
    #     test_scale.append(src2pop[i][1])
    # scaler2.fit(test_scale)
    # test_scale = []
    # for i in src2sloccount:
    #     test_scale.append(src2sloccount[i][0])
    # scaler3.fit(test_scale)
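
    # --------------------------------------------------------------
    # Shape sketch for the windowing step below: create_dataset()
    # turns a (smoothed, scaled) series of length N into
    #     X: (N - look_back - num_steps, look_back, 1) sliding windows
    #     Y: (N - look_back - num_steps,) means of the following
    #        num_steps months of the unsmoothed series
    # e.g. with look_back=3 and num_steps=9, a 100-month series yields
    # 88 training samples (an illustrative count, not from the source).
    # --------------------------------------------------------------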
    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    # Train models_num candidate models per package.
    for i in range(models_num):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = numpy.array(src2month[pkg_name])
            dataset = dataset[:len(dataset) - 2]
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            dataset = dataset.flatten()
            # Keep the unsmoothed series around for the targets.
            original_dataset = dataset
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            total_sum = sum(original_dataset)
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()

            if total_sum > 10:
                # Reset the weights between packages.
                model.set_weights(Wsave)

                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))

                # Get metadata (popularity and SLOC features are currently disabled).
                meta = []
                # try:
                #     pop_vote = src2pop[pkg_name][1]
                # except KeyError:
                #     pop_vote = 0
                # try:
                #     slocs_total = src2sloccount[pkg_name][0]
                # except KeyError:
                #     slocs_total = 0
                # pop_vote = scaler2.transform([[pop_vote]])
                # slocs_total = scaler3.transform([[slocs_total]])
                # meta.append(pop_vote)
                # meta.append(slocs_total)

                # Reshape into X=t and Y=t+1.
                trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)

                # Reshape input to be [samples, time steps, features].
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                trainY = trainY.reshape(-1, 1)
                testY = testY.reshape(-1, 1)

                # Fit the LSTM network, resetting its state after each epoch.
                if do_train:
                    for j in range(100):
                        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
                        model.reset_states()
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) +
                                   str(i) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')

    ###################################################################################################
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')

    reality_list = []
    prediction_lstm = []
    prediction_ave = []
    prediction_wave = []
    prediction_last = []
    num_packages = 0
    select_best = True
    fvalues = dict()
    calcf(fvalues, src2month)
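
    # --------------------------------------------------------------
    # Selection logic for the loop below: every package whose trimmed
    # history sums to more than 10 vulnerabilities gets its models_num
    # candidate models scored on the validation window (months
    # -21:-12); the candidate with the smallest validation error is
    # re-saved under the suffix 'best' and used for the final 9-month
    # prediction. Sparse packages fall back to the exponentially
    # weighted average instead, with a fixed certainty of 0.95.
    # --------------------------------------------------------------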
    for pkg_name in src2month:
        dataset = numpy.array(src2month[pkg_name])
        dataset = dataset[:len(dataset) - 2]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        # Keep the unsmoothed series around for the targets.
        original_dataset = dataset
        dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
        original_dataset = original_dataset[smoothing:]
        dataset = dataset[smoothing:]
        dataset = dataset.reshape(-1, 1)
        dataset = scaler.transform(dataset)
        dataset = dataset.flatten()
        total_sum = sum(original_dataset)
        original_dataset = original_dataset.reshape(-1, 1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()

        if total_sum > 10 and do_test:
            best_model = 0
            best_error = 100.0
            meta = []
            totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
            if select_best:
                for i in range(models_num):
                    (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(
                        pkg_name, src2month, i, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
                        reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
                    # Pick the candidate with the smallest absolute validation error.
                    if numpy.absolute(evaluationerror) < best_error:
                        best_model = i
                        best_error = numpy.absolute(evaluationerror)
                model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) +
                                   str(best_model) + '.h5')
                model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + 'best' + '.h5')
                K.clear_session()

            (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(
                pkg_name, src2month, 'best', totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
                reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)

            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            certainty = 1 - numpy.absolute(evaluationerror / normalizer_value)
            if certainty < 0.1:
                certainty = 0.1
            print(str(normalizer_value))
            # Save in the form packageName:prediction:errorComplement:initial_expectation
            target2.write(pkg_name + ':' + str(prediction) + ':' + str(certainty) + ':' +
                          str(fvalues[pkg_name]) + '\n')
            K.clear_session()
        else:
            raw_av = src2month[pkg_name]
            reality = sum(src2month[pkg_name][-12:-3])
            # Skip the leading zero months.
            i = 0
            for month in raw_av:
                if month == 0:
                    i += 1
                else:
                    break
            w_average = calc_waverage(numpy.array(raw_av[i:-13]), lamda_w)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            certainty = 0.95
            # Save in the form packageName:prediction:errorComplement:initial_expectation
            target2.write(pkg_name + ':' + str(w_average) + ':' + str(certainty) + ':' +
                          str(fvalues[pkg_name]) + '\n')

    target2.close()

    # Root-mean-squared errors of the LSTM and the baseline predictors.
    mean_error = math.sqrt(mean_squared_error(prediction_lstm, reality_list))
    mean_ave_error = math.sqrt(mean_squared_error(prediction_ave, reality_list))
    mean_wave_error = math.sqrt(mean_squared_error(prediction_wave, reality_list))
    mean_last_error = math.sqrt(mean_squared_error(prediction_last, reality_list))
    print(mean_error)
    print(mean_ave_error)
    print(mean_wave_error)
    print(mean_last_error)
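

# ------------------------------------------------------------------
# Minimal usage sketch, kept commented out so importing this module
# stays side-effect free. The input file name and the empty metadata
# dictionaries are assumptions for illustration; predict() currently
# reads only src2month (src2sloccount, src2pop, and src2deps are
# accepted but unused).
#
# if __name__ == '__main__':
#     import json
#     with open('src2month.json') as f:   # hypothetical input file
#         src2month = json.load(f)
#     predict(src2month, src2sloccount={}, src2pop={}, src2deps={})
# ------------------------------------------------------------------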