- import numpy
- import matplotlib.pyplot as plt
- import pandas
- import math
- from keras import backend as K
- from keras.models import Sequential
- from keras.layers import Dense
- from keras.layers import LSTM
- from keras.layers import Activation, Dropout
- from keras.models import load_model
- from sklearn.preprocessing import MinMaxScaler
- from sklearn.metrics import mean_squared_error
- numpy.random.seed(7)
# Build supervised-learning pairs (X, y) from a smoothed series; each target
# is the mean of the next num_steps values of the *unsmoothed* series.
# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    """Slide a look_back-long window over `dataset` and pair each window with
    the mean of the following `num_steps` entries of `original_dataset`.

    `meta` is currently unused (each feature vector holds only the series
    value).  Returns (X, y) as numpy arrays, X shaped (samples, look_back, 1).
    """
    samples, targets = [], []
    sample_count = len(dataset) - look_back - num_steps
    for start in range(sample_count):
        # One feature vector per time step inside the window.
        window = [[dataset[pos]] for pos in range(start, start + look_back)]
        samples.append(window)
        # Average the raw (unsmoothed) series over the forecast horizon.
        horizon_total = 0
        for step in range(num_steps):
            horizon_total += original_dataset[start + look_back + step]
        targets.append(horizon_total / num_steps)
    return numpy.array(samples), numpy.array(targets)
## Calculate weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    """Exponentially weighted average of `raw_av` (a numpy array).

    The newest entry gets weight 1 and older entries decay as
    exp(-age / lamda_w).  Returns 0 for an empty array; if the weight sum
    is somehow zero, prints the array and returns the unnormalised sum.
    """
    print(raw_av)
    if raw_av.size == 0:
        return 0
    total = 0
    weight_sum = 0
    n = len(raw_av)
    for position, value in enumerate(raw_av):
        weight = math.exp(-(n - position - 1) / lamda_w)
        total += value * weight
        weight_sum += weight
    try:
        total = total / weight_sum
    except ZeroDivisionError:
        print('Error:', raw_av)
    return total
def normalizer(src2month, pkg, smoothing, num_steps):
    """Return a normalisation constant for `pkg`'s vulnerability series.

    Applies a rolling mean of width `smoothing`, drops the first `smoothing`
    entries (which removes the NaN head), skips leading near-zero months, and
    returns 9 * (max - min) over the remainder excluding the last 12 months,
    floored at 1 so callers can safely divide by it.

    `num_steps` is unused but kept for interface compatibility with callers.
    """
    time_series = numpy.array(src2month[pkg], dtype=float)
    # BUGFIX: the top-level pandas.rolling_mean() was removed in pandas 0.23;
    # Series.rolling(window).mean() is the exact modern equivalent (same NaN
    # head of length window-1, same default min_periods).
    time_series = pandas.Series(time_series).rolling(window=smoothing).mean().to_numpy()
    time_series = time_series[smoothing:]
    # Count leading (near-)zero months so they don't dilute the range.
    lead = 0
    for month in time_series:
        if numpy.isclose(month, 0):
            lead += 1
        else:
            break
    try:
        max_value = numpy.amax(time_series[lead:-12]) * 9
        min_value = numpy.amin(time_series[lead:-12]) * 9
    except ValueError:
        # Empty slice (all-zero or too-short series): amax/amin raise.
        max_value = 0
        min_value = 0
    norm = max_value - min_value
    if norm < 1:
        norm = 1
    return (norm)
def calcf(fvalues, src2month):
    """Fill `fvalues` in place with an initial expectation per package.

    Packages are ranked by their 2015-2016 vulnerability counts (months
    -36..-13); those above the 20th-ranked count get 1 - r_top, everyone
    else 1 - r_low, where r_* derive from group-average 9-month rates.
    """
    number_top = 20
    # Vulnerabilities observed per package in the 2015-2016 window.
    recent = {pkg: sum(series[-36:-12]) for pkg, series in src2month.items()}
    ranked = sorted(recent.items(), key=lambda item: item[1], reverse=True)
    cutoff_value = ranked[number_top][1]
    print('Cutoff value: ' + str(cutoff_value))
    total_top = 0
    total_low = 0
    number_low = 0
    for count in recent.values():
        if count > cutoff_value:
            total_top += count
        elif 0 < count < cutoff_value:
            total_low += count
            number_low += 1
    average_top_9month = (total_top / number_top) * 9 / 24
    r_top = 1.05 * (average_top_9month / (9 * 30 * 4))
    average_low_9month = (total_low / number_low) * 9 / 24
    r_low = 1.05 * (average_low_9month / (9 * 30 * 4))
    for pkg, count in recent.items():
        fvalues[pkg] = 1 - r_top if count > cutoff_value else 1 - r_low
def test_model(pkg_name, src2month, model_file, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
               reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last):
    """Evaluate one saved LSTM model for a package against simple baselines.

    Loads ./models/<pkg>-<num_steps>smoothing<smoothing><model_file>.h5,
    predicts on totalX[18:], rescales with `scaler`, and compares the LSTM
    forecast with three baselines (plain average, exponentially weighted
    average via calc_waverage, and "same as last period").  Appends to
    reality_list and the four prediction_* lists in place.

    Returns (prediction, reality, testerror, evaluationerror, evaluation,
    evaluation_reality).
    """
    model = load_model(
        './models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(model_file) + '.h5')
    model.reset_states()
    totalPredict = model.predict(totalX[18:], batch_size=batch_num)
    # Undo the MinMax scaling so values are vulnerability counts again.
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    evaluatePredict = totalPredict[-10]  # validation point (10 steps back)
    testPredict = totalPredict[-1]  # most recent forecast
    model.reset_states()
    # Scale monthly forecasts to a 9-month horizon; clamp impossible negatives.
    evaluation = evaluatePredict * 9
    if evaluation < 0:
        evaluation = 0
    prediction = testPredict * 9
    if prediction < 0:
        prediction = 0
    evaluation_reality = sum(src2month[pkg_name][-21:-12])  # truth, validation window
    reality = sum(src2month[pkg_name][-12:-3])  # truth, test window
    # Guard the normalised error against division by zero.
    if reality == 0:
        normalizer = 1
    else:
        normalizer = reality
    evaluationerror = evaluation_reality - evaluation  # NOTE: signed, not normalised
    testerror = (reality - prediction) / normalizer
    print('#' * 80)
    print(pkg_name)
    print('prediction: ' + str(prediction))
    print('reality: ' + str(totalY[-1] * 9) + ' = ' + str(reality))
    print('Normalized error: ' + str(testerror))
    print('Validation error: ' + str(evaluationerror))
    ## Calculate average for comparison
    raw_av = numpy.array(src2month[pkg_name])
    # Skip leading zero months before averaging.
    # BUGFIX: the original loop had no `break`, so it counted *every* zero
    # month and could slice non-zero history out of raw_av[i:-13]; the sibling
    # normalizer() uses the break form, which matches the intent here.
    i = 0
    for month in raw_av:
        if month == 0:
            i += 1
        else:
            break
    average = sum(raw_av[i:-13]) / len(raw_av[i:-13])
    w_average = calc_waverage(raw_av[i:-13], lamda_w)
    last = sum(raw_av[-22:-13])
    print(average * 9)
    print(w_average * 9)
    print(last)
    print('#' * 80)
    reality_list.append(reality)
    prediction_lstm.append(prediction)
    prediction_ave.append(average)
    prediction_wave.append(w_average)
    prediction_last.append(last)
    return (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality)
def predict(src2month, src2sloccount, src2pop, src2deps):
    """Train stateful LSTM models on monthly vulnerability counts and write
    per-package forecasts to results_paper<num_steps>.txt.

    Each output line has the form
    packageName:prediction:errorComplement:initial_expectation.

    NOTE(review): src2sloccount, src2pop and src2deps are unused here (the
    metadata handling is commented out), and src2month is mutated in place
    (the last 12 months of every series are dropped).
    """
    ## Number of features
    feat_num = 1
    # Hold out the most recent year of every series.
    for pkg in src2month:
        src2month[pkg]=src2month[pkg][:-12]
    ## Model parameters
    do_train = True
    do_test = True
    models_num = 5  # models trained per package; best one is picked later
    num_steps = 9  # months averaged into each training target
    smoothing = num_steps  # rolling-mean window
    num_neurons = 10
    look_back = 3  # time steps fed to the LSTM per sample
    lamda_w = 12  # decay constant for the weighted-average baseline
    init_test_size = 18
    pkg_num = len(src2month)
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()
    # 'linux' is used as the reference series for sizing; presumably all
    # series have equal length -- TODO confirm against the caller.
    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 2
    print("batch_num:")
    print(batch_num)
    # create the LSTM network
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.5,
                   stateful=True))
    # model.add((keras.layers.0, recurrent_dropout=0.4, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=True, unroll=False))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # Remember the freshly initialised weights so every package restarts
    # training from the same state.
    Wsave = model.get_weights()
    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler2 = MinMaxScaler(feature_range=(0,1))
    # scaler3 = MinMaxScaler(feature_range=(0,1))
    test_scale = []
    # for i in src2month:
    #     test_scale = numpy.concatenate((test_scale, src2month[i]))
    #     for j in src2month[i]:
    #         test_scale.append(src2month[i][j])
    # test_scale = []
    # for i in src2pop:
    #     test_scale.append(src2pop[i][1])
    # scaler2.fit(test_scale)
    # test_scale = []
    # for i in src2sloccount:
    #     test_scale.append(src2sloccount[i][0])
    # scaler3.fit(test_scale)
    total_trainX = []
    total_trainY = []
    flag = True
    ###################################################################################################
    # Training phase: fit models_num models for every qualifying package.
    for i in range(models_num):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = numpy.array(src2month[pkg_name])
            dataset = dataset[:len(dataset) - 2]
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            dataset = dataset.flatten()
            #
            original_dataset = dataset
            # NOTE(review): pandas.rolling_mean was removed in pandas 0.23;
            # modern pandas needs Series(...).rolling(window=...).mean().
            dataset = pandas.rolling_mean(dataset, window=smoothing)
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            total_sum = sum(original_dataset)
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()
            #print(dataset.shape)
            #print(len(dataset))
            # Only packages with more than 10 recorded vulnerabilities are modelled.
            if (total_sum > 10):
                # reset or not between training
                model.set_weights(Wsave)
                ## ommit for rolling mean
                # normalize the dataset
                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = original_dataset[0:train_size], original_dataset[
                    train_size:len(dataset)]
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))
                # get metadata
                meta = []
                # try:
                #     pop_vote = src2pop[pkg_name][1]
                # except KeyError:
                #     pop_vote = 0
                # try:
                #     slocs_total = src2sloccount[pkg_name][0]
                # except KeyError:
                #     slocs_total = 0
                # pop_vote = scaler2.transform([[pop_vote]])
                # slocs_total = scaler3.transform([[slocs_total]])
                # meta.append(pop_vote)
                # meta.append(slocs_total)
                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                # NOTE(review): reshape returns a new array; these two calls
                # discard their result and leave trainY/testY unchanged.
                trainY.reshape(-1, 1)
                testY.reshape(-1, 1)
                # fit the LSTM network
                if do_train:
                    # 100 epochs, resetting the LSTM state after each pass
                    # (nb_epoch is the pre-Keras-2 spelling of epochs).
                    for j in range(100):
                        model.fit(trainX, trainY, nb_epoch=1, batch_size=len(trainX), verbose=2, shuffle=False)
                        model.reset_states()
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(
                            i) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                # else:
                #     try:
                #         model.save('./moels/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #     except OSError:
                #         model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
    # model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    ###################################################################################################
    # Testing phase: pick the best of the saved models and write forecasts.
    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt','w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')
    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    reality_list = []
    prediction_lstm = []
    prediction_ave = []
    prediction_wave = []
    prediction_last = []
    num_packages = 0
    select_best = True
    fvalues = dict()
    calcf(fvalues, src2month)
    for pkg_name in src2month:
        # Same preprocessing as the training phase above.
        dataset = numpy.array(src2month[pkg_name])
        dataset = dataset[:len(dataset) - 2]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        #
        original_dataset = dataset
        dataset = pandas.rolling_mean(dataset, window=smoothing)
        original_dataset = original_dataset[smoothing:]
        dataset = dataset[smoothing:]
        dataset = dataset.reshape(-1, 1)
        dataset = scaler.transform(dataset)
        dataset = dataset.flatten()
        total_sum = sum(original_dataset)
        original_dataset = original_dataset.reshape(-1, 1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()
        if (total_sum > 10 and do_test):
            best_model = 0
            best_error = 100.0
            # NOTE(review): `meta` is the leftover value from the training
            # loop; if no package entered that branch this raises NameError.
            totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
            if select_best:
                for i in range(5):
                    (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(
                        pkg_name, src2month, i, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
                        reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
                    # NOTE(review): evaluationerror is signed, so a large
                    # over-prediction (very negative error) always "wins";
                    # an absolute value was probably intended here.
                    if (evaluationerror < best_error):
                        best_model = i
                        best_error = evaluationerror
                model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(
                    best_model) + '.h5')
                model.save(
                    './models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + 'best' + '.h5')
                K.clear_session()
            (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(
                pkg_name, src2month, 'best', totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
                reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            # Certainty is the complement of the normalised validation error,
            # floored at 0.1.
            certainty = 1 - numpy.absolute(evaluationerror / normalizer_value)
            if (certainty < 0.1):
                certainty = 0.1
            print(str(normalizer_value))
            # Plot
            # plt.plot(totalY, color='blue')
            # plt.plot(totalPredict, color='red')
            # plt.show()
            # need pkg_name, prediction, certainity, fvalues
            # TODO: save in form packageName:prediction:errorComplement:initial_expectation
            target2.write(pkg_name + ':' + str(prediction) + ':' + str(certainty) + ':' + str(fvalues[pkg_name]) + '\n')
            K.clear_session()
        else:
            # Low-activity package: fall back to the weighted-average baseline.
            raw_av = src2month[pkg_name]
            reality = sum(src2month[pkg_name][-12:-3])
            i = 0
            max_value = 0
            min_value = 0
            # NOTE(review): unlike normalizer(), this loop has no break, so
            # it counts *all* zero months rather than only leading ones --
            # confirm whether that is intended.
            for month in raw_av:
                if (month == 0):
                    i += 1
            w_average = calc_waverage(numpy.array(raw_av[i:-13]), lamda_w)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            certainty = 0.95
            # TODO: save in form packageName:prediction:errorComplement:initial_expectation
            target2.write(pkg_name + ':' + str(w_average) + ':' + str(certainty) + ':' + str(fvalues[pkg_name]) + '\n')
    # Report RMSE of each method over all evaluated packages.
    mean_error = math.sqrt(mean_squared_error(prediction_lstm, reality_list))
    mean_ave_error = math.sqrt(mean_squared_error(prediction_ave, reality_list))
    mean_wave_error = math.sqrt(mean_squared_error(prediction_wave, reality_list))
    mean_last_error = math.sqrt(mean_squared_error(prediction_last, reality_list))
    print(mean_error)
    print(mean_ave_error)
    print(mean_wave_error)
    print(mean_last_error)