# Lstm.py — LSTM-based per-package vulnerability prediction.
  1. import numpy
  2. import matplotlib.pyplot as plt
  3. import pandas
  4. import math
  5. from keras import backend as K
  6. from keras.models import Sequential
  7. from keras.layers import Dense
  8. from keras.layers import LSTM
  9. from keras.layers import Activation, Dropout
  10. from keras.models import load_model
  11. from sklearn.preprocessing import MinMaxScaler
  12. from sklearn.metrics import mean_squared_error
numpy.random.seed(7)  # fix the global RNG seed so training runs are reproducible
  14. # convert an array of values into a dataset matrix
  15. # ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
  16. def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
  17. dataX, dataY = [], []
  18. for i in range(len(dataset) - look_back - num_steps):
  19. a = []
  20. for j in range(i, i + look_back):
  21. # a.append([dataset[j]] + meta)
  22. a.append([dataset[j]])
  23. dataX.append(a)
  24. mean = 0
  25. for j in range(num_steps):
  26. mean += original_dataset[i + look_back + j]
  27. dataY.append(mean / num_steps)
  28. return numpy.array(dataX), numpy.array(dataY)
  29. ## Calculate weighted average for comparison
  30. def calc_waverage(raw_av, lamda_w):
  31. w_average = 0
  32. weights = 0
  33. print(raw_av)
  34. if (raw_av.size == 0):
  35. w_average = 0
  36. return w_average
  37. else:
  38. jj = 0
  39. for j in raw_av:
  40. w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
  41. weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
  42. jj += 1
  43. try:
  44. w_average = w_average / weights
  45. except ZeroDivisionError:
  46. print('Error:', raw_av)
  47. return w_average
  48. def normalizer(src2month, pkg, smoothing, num_steps):
  49. time_series = numpy.array(src2month[pkg])
  50. time_series = pandas.rolling_mean(time_series, window=smoothing)
  51. time_series = time_series[smoothing:]
  52. # print(len(time_series))
  53. # print(time_series)
  54. i = 0
  55. for month in time_series:
  56. if (numpy.isclose(month, 0)):
  57. # print(str(month))
  58. i += 1
  59. else:
  60. break
  61. # print(str(i))
  62. try:
  63. max_value = numpy.amax(time_series[i:-12]) * 9
  64. min_value = numpy.amin(time_series[i:-12]) * 9
  65. except ValueError:
  66. max_value = 0
  67. min_value = 0
  68. norm = max_value - min_value
  69. if (norm < 1):
  70. norm = 1
  71. return (norm)
  72. def calcf(fvalues, src2month):
  73. vulns20152016 = dict()
  74. total_top = 0
  75. number_top = 20
  76. total_low = 0
  77. number_low = 0
  78. for pkg in src2month:
  79. vulns20152016[pkg] = sum(src2month[pkg][-36:-12])
  80. vulnslist = vulns20152016.items()
  81. newlist = sorted(vulnslist, key=lambda k: k[1], reverse=True)
  82. cutoff_value = newlist[number_top][1]
  83. print('Cutoff value: ' + str(cutoff_value))
  84. for pkg in vulns20152016:
  85. if vulns20152016[pkg] > cutoff_value:
  86. total_top += vulns20152016[pkg]
  87. elif vulns20152016[pkg] < cutoff_value and vulns20152016[pkg] > 0:
  88. total_low += vulns20152016[pkg]
  89. number_low += 1
  90. average_top_9month = (total_top / number_top) * 9 / 24
  91. r_top = 1.05 * (average_top_9month / (9 * 30 * 4))
  92. average_low_9month = (total_low / number_low) * 9 / 24
  93. r_low = 1.05 * (average_low_9month / (9 * 30 * 4))
  94. for pkg in vulns20152016:
  95. if vulns20152016[pkg] > cutoff_value:
  96. fvalues[pkg] = 1 - r_top
  97. else:
  98. fvalues[pkg] = 1 - r_low
def test_model(pkg_name, src2month, model_file, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
               reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last):
    """Evaluate one saved per-package LSTM model against baseline predictors.

    Loads './models/<pkg>-<num_steps>smoothing<smoothing><model_file>.h5',
    predicts on totalX[18:], rescales with `scaler`, and compares the final
    prediction (scaled by 9, the 9-month horizon) with the real counts in
    src2month.  Baselines (plain average, exponentially weighted average,
    last-period sum) are computed for comparison; results are appended to
    the supplied lists (mutated in place).

    Returns (prediction, reality, testerror, evaluationerror, evaluation,
    evaluation_reality).
    """
    model = load_model(
        './models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(model_file) + '.h5')
    model.reset_states()
    # Skip the first 18 samples (initial window) and predict in one stateful batch.
    totalPredict = model.predict(totalX[18:], batch_size=batch_num)
    # model.reset_states()
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    trainPredict = totalPredict[:-10]    # unused below; kept from original
    evaluatePredict = totalPredict[-10]  # prediction 10 steps before the end: validation point
    testPredict = totalPredict[-1]       # most recent prediction: test point
    model.reset_states()
    # Scale monthly-mean predictions to the 9-month horizon; clamp negatives to 0.
    evaluation = evaluatePredict * 9
    if (evaluation < 0):
        evaluation = 0
    prediction = testPredict * 9
    if (prediction < 0):
        prediction = 0
    # Ground truth: months -21..-12 for validation, -12..-3 for the test.
    evaluation_reality = sum(src2month[pkg_name][-21:-12])
    reality = sum(src2month[pkg_name][-12:-3])
    if (reality == 0):
        normalizer = 1  # NOTE(review): local name shadows the module-level normalizer()
    else:
        normalizer = reality
    evaluationerror = evaluation_reality - evaluation
    testerror = (reality - prediction) / normalizer
    print('#' * 80)
    print(pkg_name)
    print('prediction: ' + str(prediction))
    print('reality: ' + str(totalY[-1] * 9) + ' = ' + str(reality))
    print('Normalized error: ' + str(testerror))
    print('Validation error: ' + str(evaluationerror))
    # Plot
    #plt.plot(totalY, color='blue')
    #plt.plot(totalPredict, color='red')
    #plt.show()
    ## Calculate average for comparison
    raw_av = numpy.array(src2month[pkg_name])
    i = 0
    max_value = 0
    min_value = 0
    for month in raw_av:
        if (month == 0):
            i += 1
    # NOTE(review): unlike normalizer(), this loop has no break, so `i`
    # counts ALL zero months rather than only the leading run — confirm
    # whether a `break` in the non-zero case was intended.
    average = sum(raw_av[i:-13]) / len(raw_av[i:-13])
    average_error = (reality - average) / normalizer
    w_average = calc_waverage(raw_av[i:-13], lamda_w)
    w_average_error = (reality - w_average) / normalizer
    last = sum(raw_av[-22:-13])  # the 9 months one year before the test window
    last_error = (reality - last) / normalizer
    print(average * 9)
    print(w_average * 9)
    print(last)
    print('#' * 80)
    reality_list.append(reality)
    prediction_lstm.append(prediction)
    prediction_ave.append(average)
    prediction_wave.append(w_average)
    prediction_last.append(last)
    # if(not numpy.isinf(testerror)):
    # if(testerror>1):
    # testerror=1.0
    # if(average_error>1):
    # average_error=1.0
    # if(w_average_error>1):
    # w_average_error=1.0
    # if(last_error>1):
    # last_error=1.0
    # total_error += numpy.absolute(testerror)
    # total_ave_error += numpy.absolute(average_error)
    # total_wave_error += numpy.absolute(w_average_error)
    # total_last_error += numpy.absolute(last_error)
    # num_packages += 1
    return (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality)
def predict(src2month, src2sloccount, src2pop, src2deps):
    """Train per-package stateful LSTM models and write 9-month predictions.

    For every package in src2month (the last 12 months are trimmed off first
    and reserved as ground truth), trains `models_num` models on the smoothed
    monthly vulnerability series, selects the model with the lowest
    validation error, and writes 'package:prediction:certainty:expectation'
    lines to 'results_paper<num_steps>.txt'.  Sparse packages (total <= 10)
    fall back to an exponentially weighted average.  src2sloccount, src2pop
    and src2deps are currently unused (metadata features are commented out).
    """
    ## Number of features
    feat_num = 1
    # Drop the final year from every series; it is the evaluation target.
    for pkg in src2month:
        src2month[pkg] = src2month[pkg][:-12]
    ## Model parameters
    do_train = True
    do_test = True
    models_num = 5         # models trained per package; best one kept
    num_steps = 9          # prediction horizon in months
    smoothing = num_steps  # rolling-mean window width
    num_neurons = 10
    look_back = 3          # input window length
    lamda_w = 12           # decay constant for the weighted-average baseline
    init_test_size = 18
    pkg_num = len(src2month)
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()
    # 'linux' is used as the reference series for sizing the batch.
    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 2
    print("batch_num:")
    print(batch_num)
    # create the LSTM network
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.5,
                   stateful=True))
    # model.add((keras.layers.0, recurrent_dropout=0.4, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=True, unroll=False))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # Snapshot the initial weights so every package starts from the same state.
    Wsave = model.get_weights()
    scaler = MinMaxScaler(feature_range=(0, 1))
    # scaler2 = MinMaxScaler(feature_range=(0,1))
    # scaler3 = MinMaxScaler(feature_range=(0,1))
    test_scale = []
    # for i in src2month:
    # test_scale = numpy.concatenate((test_scale, src2month[i]))
    # for j in src2month[i]:
    # test_scale.append(src2month[i][j])
    # test_scale = []
    # for i in src2pop:
    # test_scale.append(src2pop[i][1])
    # scaler2.fit(test_scale)
    # test_scale = []
    # for i in src2sloccount:
    # test_scale.append(src2sloccount[i][0])
    # scaler3.fit(test_scale)
    total_trainX = []
    total_trainY = []
    flag = True
    ###################################################################################################
    # Training phase: train models_num models per sufficiently active package.
    for i in range(models_num):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = numpy.array(src2month[pkg_name])
            dataset = dataset[:len(dataset) - 2]
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            dataset = dataset.flatten()
            #
            original_dataset = dataset
            # NOTE(review): pandas.rolling_mean() was removed from modern
            # pandas; this code requires an old pandas or porting to
            # pandas.Series(...).rolling(window=smoothing).mean().
            dataset = pandas.rolling_mean(dataset, window=smoothing)
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            total_sum = sum(original_dataset)
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()
            #print(dataset.shape)
            #print(len(dataset))
            if (total_sum > 10):  # skip packages with almost no vulnerabilities
                # reset or not between training
                model.set_weights(Wsave)
                ## ommit for rolling mean
                # normalize the dataset
                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = original_dataset[0:train_size], original_dataset[
                    train_size:len(dataset)]
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))
                # get metadata
                meta = []
                # try:
                # pop_vote = src2pop[pkg_name][1]
                # except KeyError:
                # pop_vote = 0
                # try:
                # slocs_total = src2sloccount[pkg_name][0]
                # except KeyError:
                # slocs_total = 0
                # pop_vote = scaler2.transform([[pop_vote]])
                # slocs_total = scaler3.transform([[slocs_total]])
                # meta.append(pop_vote)
                # meta.append(slocs_total)
                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                trainY.reshape(-1, 1)  # NOTE(review): result discarded — reshape is not in-place
                testY.reshape(-1, 1)   # NOTE(review): result discarded — reshape is not in-place
                # fit the LSTM network
                if do_train:
                    # 100 manual single-epoch fits so the LSTM state can be
                    # reset between epochs (required for stateful=True).
                    for j in range(100):
                        model.fit(trainX, trainY, nb_epoch=1, batch_size=len(trainX), verbose=2, shuffle=False)
                        model.reset_states()
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(
                            i) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                # else:
                # try:
                # model.save('./moels/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                # except OSError:
                # model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
    # model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    ###################################################################################################
    # Evaluation phase: pick the best saved model per package, write results.
    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt','w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')
    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    reality_list = []
    prediction_lstm = []
    prediction_ave = []
    prediction_wave = []
    prediction_last = []
    num_packages = 0
    select_best = True
    fvalues = dict()
    calcf(fvalues, src2month)
    for pkg_name in src2month:
        dataset = numpy.array(src2month[pkg_name])
        dataset = dataset[:len(dataset) - 2]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        #
        original_dataset = dataset
        dataset = pandas.rolling_mean(dataset, window=smoothing)
        original_dataset = original_dataset[smoothing:]
        dataset = dataset[smoothing:]
        dataset = dataset.reshape(-1, 1)
        dataset = scaler.transform(dataset)
        dataset = dataset.flatten()
        total_sum = sum(original_dataset)
        original_dataset = original_dataset.reshape(-1, 1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()
        if (total_sum > 10 and do_test):
            best_model = 0
            best_error = 100.0
            # NOTE(review): `meta` here is a leftover binding from the
            # training loop above (an empty list); create_dataset ignores it.
            totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
            if select_best:
                # Evaluate all saved models; keep the lowest validation error.
                for i in range(5):
                    (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(
                        pkg_name, src2month, i, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
                        reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
                    if (evaluationerror < best_error):
                        best_model = i
                        best_error = evaluationerror
                model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(
                    best_model) + '.h5')
                model.save(
                    './models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + 'best' + '.h5')
            K.clear_session()
            (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(
                pkg_name, src2month, 'best', totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w,
                reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            # Certainty = complement of the normalised validation error, floored at 0.1.
            certainty = 1 - numpy.absolute(evaluationerror / normalizer_value)
            if (certainty < 0.1):
                certainty = 0.1
            print(str(normalizer_value))
            # Plot
            # plt.plot(totalY, color='blue')
            # plt.plot(totalPredict, color='red')
            # plt.show()
            # need pkg_name, prediction, certainity, fvalues
            # TODO: save in form packageName:prediction:errorComplement:initial_expectation
            target2.write(pkg_name + ':' + str(prediction) + ':' + str(certainty) + ':' + str(fvalues[pkg_name]) + '\n')
            K.clear_session()
        else:
            # Fallback for sparse packages: exponentially weighted average.
            raw_av = src2month[pkg_name]
            reality = sum(src2month[pkg_name][-12:-3])
            i = 0
            max_value = 0
            min_value = 0
            for month in raw_av:
                if (month == 0):
                    i += 1
            # NOTE(review): no break — `i` counts ALL zero months, not just
            # the leading run; confirm this matches the intent.
            w_average = calc_waverage(numpy.array(raw_av[i:-13]), lamda_w)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            certainty = 0.95
            # TODO: save in form packageName:prediction:errorComplement:initial_expectation
            target2.write(pkg_name + ':' + str(w_average) + ':' + str(certainty) + ':' + str(fvalues[pkg_name]) + '\n')
    # Aggregate RMSE of each predictor over all evaluated packages.
    mean_error = math.sqrt(mean_squared_error(prediction_lstm, reality_list))
    mean_ave_error = math.sqrt(mean_squared_error(prediction_ave, reality_list))
    mean_wave_error = math.sqrt(mean_squared_error(prediction_wave, reality_list))
    mean_last_error = math.sqrt(mean_squared_error(prediction_last, reality_list))
    print(mean_error)
    print(mean_ave_error)
    print(mean_wave_error)
    print(mean_last_error)