# lstm_reg.py

import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)

# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - num_steps):
        # each sample is a window of look_back consecutive (smoothed) values
        a = []
        for j in range(i, i + look_back):
            #a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        # the target is the mean of the next num_steps raw values
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)
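
# A hedged sketch of the window/target shapes this produces (toy data only):
#   >>> X, y = create_dataset(numpy.arange(8.), numpy.arange(8.), [], num_steps=2, look_back=3)
#   >>> X.shape, y.shape
#   ((3, 3, 1), (3,))
# i.e. each X row holds 3 consecutive inputs and each y is the mean of the
# following 2 entries of the raw series (here y[0] == (3 + 4) / 2 == 3.5).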

## Calculate weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    w_average = 0
    weights = 0
    if raw_av.size == 0:
        w_average = 0
        return w_average
    else:
        jj = 0
        for j in raw_av:
            w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            jj += 1
        try:
            w_average = w_average / weights
        except ZeroDivisionError:
            print('Error:', raw_av)
    return w_average
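
# The loop above computes an exponentially weighted mean that favours recent
# months: w_average = sum_j x_j * exp(-(N-1-j)/lamda_w) / sum_j exp(-(N-1-j)/lamda_w),
# where N = len(raw_av) and j runs oldest to newest. Sanity check (toy data):
# a constant series is returned unchanged, e.g.
#   >>> calc_waverage(numpy.ones(24), lamda_w=12)
#   1.0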

def normalizer(src2month, pkg, smoothing, num_steps):
    time_series = src2month[pkg]
    # pandas.rolling_mean() was removed in pandas 0.23; Series.rolling().mean()
    # is the equivalent call
    time_series = pandas.Series(time_series).rolling(window=smoothing).mean().values
    time_series = time_series[smoothing:]
    #print(len(time_series))
    #print(time_series)
    # skip the leading all-zero months before the package's first vulnerability
    i = 0
    for month in time_series:
        if numpy.isclose(month, 0):
            #print(str(month))
            i += 1
        else:
            break
    #print(str(i))
    try:
        max_value = numpy.amax(time_series[i:-12]) * 9
        min_value = numpy.amin(time_series[i:-12]) * 9
    except ValueError:
        max_value = 0
        min_value = 0
    # normalise by the spread of 9-month totals, floored at 1
    norm = max_value - min_value
    if norm < 1:
        norm = 1
    return norm
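
# Worked example (illustrative numbers): if the smoothed series peaks at 2.0
# and bottoms out at 0.2 vulnerabilities/month over the window, then
# norm = 2.0*9 - 0.2*9 = 16.2, i.e. the spread expressed as 9-month totals.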

def calcf(fvalues, src2month):
    # split packages into a top-20 group and a long tail by their 2015-2016
    # vulnerability counts, then derive a per-group factor f = 1 - r
    vulns20152016 = dict()
    total_top = 0
    number_top = 20
    total_low = 0
    number_low = 0
    for pkg in src2month:
        vulns20152016[pkg] = sum(src2month[pkg][-36:-12])
    vulnslist = vulns20152016.items()
    newlist = sorted(vulnslist, key=lambda k: k[1], reverse=True)
    cutoff_value = newlist[number_top][1]
    print('Cutoff value: ' + str(cutoff_value))
    for pkg in vulns20152016:
        if vulns20152016[pkg] > cutoff_value:
            total_top += vulns20152016[pkg]
        elif vulns20152016[pkg] < cutoff_value and vulns20152016[pkg] > 0:
            total_low += vulns20152016[pkg]
            number_low += 1
    average_top_9month = (total_top / number_top) * 9 / 24
    r_top = 1.05 * (average_top_9month / (9 * 30 * 4))
    average_low_9month = (total_low / number_low) * 9 / 24
    r_low = 1.05 * (average_low_9month / (9 * 30 * 4))
    for pkg in vulns20152016:
        if vulns20152016[pkg] > cutoff_value:
            fvalues[pkg] = 1 - r_top
        else:
            fvalues[pkg] = 1 - r_low
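
# Worked example of the rate arithmetic above (hypothetical counts): if the
# top-20 packages had total_top = 480 vulnerabilities over the 24 months, then
# average_top_9month = (480/20) * 9/24 = 9, and
# r_top = 1.05 * 9 / (9*30*4) = 0.00875, giving fvalues = 1 - 0.00875 = 0.99125.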

def test_model(pkg_name, src2month, model_file, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w, reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last):
    model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(model_file) + '.h5')
    model.reset_states()
    totalPredict = model.predict(totalX[18:], batch_size=batch_num)
    #model.reset_states()
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    trainPredict = totalPredict[:-10]
    # single predictions: 10 steps before the end for validation, the last
    # step for the actual forecast
    evaluatePredict = totalPredict[-10]
    testPredict = totalPredict[-1]
    model.reset_states()
    # targets are means over num_steps = 9 months, so scale back to 9-month totals
    evaluation = evaluatePredict * 9
    if evaluation < 0:
        evaluation = 0
    prediction = testPredict * 9
    if prediction < 0:
        prediction = 0
    evaluation_reality = sum(src2month[pkg_name][-21:-12])
    reality = sum(src2month[pkg_name][-12:-3])
    if reality == 0:
        norm_value = 1
    else:
        norm_value = reality
    evaluationerror = evaluation_reality - evaluation
    testerror = (reality - prediction) / norm_value
    print('#' * 80)
    print(pkg_name)
    print('prediction: ' + str(prediction))
    print('reality: ' + str(totalY[-1] * 9) + ' = ' + str(reality))
    print('Normalized error: ' + str(testerror))
    print('Validation error: ' + str(evaluationerror))
    # Plot
    plt.plot(totalY, color='blue')
    plt.plot(totalPredict, color='red')
    plt.show()
    ## Calculate average for comparison
    raw_av = src2month[pkg_name]
    i = 0
    max_value = 0
    min_value = 0
    for month in raw_av:
        if month == 0:
            i += 1
        else:
            # stop at the first non-zero month; without this break, i would
            # count every zero month instead of only the leading ones
            break
    average = sum(raw_av[i:-13]) / len(raw_av[i:-13])
    average_error = (reality - average) / norm_value
    w_average = calc_waverage(raw_av[i:-13], lamda_w)
    w_average_error = (reality - w_average) / norm_value
    last = sum(raw_av[-22:-13])
    last_error = (reality - last) / norm_value
    print(average * 9)
    print(w_average * 9)
    print(last)
    print('#' * 80)
    reality_list.append(reality)
    prediction_lstm.append(prediction)
    prediction_ave.append(average)
    prediction_wave.append(w_average)
    prediction_last.append(last)
    #if(not numpy.isinf(testerror)):
    #    if(testerror>1):
    #        testerror=1.0
    #    if(average_error>1):
    #        average_error=1.0
    #    if(w_average_error>1):
    #        w_average_error=1.0
    #    if(last_error>1):
    #        last_error=1.0
    #    total_error += numpy.absolute(testerror)
    #    total_ave_error += numpy.absolute(average_error)
    #    total_wave_error += numpy.absolute(w_average_error)
    #    total_last_error += numpy.absolute(last_error)
    #    num_packages += 1
    return (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality)
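
# Hedged numeric illustration of the normalized test error: with reality = 12
# vulnerabilities in the 9-month target window and prediction = 9, the error is
# (12 - 9) / 12 = 0.25; when reality is 0 the denominator falls back to 1.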

def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1
    ## Model parameters
    do_train = False
    do_test = True
    models_num = 5
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 3
    lamda_w = 12
    init_test_size = 18
    pkg_num = len(src2month)
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()
    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    # batch_num equals the number of training windows create_dataset() emits:
    # series length minus the 2 trimmed months, the smoothing offset, the test
    # hold-out, look_back and num_steps; the stateful LSTM below requires this
    # fixed batch size
    batch_num = train_size - num_steps - look_back - smoothing - 2
    print("batch_num:")
    print(batch_num)
    # create the LSTM network
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.5, stateful=True))
    #model.add((keras.layers.0, recurrent_dropout=0.4, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=True, unroll=False))
    #model.add(Dense(32, activation='relu'))
    #model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
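    # Shape sketch (inferred from the parameters above): input batches are
    # (batch_num, look_back=3, feat_num=1) windows; the stateful LSTM(10) maps
    # each window to a 10-dim state and Dense(1) emits one scalar, the predicted
    # mean monthly vulnerability count over the next num_steps months (scaled units).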
    Wsave = model.get_weights()
    scaler = MinMaxScaler(feature_range=(0, 1))
    #scaler2 = MinMaxScaler(feature_range=(0,1))
    #scaler3 = MinMaxScaler(feature_range=(0,1))
    test_scale = []
    #for i in src2month:
    #    test_scale = numpy.concatenate((test_scale, src2month[i]))
    #    for j in src2month[i]:
    #        test_scale.append(src2month[i][j])
    #test_scale = []
    #for i in src2pop:
    #    test_scale.append(src2pop[i][1])
    #scaler2.fit(test_scale)
    #test_scale = []
    #for i in src2sloccount:
    #    test_scale.append(src2sloccount[i][0])
    #scaler3.fit(test_scale)
    total_trainX = []
    total_trainY = []
    flag = True
    ###################################################################################################
    for i in range(models_num):
        #for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            # drop the last 2 (possibly incomplete) months
            dataset = dataset[:len(dataset) - 2]
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            dataset = dataset.flatten()
            #
            original_dataset = dataset
            # pandas.rolling_mean() was removed in pandas 0.23
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            original_dataset = original_dataset[smoothing:]
            dataset = dataset[smoothing:]
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            total_sum = sum(original_dataset)
            original_dataset = original_dataset.reshape(-1, 1)
            original_dataset = scaler.transform(original_dataset)
            original_dataset = original_dataset.flatten()
            print(dataset.shape)
            print(len(dataset))
            if total_sum > 80:
                # reset or not between training
                model.set_weights(Wsave)
                ## omit for rolling mean
                # normalize the dataset
                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))
                # get metadata
                meta = []
                #try:
                #    pop_vote = src2pop[pkg_name][1]
                #except KeyError:
                #    pop_vote = 0
                #try:
                #    slocs_total = src2sloccount[pkg_name][0]
                #except KeyError:
                #    slocs_total = 0
                #pop_vote = scaler2.transform([[pop_vote]])
                #slocs_total = scaler3.transform([[slocs_total]])
                #meta.append(pop_vote)
                #meta.append(slocs_total)
                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                trainY.reshape(-1, 1)
                testY.reshape(-1, 1)
                # fit the LSTM network
                if do_train:
                    # 'epochs' replaces the deprecated 'nb_epoch' argument
                    for j in range(10000):
                        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
                        model.reset_states()
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(i) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #else:
                #    try:
                #        model.save('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #    except OSError:
                #        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
    #model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    #model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
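
    # Note on the training pattern above (standard for stateful Keras LSTMs):
    # shuffle=False preserves window order within each epoch, and calling fit()
    # once per epoch lets reset_states() clear the recurrent state at epoch
    # boundaries instead of carrying it over between packages.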
    ###################################################################################################
    #target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt','w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')
    #for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    #for pkg_name in ['libpng']:
    reality_list = []
    prediction_lstm = []
    prediction_ave = []
    prediction_wave = []
    prediction_last = []
    num_packages = 0
    select_best = False
    # meta is ignored by create_dataset() (the append is commented out), but
    # define it here so the calls below do not depend on the training loop above
    meta = []
    fvalues = dict()
    calcf(fvalues, src2month)
    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset) - 2]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        #
        original_dataset = dataset
        dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
        original_dataset = original_dataset[smoothing:]
        dataset = dataset[smoothing:]
        dataset = dataset.reshape(-1, 1)
        dataset = scaler.transform(dataset)
        dataset = dataset.flatten()
        total_sum = sum(original_dataset)
        original_dataset = original_dataset.reshape(-1, 1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()
        if total_sum > 100 and do_test:
            best_model = 0
            best_error = 100.0
            totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
            if select_best:
                # evaluate the 5 trained candidates and keep the one with the
                # smallest validation error as the 'best' model
                for i in range(5):
                    (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(pkg_name, src2month, i, totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w, reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
                    if evaluationerror < best_error:
                        best_model = i
                        best_error = evaluationerror
                model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + str(best_model) + '.h5')
                model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + 'best' + '.h5')
                K.clear_session()
            (prediction, reality, testerror, evaluationerror, evaluation, evaluation_reality) = test_model(pkg_name, src2month, 'best', totalX, totalY, scaler, num_steps, smoothing, batch_num, lamda_w, reality_list, prediction_lstm, prediction_ave, prediction_wave, prediction_last)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            certainty = 1 - numpy.absolute(evaluationerror / normalizer_value)
            if certainty < 0.1:
                certainty = 0.1
            print(str(normalizer_value))
            # Plot
            #plt.plot(totalY, color='blue')
            #plt.plot(totalPredict, color='red')
            #plt.show()
            target2.write(pkg_name + ':' + str(prediction) + ':' + str(reality) + ':' + str(certainty) + ':' + str(evaluationerror) + ':' + str(evaluation) + ':' + str(evaluation_reality) + ':' + str(fvalues[pkg_name]) + '\n')
            K.clear_session()
        else:
            # too few vulnerabilities for the LSTM: fall back to the weighted average
            raw_av = src2month[pkg_name]
            reality = sum(src2month[pkg_name][-12:-3])
            i = 0
            max_value = 0
            min_value = 0
            for month in raw_av:
                if month == 0:
                    i += 1
                else:
                    break  # only skip the leading zero months
            w_average = calc_waverage(raw_av[i:-13], lamda_w)
            normalizer_value = normalizer(src2month, pkg_name, smoothing, num_steps)
            certainty = 0.95
            target2.write(pkg_name + ':' + str(w_average) + ':' + str(reality) + ':' + str(certainty) + ':' + 'na' + ':' + 'na' + ':' + 'na' + ':' + str(fvalues[pkg_name]) + '\n')
    # RMSE of each predictor against the realised counts
    mean_error = math.sqrt(mean_squared_error(prediction_lstm, reality_list))
    mean_ave_error = math.sqrt(mean_squared_error(prediction_ave, reality_list))
    mean_wave_error = math.sqrt(mean_squared_error(prediction_wave, reality_list))
    mean_last_error = math.sqrt(mean_squared_error(prediction_last, reality_list))
    print(mean_error)
    print(mean_ave_error)
    print(mean_wave_error)
    print(mean_last_error)
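

if __name__ == '__main__':
    # Minimal smoke test -- a hedged sketch, not part of the original pipeline.
    # It only exercises the pure helpers on toy data; predict() additionally
    # needs the real src2month dict (source package name -> numpy array of
    # monthly vulnerability counts, including a 'linux' key) and pre-trained
    # model files under ./models/.
    toy = numpy.arange(10, dtype=float)
    X, y = create_dataset(toy, toy, [], num_steps=2, look_back=3)
    print('window shapes:', X.shape, y.shape)  # (5, 3, 1) (5,)
    print('weighted average:', calc_waverage(toy, lamda_w=12))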