# lstm_reg.py

import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)

# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - num_steps):
        a = []
        for j in range(i, i + look_back):
            #a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)
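
# Example (added sketch, not part of the original code): with look_back=4 and
# num_steps=9, a smoothed series of length 50 yields
#   dataX.shape == (37, 4, 1)   # 50-4-9 sliding windows of the last 4 smoothed values
#   dataY.shape == (37,)        # each entry: mean of the next 9 values of original_dataset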

## Calculate weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    w_average = 0
    weights = 0
    if(len(raw_av) == 0):
        w_average = 0
    else:
        jj = 0
        for j in raw_av:
            w_average += j * math.exp(-(len(raw_av) - jj - 1)/lamda_w)
            weights += math.exp(-(len(raw_av) - jj - 1)/lamda_w)
            jj += 1
        w_average = w_average/weights
    return w_average
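
# Note (added for clarity): calc_waverage is an exponentially decaying weighted
# mean. The newest entry of raw_av gets weight exp(0) = 1 and an entry that is
# `age` steps old gets weight exp(-age/lamda_w), i.e.
#   w_average = sum_j x_j * exp(-(n-1-j)/lamda_w) / sum_j exp(-(n-1-j)/lamda_w)
# With lamda_w = 12 (the value used in predict() below), an observation from
# 12 months ago contributes roughly exp(-1) ~ 0.37 of the newest one's weight.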

def predict(src2month, src2sloccount, src2pop, src2deps):
    # src2month is expected to map a source-package name to a numpy array of
    # monthly vulnerability counts; the other three inputs are only touched by
    # code that is currently commented out.
    ## Number of features
    feat_num = 1

    ## Model parameters
    do_train = False
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 4
    train_flag = True
    test_flag = True
    lamda_w = 12
    init_test_size = 18

    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 3

    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 3
    print("batch_num:")
    print(batch_num)

    # create the LSTM network
    # (dropout_W here and nb_epoch in model.fit below are Keras 1 keyword
    #  names; Keras 2 renamed them to dropout and epochs)
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout_W=0.5, stateful=True))
    # model.add((keras.layers.0, recurrent_dropout=0.4, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=True, unroll=False))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    Wsave = model.get_weights()
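    # Note (added for clarity): with stateful=True Keras requires a fixed batch
    # size, which is why batch_input_shape is given above and why predictions
    # later pass batch_size=batch_num. Wsave keeps the freshly initialised
    # weights so the network can be reset to the same starting point before it
    # is trained on each package.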

    scaler = MinMaxScaler(feature_range=(0, 1))
    #scaler2 = MinMaxScaler(feature_range=(0,1))
    #scaler3 = MinMaxScaler(feature_range=(0,1))
    test_scale = []
    # for i in src2month:
    #     test_scale = numpy.concatenate((test_scale, src2month[i]))
    #     for j in src2month[i]:
    #         test_scale.append(src2month[i][j])
    #test_scale = []
    #for i in src2pop:
    #    test_scale.append(src2pop[i][1])
    #scaler2.fit(test_scale)
    #test_scale = []
    #for i in src2sloccount:
    #    test_scale.append(src2sloccount[i][0])
    #scaler3.fit(test_scale)

    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    # for pkg_name in ['chromium-browser']:
    for i in range(1):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)

            dataset = src2month[pkg_name]
            dataset = dataset[:len(dataset)-3]
            print(dataset.shape)
            dataset = dataset.reshape(-1,1)
            scaler.fit(dataset)
            print(dataset.shape)
            dataset = dataset.flatten()
            print(dataset.shape)
            print(len(dataset))

            if (sum(dataset)>70):
                # reset or not between training
                model.set_weights(Wsave)
                original_dataset = dataset
                # smooth the series with a rolling mean
                # (pandas.rolling_mean is the old pandas API; newer pandas use
                #  pandas.Series(dataset).rolling(window=smoothing).mean().values)
                dataset = pandas.rolling_mean(dataset, window=smoothing)
                ## omit the leading entries affected by the rolling-mean warm-up
                original_dataset = original_dataset[smoothing:]
                dataset = dataset[smoothing:]

                # normalize the dataset
                dataset = dataset.reshape(-1,1)
                dataset = scaler.transform(dataset)
                dataset = dataset.flatten()
                original_dataset = original_dataset.reshape(-1,1)
                original_dataset = scaler.transform(original_dataset)
                original_dataset = original_dataset.flatten()

                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))

                # get metadata
                meta = []
                #try:
                #    pop_vote = src2pop[pkg_name][1]
                #except KeyError:
                #    pop_vote = 0
                #try:
                #    slocs_total = src2sloccount[pkg_name][0]
                #except KeyError:
                #    slocs_total = 0
                #pop_vote = scaler2.transform([[pop_vote]])
                #slocs_total = scaler3.transform([[slocs_total]])
                #meta.append(pop_vote)
                #meta.append(slocs_total)

                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)

                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                # note: reshape returns a new array, so the next two statements
                # have no effect as written (the result is not assigned)
                trainY.reshape(-1,1)
                testY.reshape(-1,1)

                #if len(total_trainX) == 0:
                #    total_trainX = trainX
                #    total_trainY = trainY
                #else:
                #    total_trainX = numpy.concatenate((total_trainX, trainX))
                #    total_trainY = numpy.concatenate((total_trainY, trainY))

                # save to dict for later
                trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
                testXdict[pkg_name], testYdict[pkg_name] = testX, testY

                # fit the LSTM network
                if do_train:
                    for i in range(4000):
                        model.fit(trainX, trainY, nb_epoch=1, batch_size=len(trainX), verbose=2, shuffle=False)
                        model.reset_states()
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #else:
                #    try:
                #        model.save('./moels/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #    except OSError:
                #        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')

                # model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
                # model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')

    ###################################################################################################
    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt','w')
    target2 = open('results_paper' + str(num_steps) + '.txt','w')

    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset)-3]
        original_dataset = dataset
        dataset = dataset.reshape(-1,1)
        scaler.fit(dataset)
        dataset = dataset.flatten()

        original_dataset = original_dataset[smoothing:]
        original_dataset = original_dataset.reshape(-1,1)
        original_dataset = scaler.transform(original_dataset)
        original_dataset = original_dataset.flatten()

        if (sum(dataset)>90 and test_flag):
            dataset = pandas.rolling_mean(dataset, window=smoothing)
            #if (sum(dataset)>80):
            model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            #else:
            #    model = load_model('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            dataset = dataset[smoothing:]

            # normalize the dataset
            dataset = dataset.reshape(-1,1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            model.reset_states()

            totalX, totalY = create_dataset(original_dataset, dataset, meta, num_steps, look_back)
            #trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            #testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            #print(trainX.shape, trainY.shape, testX.shape, testY.shape)
            #print(numpy.shape(trainX), numpy.shape(testX), numpy.shape(dataset))

            # make predictions
            totalPredict = model.predict(totalX[18:], batch_size=batch_num)
            #model.reset_states()
            #test_dataset = numpy.concatenate((trainX[len(testX):], testX))
            #testPredict = model.predict(test_dataset, batch_size = batch_num)
            trainPredict = totalPredict[:-10]
            evaluatePredict = totalPredict[-10]
            testPredict = totalPredict[-1]
            model.reset_states()

            # invert predictions
            #testPredict = testPredict[-len(testX):]
            trainPredict = trainPredict.reshape(-1,1)
            trainPredict = scaler.inverse_transform(trainPredict)
            trainPredict = trainPredict.flatten()
            evaluatePredict = evaluatePredict.reshape(-1,1)
            evaluatePredict = scaler.inverse_transform(evaluatePredict)
            evaluatePredict = evaluatePredict.flatten()
            #trainY = trainY.reshape(-1,1)
            #trainY = scaler.inverse_transform(trainY)
            #trainY = trainY.flatten()
            testPredict = testPredict.reshape(-1,1)
            testPredict = scaler.inverse_transform(testPredict)
            testPredict = testPredict.flatten()

            # per-month prediction scaled up to a 9-month (num_steps) total
            prediction = testPredict[0]*9
            # observed vulnerability count over the corresponding 9 months of the raw series
            reality = sum(src2month[pkg_name][-13:-4])
            testerror = (reality - prediction)/reality
            print(pkg_name)
            print('prediction: ' + str(prediction))
            print('reality: ' + str(scaler.inverse_transform(totalY[-1])[0][0]*9) + ' = ' + str(reality))
            print('Normalized error: ' + str(testerror))
            print('#' * 20)

            #print('shapes coming')
            #print(testY.shape)
            #testY = testY.reshape(-1,1)
            #testY = scaler.inverse_transform(testY)
            #testY = testY.flatten()
            #print(testY.shape)

            ## Calculate average for comparison
            raw_av = src2month[pkg_name]
            #raw_av = raw_av[:len(dataset)-4]
            #i = 0
            #averagear = numpy.empty_like(trainY)
            #w_averagear = numpy.empty_like(trainY)
            #averagear[0] = 0
            #w_averagear[0] = 0
            #for i in range(1,len(averagear)):
            #    averagear[i] = sum(raw_av[:i])/float(i)
            #    w_averagear[i] = calc_waverage(raw_av[:i], lamda_w)
            #averagear_test = numpy.empty_like(testY)
            #w_averagear_test = numpy.empty_like(testY)
            #for i in range(0, len(averagear_test)):
            #    averagear_test[i] = sum(raw_av[:len(averagear) + i])/float(len(averagear) + i)
            #    w_averagear_test[i] = calc_waverage(raw_av[:len(averagear+i)], lamda_w)
            #print('average: ' + str(num_steps*average) + '\nweighted average: ' + str(num_steps*w_average))

            ## calculate root mean squared error
            # LSTM
            #trainScore = math.sqrt(mean_squared_error(trainY[-9:], trainPredict[-9:]))
            #print('Train Score: %.2f RMSE' % (trainScore))
            #testScore = math.sqrt(mean_squared_error(testY, testPredict))
            #print('Test Score: %.2f RMSE' % (testScore))
            # Average
            #trainScore_average = math.sqrt(mean_squared_error(trainY[-9:], averagear[-9:]))
            #testScore_average = math.sqrt(mean_squared_error(testY, averagear_test))
            #trainScore_w_average = math.sqrt(mean_squared_error(trainY[-9:], w_averagear[-9:]))
            #testScore_w_average = math.sqrt(mean_squared_error(testY, w_averagear_test))
            # Imitation
            #imit_train = numpy.copy(trainY)
            #imit_train = imit_train[:-1]
            #print(imit_train)
            #trainY_im = trainY[1:]
            #print(trainY_im)
            #imit_test = numpy.copy(testY)
            #imit_test = imit_test[:-1]
            #testY_im = testY[1:]
            #trainScore_imit = math.sqrt(mean_squared_error(trainY_im, imit_train))
            #testScore_imit = math.sqrt(mean_squared_error(testY_im, imit_test))

            # Calculate nrmse for certainty
            #nmax_train = numpy.amax(trainY[-24:])
            #nmin_train = numpy.amin(trainY[-24:])
            #nmax_test = numpy.amax(testY)
            #nmin_test = numpy.amin(testY)
            #nmax = 0
            #nmin = 0
            #if(nmax_train > nmax_test):
            #    nmax = nmax_train
            #else:
            #    nmax = nmax_test
            #if(nmin_train < nmin_test):
            #    nmin = nmin_train
            #else:
            #    nmin = nmin_test
            #normalizer = nmax - nmin
            #print('nmax: ' + str(nmax) + ' , nmin: ' + str(nmin) + ' , normalizer: ' + str(normalizer))

            # plot baseline and predictions
            #print(numpy.shape(trainY), numpy.shape(testY))
            #print(numpy.shape(trainPredict), numpy.shape(testPredict))
            #real_values = numpy.concatenate((trainY, testY), axis=0)
            #training_fit = numpy.empty_like(real_values)
            #training_fit[:] = numpy.nan
            #training_fit[:len(trainPredict)] = trainPredict
            #prediction = numpy.empty_like(real_values)
            #prediction[:] = numpy.nan
            #prediction[len(trainPredict):] = testPredict
            #print('Actual number of vulnerabilities - next ' + str(num_steps) + ' months :' + str(real_values[-1]*num_steps))
            #act = real_values[-1]*num_steps
            #print('Predicted number of vulnerabilities - next ' + str(num_steps) + ' months :' + str(prediction[-1]*num_steps))
            #lstm_pred = prediction[-1]*num_steps
            #av_pred = averagear_test[-1]*num_steps
            #w_av_pred = w_averagear_test[-1]*num_steps
            #print(real_values)
            #plt.plot(real_values)
            #plt.plot(training_fit)
            #plt.plot(prediction)
            #plt.show()

            ## Calculate better predictor
            #best = -1
            #min_test_error = min(testScore, testScore_average, testScore_w_average)
            #if min_test_error == testScore:
            #    best = 0
            #elif min_test_error == testScore_average:
            #    best = 1
            #elif min_test_error == testScore_w_average:
            #    best = 2
            #print("LSTM rmse: " + str(testScore))
            #print("Average rmse: " + str(testScore_average))
            #print("Weighted average rmse: " + str(testScore_w_average))

            ## Write to file
            #target.write(pkg_name + ', ' + str(prediction[-1]*num_steps) + ', ' +str(real_values[-1]*num_steps) + ', ' + str(nrmse_train) + ', ' + str(nrmse_test) + ', ' + str(num_steps*average) + ', ' + str(num_steps*w_average) + ', ' + str(best) + '\n')
            # if(actual != 0):
            #     norm_diff = abs(actual-predicted)/actual
            # else:
            #     norm_diff = actual-predicted
            #target2.write(pkg_name + ',' + str(normalizer) + ',' + str(trainScore/normalizer) + ',' + str(trainScore_average/normalizer) + ',' + str(trainScore_w_average/normalizer) + ',' + str(best) + ', ' + str(len(trainY)) + ', ' + str(len(testY)) + ', ' + str(act) + ', ' + str(lstm_pred) + ', ' + str(av_pred) + ', ' + str(w_av_pred) + '\n')