lstm_reg.py

import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)

# convert an array of values into a dataset matrix
# ATTENTION: the returned arrays are shorter than the input series
def create_dataset(original_dataset, dataset, meta, num_steps, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1 - num_steps):
        a = []
        for j in range(i, i + look_back):
            #a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        # target: mean of the next num_steps values of the unsmoothed series
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return numpy.array(dataX), numpy.array(dataY)
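
# A minimal, illustrative check of the windowing above (toy data, not part of
# the pipeline; this helper is never called): with look_back=3 and num_steps=2,
# each sample is a (3, 1) window and each target is the mean of the following
# two raw values.
def _example_create_dataset():
    series = numpy.arange(10, dtype=float)      # toy series 0..9
    X, y = create_dataset(series, series, [], num_steps=2, look_back=3)
    assert X.shape == (4, 3, 1)                 # 10 - 3 - 1 - 2 = 4 samples
    assert y[0] == (series[3] + series[4]) / 2  # mean of the next 2 values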

## Calculate weighted average for comparison
def calc_waverage(raw_av, lamda_w):
    w_average = 0
    weights = 0
    if len(raw_av) == 0:
        w_average = 0
    else:
        # exponentially decaying weights: the most recent value gets weight 1
        jj = 0
        for j in raw_av:
            w_average += j * math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            weights += math.exp(-(len(raw_av) - jj - 1) / lamda_w)
            jj += 1
        w_average = w_average / weights
    return w_average
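
# A quick, illustrative check of the decay behaviour (toy values; this helper
# is never called): with lamda_w=12 the weights for [1, 2, 3] are exp(-2/12),
# exp(-1/12) and exp(0), so the result lands between the plain mean (2.0) and
# the most recent observation (3.0).
def _example_calc_waverage():
    w = calc_waverage([1.0, 2.0, 3.0], 12)
    assert 2.0 < w < 3.0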

def predict(src2month, src2sloccount, src2pop, src2deps):
    """Train and evaluate a stateful LSTM on per-package monthly vulnerability
    counts (src2month) and compare it against average-based baselines.
    src2sloccount, src2pop and src2deps are only touched by the commented-out
    metadata code below."""
    ## Number of features
    feat_num = 1
    ## Model parameters
    do_train = False
    num_steps = 9
    smoothing = num_steps
    num_neurons = 10
    look_back = 3
    train_flag = True
    test_flag = True
    lamda_w = 12
    init_test_size = 18
    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 4
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()
    train_size = int(len(src2month['linux']) - init_test_size)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - smoothing - 5
    print("batch_num:")
    print(batch_num)
    # create the LSTM network
    model = Sequential()
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.5, stateful=True))
    # model.add(LSTM(num_neurons, dropout=0.0, recurrent_dropout=0.4, implementation=1, return_sequences=False, return_state=False, go_backwards=False, stateful=True, unroll=False))
    # model.add(Dense(32, activation='relu'))
    # model.add(Dense(16, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # remember the freshly initialised weights so the model can be reset per package
    Wsave = model.get_weights()
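    # Note on statefulness (explanatory addition, not in the original source):
    # with stateful=True the batch size is fixed by batch_input_shape, so the
    # fit()/predict() calls below pass an explicit batch size, and
    # reset_states() is called whenever a new package's series begins.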
    scaler = MinMaxScaler(feature_range=(0, 1))
    #scaler2 = MinMaxScaler(feature_range=(0,1))
    #scaler3 = MinMaxScaler(feature_range=(0,1))
    test_scale = []
    # for i in src2month:
    #     test_scale = numpy.concatenate((test_scale, src2month[i]))
    #     for j in src2month[i]:
    #         test_scale.append(src2month[i][j])
    #test_scale = []
    #for i in src2pop:
    #    test_scale.append(src2pop[i][1])
    #scaler2.fit(test_scale)
    #test_scale = []
    #for i in src2sloccount:
    #    test_scale.append(src2sloccount[i][0])
    #scaler3.fit(test_scale)
    total_trainX = []
    total_trainY = []
    flag = True
    ###################################################################################################
    # for pkg_name in ['chromium-browser']:
    for i in range(1):
        # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
        for pkg_name in src2month:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            # drop the last 4 months of the series
            dataset = dataset[:len(dataset)-4]
            print(dataset.shape)
            dataset = dataset.reshape(-1, 1)
            scaler.fit(dataset)
            print(dataset.shape)
            dataset = dataset.flatten()
            print(dataset.shape)
            print(len(dataset))
            #
            if sum(dataset) > 30:
                # reset or not between training
                model.set_weights(Wsave)
                original_dataset = dataset
                dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
                ## omit the first entries, which the rolling mean leaves undefined
                original_dataset = original_dataset[smoothing:]
                dataset = dataset[smoothing:]
                # normalize the dataset
                dataset = dataset.reshape(-1, 1)
                dataset = scaler.transform(dataset)
                dataset = dataset.flatten()
                original_dataset = original_dataset.reshape(-1, 1)
                original_dataset = scaler.transform(original_dataset)
                original_dataset = original_dataset.flatten()
                train_size = len(dataset) - init_test_size
                test_size = len(dataset) - train_size
                train_original, test_original = original_dataset[0:train_size], original_dataset[train_size:len(dataset)]
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))
                # get metadata
                meta = []
                #try:
                #    pop_vote = src2pop[pkg_name][1]
                #except KeyError:
                #    pop_vote = 0
                #try:
                #    slocs_total = src2sloccount[pkg_name][0]
                #except KeyError:
                #    slocs_total = 0
                #pop_vote = scaler2.transform([[pop_vote]])
                #slocs_total = scaler3.transform([[slocs_total]])
                #meta.append(pop_vote)
                #meta.append(slocs_total)
                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train_original, train, meta, num_steps, look_back)
                testX, testY = create_dataset(test_original, test, meta, num_steps, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                trainY.reshape(-1, 1)
                testY.reshape(-1, 1)
                #if len(total_trainX) == 0:
                #    total_trainX = trainX
                #    total_trainY = trainY
                #else:
                #    total_trainX = numpy.concatenate((total_trainX, trainX))
                #    total_trainY = numpy.concatenate((total_trainY, trainY))
                # save to dict for later
                trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
                testXdict[pkg_name], testYdict[pkg_name] = testX, testY
                # fit the LSTM network
                if do_train:
                    for i in range(20000):
                        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
                        model.reset_states()
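                    # Explanatory note (added): fitting one epoch at a time
                    # with shuffle=False and an explicit reset_states() after
                    # each pass is the usual pattern for stateful LSTMs, whose
                    # hidden state otherwise persists across epochs.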
                    try:
                        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                    except OSError:
                        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #else:
                #    try:
                #        model.save('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
                #    except OSError:
                #        model.save('./models/unknown-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
    # model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    ###################################################################################################
    # target = open('output-Errors-ALLPACKAGES-NEW' + str(num_steps) + 'smoothing' + str(smoothing) + 'neurons' + str(num_neurons) + '.txt', 'w')
    target2 = open('results_paper' + str(num_steps) + '.txt', 'w')
    # for pkg_name in ['chromium-browser', 'firefox-esr', 'linux']:
    # for pkg_name in ['libpng']:
    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset)-4]
        dataset = dataset.reshape(-1, 1)
        scaler.fit(dataset)
        dataset = dataset.flatten()
        if sum(dataset) > 30 and test_flag:
            dataset = pandas.Series(dataset).rolling(window=smoothing).mean().values
            #if (sum(dataset)>80):
            model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            #else:
            #    model = load_model('./models/low_together' + '-' + str(num_steps) + 'smoothing' + str(smoothing) + '.h5')
            dataset = dataset[smoothing:]
            model.reset_states()
            trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            print(trainX.shape, trainY.shape, testX.shape, testY.shape)
            #
            # normalize the dataset
            dataset = dataset.reshape(-1, 1)
            dataset = scaler.transform(dataset)
            dataset = dataset.flatten()
            print(numpy.shape(trainX), numpy.shape(testX), numpy.shape(dataset))
            # make predictions
            trainPredict = model.predict(trainX, batch_size=batch_num)
            model.reset_states()
            # prepend the tail of trainX so the batch is full-sized and the
            # stateful LSTM is warmed up before it reaches the test windows
            test_dataset = numpy.concatenate((trainX[len(testX):], testX))
            testPredict = model.predict(test_dataset, batch_size=batch_num)
            model.reset_states()
            # invert predictions
            testPredict = testPredict[-len(testX):]
            trainPredict = trainPredict.reshape(-1, 1)
            trainPredict = scaler.inverse_transform(trainPredict)
            trainPredict = trainPredict.flatten()
            trainY = trainY.reshape(-1, 1)
            trainY = scaler.inverse_transform(trainY)
            trainY = trainY.flatten()
            testPredict = testPredict.reshape(-1, 1)
            testPredict = scaler.inverse_transform(testPredict)
            testPredict = testPredict.flatten()
            print('shapes coming')
            print(testY.shape)
            testY = testY.reshape(-1, 1)
            testY = scaler.inverse_transform(testY)
            testY = testY.flatten()
            print(testY.shape)
            ## Calculate average for comparison
            raw_av = src2month[pkg_name]
            raw_av = raw_av[:len(dataset)-4]
            i = 0
            averagear = numpy.empty_like(trainY)
            w_averagear = numpy.empty_like(trainY)
            averagear[0] = 0
            w_averagear[0] = 0
            for i in range(1, len(averagear)):
                averagear[i] = sum(raw_av[:i]) / float(i)
                w_averagear[i] = calc_waverage(raw_av[:i], lamda_w)
            averagear_test = numpy.empty_like(testY)
            w_averagear_test = numpy.empty_like(testY)
            for i in range(0, len(averagear_test)):
                averagear_test[i] = sum(raw_av[:len(averagear) + i]) / float(len(averagear) + i)
                w_averagear_test[i] = calc_waverage(raw_av[:len(averagear) + i], lamda_w)
            print(pkg_name)
            #print('average: ' + str(num_steps*average) + '\nweighted average: ' + str(num_steps*w_average))
            ## calculate root mean squared error
            # LSTM
            trainScore = math.sqrt(mean_squared_error(trainY, trainPredict))
            #print('Train Score: %.2f RMSE' % (trainScore))
            testScore = math.sqrt(mean_squared_error(testY, testPredict))
            #print('Test Score: %.2f RMSE' % (testScore))
            # Average
            trainScore_average = math.sqrt(mean_squared_error(trainY, averagear))
            testScore_average = math.sqrt(mean_squared_error(testY, averagear_test))
            trainScore_w_average = math.sqrt(mean_squared_error(trainY, w_averagear))
            testScore_w_average = math.sqrt(mean_squared_error(testY, w_averagear_test))
            # Imitation: naive baseline that repeats the previous value
            imit_train = numpy.copy(trainY)
            imit_train = imit_train[:-1]
            #print(imit_train)
            trainY_im = trainY[1:]
            #print(trainY_im)
            imit_test = numpy.copy(testY)
            imit_test = imit_test[:-1]
            testY_im = testY[1:]
            trainScore_imit = math.sqrt(mean_squared_error(trainY_im, imit_train))
            testScore_imit = math.sqrt(mean_squared_error(testY_im, imit_test))
            # Calculate nrmse for certainty
            nmax_train = numpy.amax(trainY[-24:])
            nmin_train = numpy.amin(trainY[-24:])
            nmax_test = numpy.amax(testY)
            nmin_test = numpy.amin(testY)
            nmax = 0
            nmin = 0
            if nmax_train > nmax_test:
                nmax = nmax_train
            else:
                nmax = nmax_test
            if nmin_train < nmin_test:
                nmin = nmin_train
            else:
                nmin = nmin_test
            normalizer = nmax - nmin
            print('nmax: ' + str(nmax) + ' , nmin: ' + str(nmin) + ' , normalizer: ' + str(normalizer))
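            # Explanatory note (added): dividing an RMSE by this (max - min)
            # range yields a normalised RMSE (NRMSE), which makes the scores
            # written below comparable across packages with very different
            # vulnerability counts.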
            # plot baseline and predictions
            #print(numpy.shape(trainY), numpy.shape(testY))
            #print(numpy.shape(trainPredict), numpy.shape(testPredict))
            real_values = numpy.concatenate((trainY, testY), axis=0)
            training_fit = numpy.empty_like(real_values)
            training_fit[:] = numpy.nan
            training_fit[:len(trainPredict)] = trainPredict
            prediction = numpy.empty_like(real_values)
            prediction[:] = numpy.nan
            prediction[len(trainPredict):] = testPredict
            print('Actual number of vulnerabilities - next ' + str(num_steps) + ' months: ' + str(real_values[-1]*num_steps))
            act = real_values[-1]*num_steps
            print('Predicted number of vulnerabilities - next ' + str(num_steps) + ' months: ' + str(prediction[-1]*num_steps))
            lstm_pred = prediction[-1]*num_steps
            av_pred = averagear_test[-1]*num_steps
            w_av_pred = w_averagear_test[-1]*num_steps
            #print(real_values)
            plt.plot(real_values)
            plt.plot(training_fit)
            plt.plot(prediction)
            plt.show()
            ## Determine the best predictor (0 = LSTM, 1 = average, 2 = weighted average)
            best = -1
            min_test_error = min(testScore, testScore_average, testScore_w_average)
            if min_test_error == testScore:
                best = 0
            elif min_test_error == testScore_average:
                best = 1
            elif min_test_error == testScore_w_average:
                best = 2
            print("LSTM rmse: " + str(testScore))
            print("Average rmse: " + str(testScore_average))
            print("Weighted average rmse: " + str(testScore_w_average))
            ## Write to file
            #target.write(pkg_name + ', ' + str(prediction[-1]*num_steps) + ', ' + str(real_values[-1]*num_steps) + ', ' + str(nrmse_train) + ', ' + str(nrmse_test) + ', ' + str(num_steps*average) + ', ' + str(num_steps*w_average) + ', ' + str(best) + '\n')
            # if(actual != 0):
            #     norm_diff = abs(actual-predicted)/actual
            # else:
            #     norm_diff = actual-predicted
            target2.write(pkg_name + ',' + str(normalizer) + ',' + str(testScore/normalizer) + ',' + str(testScore_average/normalizer) + ',' + str(testScore_w_average/normalizer) + ',' + str(best) + ', ' + str(len(trainY)) + ', ' + str(len(testY)) + ', ' + str(act) + ', ' + str(lstm_pred) + ', ' + str(av_pred) + ', ' + str(w_av_pred) + '\n')