# lstm_reg2.py

import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)

# convert an array of values into a dataset matrix
def create_dataset(dataset, meta, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = []
        for j in range(i, i + look_back):
            # a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        dataY.append(dataset[i + look_back])
    return numpy.array(dataX), numpy.array(dataY)
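
# Hedged worked example (not in the original script): with look_back=2,
#   create_dataset(numpy.array([1.0, 2.0, 3.0, 4.0, 5.0]), meta=[], look_back=2)
# returns X with shape (2, 2, 1) -- [[[1], [2]], [[2], [3]]] -- and Y = [3., 4.],
# i.e. each window of look_back consecutive values predicts the next value.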

def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1
    ## Number of steps in the future to predict - also affects smoothing
    num_steps = 6
    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 8
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    look_back = 10

    # create the LSTM network
    model = Sequential()
    model.add(LSTM(32, input_shape=(look_back, feat_num), activation='relu', dropout=0.5))
    # model.add(LSTM(16, return_sequences=True))
    # model.add(LSTM(8))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # model.add(Dense(64, activation='relu'))
    # model.add(Dropout(0.5))
    # model.add(Dense(64, activation='relu'))
    # model.add(Dropout(0.5))
    # model.add(LSTM(4, input_dim=look_back-6, dropout_W=0.2, dropout_U=0.1))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
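
    # Shape sketch (inferred from the layers above): the network maps a batch of
    # windows of shape (samples, look_back, feat_num) to one scalar per window,
    # shape (samples, 1) -- the next month's smoothed, scaled vulnerability count.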

    scaler = MinMaxScaler(feature_range=(0, 1))
    # create the additional NN
    # model2 = Sequential()
    # model2.add
    scaler2 = MinMaxScaler(feature_range=(0, 1))
    scaler3 = MinMaxScaler(feature_range=(0, 1))

    test_scale = []
    for i in src2month:
        test_scale = numpy.concatenate((test_scale, src2month[i]))
        # for j in src2month[i]:
        #     test_scale.append(src2month[i][j])
    # MinMaxScaler expects a 2-D (samples, features) array, so fit on one column
    scaler.fit(numpy.array(test_scale).reshape(-1, 1))

    test_scale = []
    for i in src2pop:
        test_scale.append(src2pop[i][1])
    scaler2.fit(numpy.array(test_scale).reshape(-1, 1))

    test_scale = []
    for i in src2sloccount:
        test_scale.append(src2sloccount[i][0])
    scaler3.fit(numpy.array(test_scale).reshape(-1, 1))
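
    # Minimal MinMaxScaler illustration (standard sklearn behavior): fitting on a
    # column maps the minimum to 0 and the maximum to 1, e.g.
    #   MinMaxScaler().fit(numpy.array([[0.], [10.]])).transform([[5.]]) -> [[0.5]]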

    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    for i in range(0, 1):
        for pkg_name in src2month:
        # for pkg_name in ['chromium-browser', 'linux']:
        # for pkg_name in ['linux']:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            dataset = dataset[:len(dataset) - 8]
            print(len(dataset))

            if sum(dataset) > 50:
                # smooth with a rolling mean (pandas.rolling_mean was removed; use Series.rolling)
                dataset = pandas.Series(dataset).rolling(window=num_steps).mean().values
                ## drop the first num_steps values (the rolling mean leaves the leading entries as NaN)
                dataset = dataset[num_steps:]
                # normalize the dataset
                dataset = scaler.transform(dataset.reshape(-1, 1))[:, 0]
                train_size = int(len(dataset) * 0.90)
                test_size = len(dataset) - train_size
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))

                # get metadata
                meta = []
                try:
                    pop_vote = src2pop[pkg_name][1]
                except KeyError:
                    pop_vote = 0
                try:
                    slocs_total = src2sloccount[pkg_name][0]
                except KeyError:
                    slocs_total = 0
                pop_vote = scaler2.transform([[pop_vote]])
                slocs_total = scaler3.transform([[slocs_total]])
                meta.append(pop_vote)
                meta.append(slocs_total)

                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train, meta, look_back)
                testX, testY = create_dataset(test, meta, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                if len(total_trainX) == 0:
                    total_trainX = trainX
                    total_trainY = trainY
                else:
                    total_trainX = numpy.concatenate((total_trainX, trainX))
                    total_trainY = numpy.concatenate((total_trainY, trainY))
                # save to dict for later
                trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
                testXdict[pkg_name], testYdict[pkg_name] = testX, testY
                # fit the LSTM network
                # model.fit(trainX, trainY, epochs=50, batch_size=50, verbose=2)

    ###################################################################################################
    # fit the LSTM network
    # model.fit(total_trainX, total_trainY, epochs=50, batch_size=1000, verbose=2)
    model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    target = open('output5-3-lb' + str(look_back) + '.txt', 'w')
    target2 = open('REAL_LSTM_results' + str(look_back) + '.txt', 'w')

    # for pkg_name in ['chromium-browser', 'linux']:
    for pkg_name in src2month:
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset) - 8]
        if sum(dataset) > 50:
            trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            dataset = pandas.Series(dataset).rolling(window=num_steps).mean().values
            dataset = dataset[num_steps:]
            # normalize the dataset
            dataset = scaler.transform(dataset.reshape(-1, 1))[:, 0]

            # Freshen LSTM
            # train_size = int(len(dataset) * 0.90)
            # test_size = len(dataset) - train_size
            # train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
            # print(len(train), len(test))
            #
            # # reshape into X=t and Y=t+1
            # trainX, trainY = create_dataset(train, look_back)
            # testX, testY = create_dataset(test, look_back)
            #
            # # reshape input to be [samples, time steps, features]
            # trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            # testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
            #
            # # fit the LSTM network
            # model.fit(trainX, trainY, epochs=10, batch_size=1, verbose=2)

            # make predictions
            trainPredict = model.predict(trainX)
            testPredict = model.predict(testX)
            # invert predictions (scaler works on single-feature columns)
            trainPredict = scaler.inverse_transform(trainPredict)
            trainY = scaler.inverse_transform(trainY.reshape(-1, 1))
            testPredict = scaler.inverse_transform(testPredict)
            testY = scaler.inverse_transform(testY.reshape(-1, 1))
            print(str(len(dataset)))
            # calculate root mean squared error
            print('Package: ' + pkg_name)
            # print(trainY)
            # print(trainPredict)
            trainScore = math.sqrt(mean_squared_error(trainY[:, 0], trainPredict[:, 0]))
            print('Train Score: %.2f RMSE' % (trainScore))
            testScore = math.sqrt(mean_squared_error(testY[:, 0], testPredict[:, 0]))
            print('Test Score: %.2f RMSE' % (testScore))

            # shift train predictions for plotting
            trainPredictPlot = numpy.empty_like(dataset)
            trainPredictPlot[:] = numpy.nan
            trainPredictPlot[look_back:len(trainPredict) + look_back] = trainPredict[:, 0]
            # shift test predictions for plotting
            testPredictPlot = numpy.empty_like(dataset)
            testPredictPlot[:] = numpy.nan
            testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(dataset) - 1] = testPredict[:, 0]
            # plot baseline and predictions
            plt.plot(scaler.inverse_transform(dataset.reshape(-1, 1)))
            plt.plot(trainPredictPlot)
            plt.plot(testPredictPlot)
            plt.show()

            # output initial predictions
            # print('Actual number of vulnerabilities - last 3 months ' + pkg_name + ' ' + str(scaler.inverse_transform(dataset)[len(dataset)-2]))
            # print('Predicted number of vulnerabilities - last 3 months ' + pkg_name + ' ' + str(testPredictPlot[len(testPredictPlot)-2]))

            # save data to file
            target.write(pkg_name + '\n')
            target.write(str(dataset))
            target.write('\n')
            target.write(str(trainPredict))
            target.write(str(testPredict))
            target.write('\n')

            # load metadata again
            meta = []
            try:
                pop_vote = src2pop[pkg_name][1]
            except KeyError:
                pop_vote = 0
            try:
                slocs_total = src2sloccount[pkg_name][0]
            except KeyError:
                slocs_total = 0
            pop_vote = scaler2.transform([[pop_vote]])
            slocs_total = scaler3.transform([[slocs_total]])
            meta.append(pop_vote)
            meta.append(slocs_total)

            # make real predictions step by step, feeding each prediction back
            # into the input window for the next step
            l = len(dataset)
            pred_sofar = []
            for step in range(num_steps):
                input_list_pre = dataset[l - num_steps - 1 - look_back + step:l - num_steps - 1]
                input_list = []
                for dat in input_list_pre:
                    # input_list.append([dat] + meta)
                    input_list.append([dat])
                for dat in pred_sofar:
                    # input_list.append([dat] + meta)
                    input_list.append([dat])
                pred_input = numpy.array(input_list)
                pred_input = numpy.reshape(pred_input, (1, pred_input.shape[0], pred_input.shape[1]))
                print(pred_input)
                pred = model.predict(pred_input)
                print("Prediction:")
                print(scaler.inverse_transform(pred)[0][0])
                pred_sofar.append(pred[0][0])

            actual = int(round(num_steps * scaler.inverse_transform(dataset.reshape(-1, 1))[len(dataset) - 1, 0]))
            predicted = int(round(num_steps * scaler.inverse_transform(numpy.array(pred_sofar).reshape(-1, 1))[num_steps - 1, 0]))
            print('Actual Vulnerabilities - last ' + str(num_steps) + ' months ' + pkg_name + ': ' + str(actual))
            print('Predicted Vulnerabilities - last ' + str(num_steps) + ' months ' + pkg_name + ': ' + str(predicted))

            ## Calculate average for comparison
            raw_av = src2month[pkg_name]
            raw_av = raw_av[:len(raw_av) - 8]
            i = 0
            while i < len(raw_av) and raw_av[i] < 1:
                i = i + 1
            raw_av = raw_av[i:]
            if len(raw_av) == 0:
                average = 0
            else:
                average = num_steps * int(round(sum(raw_av) / float(len(raw_av))))

            ## Write to file
            if actual != 0:
                norm_diff = abs(actual - predicted) / actual
            else:
                norm_diff = actual - predicted
            target2.write(pkg_name + ',' + str(actual) + ',' + str(predicted) + ',' + str(average) + ',' + str(norm_diff) + '\n')

    target.close()
    target2.close()
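
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original script). The input shapes are
# assumed from how predict() indexes its arguments: src2month maps a source
# package name to a 1-D numpy array of monthly vulnerability counts,
# src2pop[name][1] is a popularity vote, src2sloccount[name][0] is a total SLOC
# count, and src2deps is accepted but unused. The demo_* data below is
# synthetic, only there to exercise the pipeline end to end; saving/loading the
# model requires h5py, and each qualifying package pops a matplotlib window.
if __name__ == '__main__':
    months = 240  # long enough that the 10% test split still yields windows
    demo_src2month = {
        'linux': numpy.random.poisson(3.0, months).astype(float),
        'chromium-browser': numpy.random.poisson(2.0, months).astype(float),
    }
    demo_src2pop = {'linux': (0, 50000), 'chromium-browser': (0, 30000)}
    demo_src2sloccount = {'linux': (15000000,), 'chromium-browser': (10000000,)}
    predict(demo_src2month, demo_src2sloccount, demo_src2pop, src2deps={})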