import numpy as np
import matplotlib.pyplot as plt
import pandas
import math
import sys
from keras import backend as K
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import GRU
from keras import optimizers as opt
from keras.layers import Dense
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import copy
import random
import arimaPred as arim
import paper_plots as carlosplt

np.random.seed(7)
EPSILON = 10 ** -12

def moving_average(a, n):
    ## code from Jaime
    ret = np.cumsum(a, dtype=float)
    ret[n:] = ret[n:] - ret[:-n]
    ret = np.concatenate((np.zeros(n-1), ret[n-1:]))
    return ret / n
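
# A minimal sketch of what moving_average returns, on toy input
# (hypothetical values, not part of the pipeline):
#
#   moving_average(np.array([1, 2, 3, 4]), n=2)
#   # -> array([0. , 1.5, 2.5, 3.5])
#
# The first n-1 entries are zero-padded so the output keeps the input length;
# entry i (for i >= n-1) is the mean of a[i-n+1 : i+1].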

# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
def create_dataset(original_dataset, dataset, num_steps, look_back=1, feat_num=1, extra_features=[]):
    dataX, dataY = [], []
    print('Making training length :', len(original_dataset), ' ; ', len(dataset))
    for i in range(len(dataset)-look_back-num_steps+1):
        a = []
        for j in range(i, i+look_back):
            if feat_num > 1:
                a.append([original_dataset[j]] + extra_features.tolist())
            else:
                a.append([original_dataset[j]])
        dataX.append(a)
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i+look_back+j]
        dataY.append(mean/num_steps)
    return np.array(dataX), np.array(dataY)
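
# A minimal sketch of the shapes create_dataset produces, on a toy 20-point
# series (hypothetical values):
#
#   toy = np.linspace(0.0, 1.0, 20)
#   X, y = create_dataset(toy, toy, num_steps=3, look_back=2)
#   # X.shape == (16, 2, 1): 16 windows, each look_back=2 steps of 1 feature
#   # y[i] is the mean of the num_steps=3 values after window i,
#   # e.g. y[0] == toy[2:5].mean()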

def test_model(pkg_name, original_dataset, dataset, scaler, num_steps, smoothing, batch_num, look_back, test_size, mixed, feat_num=1, extra_features=[]):
    ## mixed is a boolean. When True, the model trained on all the packages is
    ## used. When False, each package has its own model.
    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_features)
    K.clear_session()
    if mixed:
        new_model = load_model('./models/all_packages' + str(num_steps) + '.h5')
    else:
        new_model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
    new_model.reset_states()
    print(len(totalX))
    totalPredict = new_model.predict(totalX[len(totalX)-batch_num:], batch_size=batch_num)
    del new_model
    #scaler = scalers[pkg_name]
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    total_LSTM = [0] * (len(dataset)-len(totalPredict)) + totalPredict.tolist()
    #print(len(total_LSTM))
    #print(total_LSTM)
    #totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    temp = dataset.reshape(-1, 1)
    temp = scaler.inverse_transform(temp)
    temp = temp.flatten()
    # Sanity check: the last target must match the last smoothed point
    if math.fabs(totalY[-1] - temp[-1]) > EPSILON:
        print("Possible fault!!", totalY[-1], temp[-1])
    return total_LSTM[:]

def predict_stationary(original, dataset, num_steps):
    prediction = dict()
    for pkg in dataset:
        data = dataset[pkg]
        pred = []
        i = 0
        for month in data:
            if i < num_steps:
                pred.append(0)
            else:
                pred.append(data[i-num_steps])
            i += 1
        prediction[pkg] = pred[:]
    return prediction

def predict_average(original, dataset, num_steps):
    prediction = dict()
    for pkg in dataset:
        o_data = dataset[pkg]
        average_list = []
        i = 0
        j = i
        flag = True
        for month in o_data:
            if month == 0 and flag:
                average_list.append(0)
                i += 1
                j = i
            else:
                flag = False
                if len(o_data[i:j]) <= num_steps:
                    average_list.append(0)
                else:
                    # average up until before the time
                    average = sum(o_data[i:j-num_steps])/len(o_data[i:j-num_steps])
                    average_list.append(average)
                j += 1
        prediction[pkg] = average_list[:]
    return prediction
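
# A minimal sketch of predict_average on a toy series (hypothetical values):
# leading zero months are skipped via `flag`, and each later point is the mean
# of all available history up to num_steps months before it.
#
#   toy = {'pkg': [0, 0, 2, 4, 6, 8]}
#   predict_average(toy, toy, num_steps=1)
#   # -> {'pkg': [0, 0, 0, 0, 2.0, 3.0]}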

def predict_Waverage(original, dataset, Lamd, num_steps):
    prediction = dict()
    for pkg in dataset:
        o_data = dataset[pkg]
        waverage_list = []
        i = 0
        j = i
        flag = True
        for month in o_data:
            if month == 0 and flag:
                waverage_list.append(0)
                i += 1
                j = i
            else:
                flag = False
                if len(o_data[i:j]) <= num_steps:
                    waverage_list.append(0)
                else:
                    w_average = 0
                    weights = 0
                    jj = 0
                    local = o_data[i:j-num_steps]
                    for k in local:
                        w_average += k * math.exp(-(len(local) - jj - 1)/Lamd)
                        weights += math.exp(-(len(local) - jj - 1)/Lamd)
                        jj += 1
                    if weights == 0:
                        w_average = 0
                    else:
                        w_average = w_average/weights
                    waverage_list.append(w_average)
                j += 1
        prediction[pkg] = waverage_list[:]
    return prediction
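
# A minimal sketch of the exponential weighting above (hypothetical values):
# the most recent usable point gets weight exp(0) = 1, the one before it
# exp(-1/Lamd), and so on, so a larger Lamd gives a longer effective memory.
#
#   local = [2, 4]   # history for one prediction, with Lamd = 1
#   # w_average = (2*exp(-1) + 4*exp(0)) / (exp(-1) + exp(0)) ~= 3.46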

def train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler):
    model.set_weights(Wsave)
    ## Set the initial weights - remove if we want one model for all the packages
    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
    train, test = data[0:train_size], data[train_size:len(data)]
    if len(original_data) != len(data):
        return 1
    print('Training and test size in months: ', len(train), ' ; ', len(test))
    trainX, trainY = create_dataset(train_original, train, num_steps, look_back)
    testX, testY = create_dataset(test_original, test, num_steps, look_back)
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
    print(len(trainX), len(testX))
    print(len(trainY), len(testY))
    trainY = trainY.reshape(-1, 1)
    testY = testY.reshape(-1, 1)
    fig = plt.gcf()
    fig.clf()
    fig.show()
    fig.canvas.draw()
    training_steps = 400*25
    trainerror = []
    valerror = []
    for j in range(training_steps):
        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
        model.reset_states()
        if j % 25 == 0:
            calc_errors(model, batch_num, original_data, data, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, [])
            plt.plot(trainerror, color='blue')
            plt.plot(valerror, color='red')
            plt.title(pkg_name)
            fig.canvas.draw()
    try:
        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
        #model.save('./models/all_packages'+str(num_steps) + '.h5')
        del model
    except OSError:
        model.save('./models/unknown-' + str(num_steps) + '.h5')
        del model

def train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler, extra_features):
    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
    train, test = data[0:train_size], data[train_size:len(data)]
    if len(original_data) != len(data):
        return 1
    print('Training and test size in months: ', len(train), ' ; ', len(test))
    trainX, trainY = create_dataset(train_original, train, num_steps, look_back, feat_num, extra_features)
    testX, testY = create_dataset(test_original, test, num_steps, look_back, feat_num, extra_features)
    # reshape input to be [samples, time steps, features]
    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
    trainY = trainY.reshape(-1, 1)
    testY = testY.reshape(-1, 1)
    trainerror = []
    valerror = []
    model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
    model.reset_states()
    return 0

def calc_errors(model, batch_num, original_dataset, dataset, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, extra_feature):
    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_feature)
    totalX = totalX[len(totalX)-batch_num:]
    totalY = totalY[len(totalY)-batch_num:]
    model.reset_states()
    totalPredict = model.predict(totalX, batch_size=batch_num)
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    trainerror.append(mean_squared_error(totalPredict[50:-test_size], totalY[50:-test_size]))
    valerror.append(mean_squared_error(totalPredict[-test_size:], totalY[-test_size:]))
    return 0

def calc_errors_all(model, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features):
    temp1 = 0
    temp2 = 0
    train_temp = []
    test_temp = []
    for pkg in data_in:
        scaler = scalers[pkg]
        calc_errors(model, batch_num, original_data_in[pkg], data_in[pkg], num_steps, look_back, test_size, pkg, train_temp, test_temp, scaler, feat_num, extra_features[pkg])
    for errors in train_temp:
        temp1 += errors ** 2
    for errors in test_temp:
        temp2 += errors ** 2
    trainerror.append(math.sqrt(temp1)/len(data_in))
    valerror.append(math.sqrt(temp2)/len(data_in))
    return 0

def predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train):
    # Model parameters
    # Do testing?
    do_test = True
    # Number of different models to train
    models_num = 5
    # Look back steps
    look_back = 9
    # Number of LSTM neurons
    num_neurons = look_back
    num_neurons2 = look_back
    # in case we want to add more features in the future
    feat_num = 1
    data_size = len(dataset[first])
    print(data_size)
    train_size = int(data_size - test_size)
    batch_num = train_size - num_steps - look_back + 1
    print("batch_num: ", batch_num)
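    # Sketch of the batch arithmetic (illustrative, hypothetical sizes):
    # create_dataset yields train_size - look_back - num_steps + 1 windows, so
    # batch_num equals the number of training samples and the stateful LSTM
    # sees the whole training set as one batch; e.g. train_size=100,
    # num_steps=9, look_back=9 gives batch_num = 100 - 9 - 9 + 1 = 83.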
    ## Create the NN with Keras
    model = Sequential()
    #model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.4, stateful=True, return_sequences=True))
    model.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.5, stateful=True))
    #model.add(LSTM(num_neurons2, activation='relu', dropout=0.4, stateful=True))
    model.add(Dense(1))
    Adam = opt.Adam(lr=0.001)
    model.compile(loss='mean_squared_error', optimizer=Adam)
    Wsave = model.get_weights()
    pred_LSTM = dict()
    data_in = dict()
    original_data_in = dict()
    scalers = dict()
    ## Let's start preparing our data
    for pkg in dataset:
        data = dataset[pkg]
        data = data.reshape(-1, 1)
        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
        scalers[pkg].fit(data)
        ## We use a separate scaler for each package
        data = scalers[pkg].transform(data)
        data = data.flatten()
        original_data = original[pkg]
        original_data = original_data.reshape(-1, 1)
        original_data = scalers[pkg].transform(original_data)
        original_data = original_data.flatten()
        data_in[pkg] = data
        original_data_in[pkg] = original_data
        ## Compute the total number of reported vulnerabilities in case we need it later
        total_sum = sum(original[pkg])
    ## Let's start with the training - if we want to train at all...
    if do_train:
        ## Just a test to have one LSTM for all packages
        for i in range(1):
            for pkg in dataset:
                ## random selection for mixed training
                ## CHANGE for dedicated models
                #pkg = random.choice(list(dataset.keys()))
                data = data_in[pkg]
                original_data = original_data_in[pkg]
                train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg, scalers[pkg])
    if do_test:
        for pkg in dataset:
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            pred_LSTM[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, False)
    return pred_LSTM

def predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train):
    # Model parameters
    do_test = True
    # Look back steps
    look_back = 9
    feat_num = 1 + len(dataset)
    # Number of LSTM neurons
    num_neurons = feat_num + 10
    num_neurons2 = look_back
    # in case we want to add more features in the future
    extra_features = dict()
    ## Training steps
    training_steps = 600*25
    ## Use one extra feature per package to signal package identity (one-hot)
    i = 0
    for pkg in dataset:
        extra_features[pkg] = np.asarray([0]*i + [1] + [0]*(len(dataset)-i-1))
        i += 1
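    # Sketch of the identity feature above (hypothetical 3-package example):
    # with dataset keys ['a', 'b', 'c'], extra_features would be
    #   'a' -> [1, 0, 0], 'b' -> [0, 1, 0], 'c' -> [0, 0, 1],
    # so each input row carries the scaled count plus a one-hot package id.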
    data_size = len(dataset[first])
    print(data_size)
    train_size = int(data_size - test_size)
    batch_num = train_size - num_steps - look_back + 1
    print("batch_num: ", batch_num)
    ## Create the NN with Keras
    model2 = Sequential()
    #model2.add(Dense(units=num_neurons, activation='relu', batch_input_shape=(batch_num, look_back, feat_num)))
    model2.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.4, stateful=True, return_sequences=True))
    #model2.add(LSTM(num_neurons, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout=0.1, stateful=True))
    model2.add(LSTM(num_neurons, activation='relu', dropout=0.1, stateful=True))
    #model2.add(Dense(num_neurons))
    model2.add(Dense(1))
    Adam = opt.Adam(lr=0.001)
    model2.compile(loss='mean_squared_error', optimizer=Adam)
    Wsave = model2.get_weights()
    pred_LSTM_all = dict()
    data_in = dict()
    original_data_in = dict()
    scalers = dict()
    ## Let's start preparing our data
    for pkg in dataset:
        data = dataset[pkg]
        data = data.reshape(-1, 1)
        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
        scalers[pkg].fit(data)
        ## We use a separate scaler for each package
        data = scalers[pkg].transform(data)
        data = data.flatten()
        original_data = original[pkg]
        original_data = original_data.reshape(-1, 1)
        original_data = scalers[pkg].transform(original_data)
        original_data = original_data.flatten()
        data_in[pkg] = data
        original_data_in[pkg] = original_data
        ## Compute the total number of reported vulnerabilities in case we need it later
        total_sum = sum(original[pkg])
    ## Let's start with the training - if we want to train at all...
    if do_train:
        ## Just a test to have one LSTM for all packages
        fig = plt.gcf()
        fig.clf()
        fig.show()
        fig.canvas.draw()
        trainerror = []
        valerror = []
        for i in range(training_steps):
            ## random selection for mixed training
            pkg = random.choice(list(dataset.keys()))
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model2, Wsave, feat_num, look_back, pkg, scalers[pkg], extra_features[pkg])
            if i % 25 == 0:
                calc_errors_all(model2, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features)
                plt.plot(trainerror, color='blue')
                plt.plot(valerror, color='red')
                plt.title('Mixed')
                fig.canvas.draw()
        try:
            model2.save('./models/all_packages' + str(num_steps) + '.h5')
            del model2
        except OSError:
            model2.save('./models/unknown-' + str(num_steps) + '.h5')
    if do_test:
        for pkg in dataset:
            data = data_in[pkg]
            original_data = original_data_in[pkg]
            pred_LSTM_all[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, True, feat_num, extra_features[pkg])
    return pred_LSTM_all

def load_dataset_smoothed(src2month, pkglist, smoothing):
    temp = dict()
    original = dict()
    for pkg in pkglist:
        temp1 = np.asarray(src2month[pkg])
        temp2 = temp1
        # Smooth the time-series with a moving average
        temp1 = moving_average(temp1, n=smoothing)
        ## Cut off the leading (zero-padded) part of both series
        temp1 = temp1[smoothing:]
        temp2 = temp2[smoothing:]
        temp[pkg] = temp1
        original[pkg] = temp2
    return (original, temp)
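
# A minimal sketch of the alignment (hypothetical values): with smoothing=3,
# both series drop their first 3 months, so original[pkg][i] is the raw count
# for a month and temp[pkg][i] is the mean of that month and the 2 before it.
#
#   src = {'pkg': [5, 1, 3, 2, 4, 6]}
#   original, smoothed = load_dataset_smoothed(src, ['pkg'], 3)
#   # original['pkg'] -> array([2, 4, 6])
#   # smoothed['pkg'] -> array([2., 3., 4.])  # means of (1,3,2), (3,2,4), (2,4,6)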

def print_all(data, pkglist):
    plt.figure(1)
    i = 1
    for pkg in pkglist:
        plt.subplot(2, 5, i)
        plt.plot(data[pkg], label=pkg)
        plt.legend()
        i += 1
    plt.show()

def print_pred(data, pred):
    plt.plot(data, color='blue', label='reality')
    plt.plot(pred, color='red', label='model')
    plt.legend()
    plt.show()

def print_all_pred(data, pred, pkglist, test_size):
    carlosplt.pre_paper_plot(True)
    plt.figure(1)
    i = 1
    ## Build x axis
    quartersx = []
    for y in range(2, 19):
        start = 1
        end = 5
        if y == 2:
            start = 2
        elif y == 19:
            end = 2
        for j in range(start, end):
            if j == 1 and y % 2 == 1:
                quartersx.append('\'' + str(y).zfill(2))
            else:
                quartersx.append(' ')
    for pkg in pkglist:
        data_local = data[pkg]
        data_local = data_local.flatten()
        ax = plt.subplot(1, 5, i)
        data_train = data_local[:-test_size]
        data_test = data_local[-test_size:]
        if pkg == 'firefox-esr':
            pkg_label = 'firefox'
        elif pkg == 'chromium-browser':
            pkg_label = 'chromium'
        elif pkg == 'openjdk-8':
            pkg_label = 'openjdk'
        else:
            pkg_label = pkg
        x_axis_train = []
        x_axis_test = []
        for j in range(len(data_train)):
            x_axis_train.append(j)
        for j in range(len(data_test)):
            x_axis_test.append(j + len(data_train))
        x_axis_all = x_axis_train + x_axis_test
        if i == 5:
            train = ax.plot(x_axis_train, data_train, color='grey', label='real-tr')
            test = ax.plot(x_axis_test, data_test, color='coral', label='real-te')
            model = ax.plot(x_axis_all, pred[pkg], color='blue', label='model')
            ax.legend()
        else:
            ax.plot(x_axis_train, data_train, color='grey')
            ax.plot(x_axis_test, data_test, color='coral')
            ax.plot(x_axis_all, pred[pkg], color='blue')
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.set_title(pkg_label)
        plt.xticks(np.arange(1, len(pred[pkg]), 3.0) + 1, quartersx, rotation="vertical")
        i += 1
    carlosplt.post_paper_plot(True, True, True)
    plt.show()

def normalize_errors(errors_list, pkglist, dataset, test_size):
    ## Normalize each package's error by the range of its test data
    for pkg in errors_list:
        maximum = np.amax(dataset[pkg][-test_size:])
        minimum = np.amin(dataset[pkg][-test_size:])
        norm = maximum - minimum
        if norm < 0.1:
            norm = 0.1
        errors_list[pkg] = errors_list[pkg]/norm

def compute_errors_perpackage(prediction, dataset, test_size):
    temp_errors = dict()
    for pkg in prediction:
        temp_errors[pkg] = math.sqrt(mean_squared_error(prediction[pkg][-test_size:], dataset[pkg][-test_size:]))
    return temp_errors

def compute_errors_train_perpackage(prediction, dataset, test_size):
    temp_errors = dict()
    for pkg in prediction:
        temp_errors[pkg] = math.sqrt(mean_squared_error(prediction[pkg][:-test_size], dataset[pkg][:-test_size]))
    return temp_errors

def find_best_Lamd(original, dataset, num_steps, test_size):
    pred_Waverage_temp = dict()
    pred_Waverage = dict()
    errors = dict()
    best_errors = dict()
    best_lamdas = dict()
    dataset_temp = dict()
    pred_Waverage = predict_Waverage(original, dataset, 1, num_steps)
    errors[1] = compute_errors_train_perpackage(pred_Waverage, dataset, test_size)
    best_errors = errors[1]
    for pkg in dataset:
        best_lamdas[pkg] = 1
    ## Grid-search Lamd on the training error, separately for each package
    for Lamd in range(1, 100):
        pred_Waverage_temp = predict_Waverage(original, dataset, Lamd, num_steps)
        errors[Lamd] = compute_errors_train_perpackage(pred_Waverage_temp, dataset, test_size)
        for pkg in pred_Waverage_temp:
            print(errors[Lamd][pkg])
            if errors[Lamd][pkg] < best_errors[pkg]:
                best_errors[pkg] = errors[Lamd][pkg]
                best_lamdas[pkg] = Lamd
    for pkg in dataset:
        pred_Waverage[pkg] = predict_Waverage(original, dataset, best_lamdas[pkg], num_steps)[pkg]
    print(best_lamdas)
    return pred_Waverage

def do_training_errors(predictions_list, errors, dataset, test_size):
    ## First for each package
    for method in predictions_list:
        temp_dict = dict()
        try:
            temp_dict = compute_errors_train_perpackage(predictions_list[method], dataset, test_size)
            errors[method] = temp_dict
        except Exception:
            print('Predictions missing')

def do_testing_errors(predictions_list, errors, dataset, test_size):
    for method in predictions_list:
        temp_dict = dict()
        try:
            temp_dict = compute_errors_perpackage(predictions_list[method], dataset, test_size)
            errors[method] = temp_dict
        except Exception:
            print('Predictions missing')

def calculate_rmse(errors):
    temp = 0
    for pkg in errors:
        temp += errors[pkg]*errors[pkg]
    temp = math.sqrt(temp)/len(errors)
    return temp

def calculate_mean(errors):
    temp = 0
    for pkg in errors:
        temp += errors[pkg]
    temp = temp/len(errors)
    return temp
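
# A minimal sketch of the two aggregates (hypothetical per-package errors):
#
#   errs = {'a': 3.0, 'b': 4.0}
#   calculate_rmse(errs)  # -> sqrt(9 + 16)/2 = 2.5
#   calculate_mean(errs)  # -> 3.5
#
# Note that calculate_rmse divides by len(errors) after taking the square
# root, i.e. sqrt(sum of squares)/N rather than the textbook sqrt(mean).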

def print_summary(training_errors, testing_errors):
    print('#'*80)
    print('***** REPORT *****')
    for method in training_errors:
        print(method)
        print('Training Errors rmse: ', '%.3f' % training_errors[method]['rmse'])
        print('Testing Errors rmse: ', '%.3f' % testing_errors[method]['rmse'])
        print('Training Errors mean: ', '%.3f' % training_errors[method]['mean'])
        print('Testing Errors mean: ', '%.3f' % testing_errors[method]['mean'])
        print('#'*80)
    return 0

def predict(src2month, k):
    pkglist = []
    for pkg in src2month:
        if sum(src2month[pkg]) > 50:
            pkglist.append(pkg)
    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'wireshark', 'openjdk-8', 'mysql-transitional', 'php7.0', 'imagemagick', 'tcpdump']
    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'openjdk-8']
    pkglist = ['icedove']
    first = pkglist[0]
    #pkglist = [first]
    # Number of months in the future
    num_steps = 9
    smoothing = num_steps
    # Test dataset size in months
    test_size = 18
    real_test_size = 18
    do_train = False
    dataset = dict()
    # Cut out the end of 2018
    for pkg in pkglist:
        ## This is for training
        if do_train:
            src2month[pkg] = src2month[pkg][:-9-real_test_size]
        ## This is for experiments
        else:
            src2month[pkg] = src2month[pkg][test_size:-9]
    (original, dataset) = load_dataset_smoothed(src2month, pkglist, smoothing)
    # Each point of dataset is the mean of the same point and the previous smoothing-1 points of the original
    num_packages = len(pkglist)
    # Print all smoothed time-series
    #print_all(dataset, pkglist)
    ## Make the simple predictions (stationary, average, Waverage)
    predictions_list = dict()
    predictions_list['stationary'] = predict_stationary(original, dataset, num_steps)
    predictions_list['average'] = predict_average(original, dataset, num_steps)
    predictions_list['Waverage'] = find_best_Lamd(original, dataset, num_steps, test_size)
    #predictions_list['LSTM'] = predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train)
    #predictions_list['LSTM_all'] = predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train)
    #print_all_pred(dataset, predictions_list['LSTM'], pkglist, test_size)
    #pkglist_new = ['linux', 'firefox-esr', 'chromium-browser', 'icedove']
    pkglist_new = pkglist
    print_all_pred(dataset, predictions_list['Waverage'], pkglist_new, test_size)
    training_errors = dict()
    ## Dictionary of training errors, e.g. training_errors['LSTM']['linux'] = XXX
    testing_errors = dict()
    ## Same for testing errors
    new_predictions_list = dict()
    ## For which packages do we compute the error?
    for method in predictions_list:
        new_predictions_list[method] = dict()
        for pkg in predictions_list[method]:
            if sum(src2month[pkg]) > 200:
                new_predictions_list[method][pkg] = predictions_list[method][pkg]
    print(new_predictions_list)
    do_training_errors(new_predictions_list, training_errors, dataset, test_size)
    do_testing_errors(new_predictions_list, testing_errors, dataset, test_size)
    ## Now compute the rmse among the packages. But first normalize; choose whether we want this or not.
    for method in training_errors:
        normalize_errors(training_errors[method], pkglist, dataset, test_size)
        normalize_errors(testing_errors[method], pkglist, dataset, test_size)
    for pkg in training_errors['average']:
        print('#'*80)
        print(pkg)
        print('Training errors:')
        temp_list = []
        for method in training_errors:
            string = method + ': ' + str(training_errors[method][pkg]) + ' , '
            temp_list.append(string)
        print(temp_list)
        temp_list = []
        for method in training_errors:
            string = method + ': ' + str(testing_errors[method][pkg])
            temp_list.append(string)
        print('Testing errors:')
        print(temp_list)
    ## Now it is time for the rmse among the packages
    for method in testing_errors:
        testing_errors[method]['rmse'] = calculate_rmse(testing_errors[method])
        testing_errors[method]['mean'] = calculate_mean(testing_errors[method])
        training_errors[method]['rmse'] = calculate_rmse(training_errors[method])
        training_errors[method]['mean'] = calculate_mean(training_errors[method])
    print_summary(training_errors, testing_errors)
    return