lstm_reg_NICE.py
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)

# convert an array of values into a dataset matrix
def create_dataset(dataset, meta, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = []
        for j in range(i, i+look_back):
            #a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        dataY.append(dataset[i + look_back])
    return numpy.array(dataX), numpy.array(dataY)
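
# Example (hypothetical data): with look_back=3,
#   create_dataset([10, 20, 30, 40, 50, 60], meta=[], look_back=3)
# yields dataX[0] = [[10], [20], [30]] and dataY[0] = 40 -- each sample is a
# sliding window of look_back values, labeled with the value that follows it.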

def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1
    ## Number of steps in the future to predict - also affects smoothing
    num_steps = 6

    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 8

    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    look_back = 3
    train_size = int(len(src2month['linux']) - 20)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - 9
    print("batch_num:")
    print(batch_num)

    # create the LSTM network
    model = Sequential()
    model.add(LSTM(20, batch_input_shape=(batch_num, look_back, feat_num), activation='relu', dropout_W=0.5, stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
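
    # Note: dropout_W= here and nb_epoch= below are Keras 1.x spellings; under
    # Keras 2 the rough equivalents would be dropout= on the LSTM layer and
    # epochs= in model.fit(), e.g.
    #   model.add(LSTM(20, batch_input_shape=(batch_num, look_back, feat_num),
    #                  activation='relu', dropout=0.5, stateful=True))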

    # fit one scaler per signal: monthly vulnerability counts, popularity
    # votes, and SLOC totals
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler2 = MinMaxScaler(feature_range=(0, 1))
    scaler3 = MinMaxScaler(feature_range=(0, 1))

    test_scale = []
    for i in src2month:
        test_scale = numpy.concatenate((test_scale, src2month[i]))
        # for j in src2month[i]:
        #     test_scale.append(src2month[i][j])
    scaler.fit(test_scale)

    test_scale = []
    for i in src2pop:
        test_scale.append(src2pop[i][1])
    scaler2.fit(test_scale)

    test_scale = []
    for i in src2sloccount:
        test_scale.append(src2sloccount[i][0])
    scaler3.fit(test_scale)
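
    # Note: fitting MinMaxScaler on 1-D data as above only works on old
    # scikit-learn releases; current versions require a 2-D column, e.g.
    #   scaler.fit(numpy.asarray(test_scale).reshape(-1, 1))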

    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    for i in range(0, 1):
        # for pkg_name in src2month:
        # for pkg_name in ['chromium-browser']:
        for pkg_name in ['linux']:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            # hold back the last 8 months
            dataset = dataset[:len(dataset)-8]
            print(len(dataset))

            # skip packages with too few vulnerabilities to learn from
            if sum(dataset) > 50:
                # smooth the series with a rolling mean
                dataset = pandas.rolling_mean(dataset, window=num_steps)
                ## drop the first num_steps entries (NaN-padded by the rolling mean)
                dataset = dataset[num_steps:]
                # normalize the dataset
                dataset = scaler.transform(dataset)
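
                # Note: pandas.rolling_mean() was removed in pandas 0.18+; on a
                # current pandas the equivalent (here and in the evaluation
                # section below) would be
                #   pandas.Series(dataset).rolling(window=num_steps).mean().values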

                train_size = len(dataset) - 20
                test_size = len(dataset) - train_size
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))

                # get metadata
                meta = []
                try:
                    pop_vote = src2pop[pkg_name][1]
                except KeyError:
                    pop_vote = 0
                try:
                    slocs_total = src2sloccount[pkg_name][0]
                except KeyError:
                    slocs_total = 0
                pop_vote = scaler2.transform([[pop_vote]])
                slocs_total = scaler3.transform([[slocs_total]])
                meta.append(pop_vote)
                meta.append(slocs_total)

                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train, meta, look_back)
                testX, testY = create_dataset(test, meta, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                print(numpy.shape(trainX), numpy.shape(testX))

                # if len(total_trainX) == 0:
                #     total_trainX = trainX
                #     total_trainY = trainY
                # else:
                #     total_trainX = numpy.concatenate((total_trainX, trainX))
                #     total_trainY = numpy.concatenate((total_trainY, trainY))

                # save to dict for later
                trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
                testXdict[pkg_name], testYdict[pkg_name] = testX, testY
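
                # Stateful training pattern: with stateful=True the LSTM keeps
                # its hidden state across batches, so the state is reset by hand
                # after each single-epoch fit; by construction len(trainX) here
                # works out to batch_num, the fixed batch size declared in
                # batch_input_shape above.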

                # fit the LSTM network
                for i in range(3000):
                    model.fit([trainX], [trainY], nb_epoch=1, batch_size=len(trainX), verbose=2, shuffle=False)
                    model.reset_states()
    ###################################################################################################

    # fit the LSTM network
    # model.fit([total_trainX], [total_trainY], nb_epoch=50, batch_size=1000, verbose=2)
    model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    target = open('output5-3-lb' + str(look_back) + '.txt', 'w')
    target2 = open('REAL_LSTM_results' + str(look_back) + '.txt', 'w')

    # for pkg_name in ['chromium-browser']:
    for pkg_name in ['linux']:
        # for pkg_name in src2month:
        model.reset_states()
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset)-8]
        if sum(dataset) > 50:
            trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            print(numpy.shape(trainX), numpy.shape(trainY))

            dataset = pandas.rolling_mean(dataset, window=num_steps)
            dataset = dataset[num_steps:]
            # normalize the dataset
            dataset = scaler.transform(dataset)

            # make predictions
            trainPredict = model.predict(trainX, batch_size=batch_num)
            model.reset_states()
            new = []
            test_batch_complete = []
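
            # Walk-forward forecasting: the first pass pads the front of testX
            # with the tail of trainX so the batch stays at batch_num; each
            # later pass drops the oldest window and appends the last look_back
            # predictions as the newest window, so forecasts beyond the first
            # step are fed by earlier forecasts.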
            for i in range(len(testX)):
                if new == []:
                    test_batch_complete = numpy.concatenate((trainX[len(testX):], testX))
                    testPredict = model.predict(test_batch_complete, batch_size=batch_num)
                    new = testPredict[len(testPredict)-3:len(testPredict)]
                    print(new)
                    model.reset_states()
                    testPredict = testPredict[len(trainX)-len(testX):]
                    print("shape: trainPredict, shape: testPredict")
                    print(numpy.shape(trainPredict), numpy.shape(testPredict))
                else:
                    test_batch_complete = test_batch_complete[1:]
                    print(numpy.shape(new))
                    print(numpy.shape(test_batch_complete))
                    test_batch_complete = numpy.append(test_batch_complete, [new], axis=0)
                    testPredict = model.predict(test_batch_complete, batch_size=batch_num)
                    new = testPredict[len(testPredict)-3:len(testPredict)]
                    model.reset_states()
                    testPredict = testPredict[len(trainX)-len(testX):]
                    print("shape: trainPredict, shape: testPredict")
                    print(numpy.shape(trainPredict), numpy.shape(testPredict))

            # invert predictions
            trainPredict = scaler.inverse_transform(trainPredict)
            trainY = scaler.inverse_transform([trainY])
            testPredict = scaler.inverse_transform(testPredict)
            testY = scaler.inverse_transform([testY])

            # calculate root mean squared error
            print('Package: ' + pkg_name)
            #print(trainY)
            #print(trainPredict)
            trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
            print('Train Score: %.2f RMSE' % (trainScore))
            testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
            print('Test Score: %.2f RMSE' % (testScore))

            # shift train predictions for plotting
            trainPredictPlot = numpy.empty_like(dataset)
            trainPredictPlot[:] = numpy.nan
            trainPredictPlot[look_back:len(trainPredict)+look_back] = trainPredict[:, 0]
            # shift test predictions for plotting
            testPredictPlot = numpy.empty_like(dataset)
            testPredictPlot[:] = numpy.nan
            testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1] = testPredict[:, 0]
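            # (the test curve starts at dataset index train_size + look_back,
            # which equals len(trainPredict) + look_back*2 + 1, so predicted
            # labels line up with the months they forecast)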

            # plot baseline and predictions
            plt.plot(scaler.inverse_transform(dataset))
            plt.plot(trainPredictPlot)
            plt.plot(testPredictPlot)
            plt.show()

            ## Calculate average for comparison
            # raw_av = src2month[pkg_name]
            # raw_av = raw_av[:len(dataset)-8]
            # i = 0
            # while(i < len(raw_av) and raw_av[i] < 1):
            #     i = i + 1
            #
            # raw_av = raw_av[i:]
            # if(len(raw_av) == 0):
            #     average = 0
            # else:
            #     average = num_steps * int(round(sum(raw_av) / float(len(raw_av))))
            #
            # ## Write to file
            # if(actual != 0):
            #     norm_diff = abs(actual-predicted)/actual
            # else:
            #     norm_diff = actual-predicted
            # target2.write(pkg_name + ',' + str(actual) + ',' + str(predicted) + ',' + str(average) + ',' + str(norm_diff) + '\n')