# lstm_reg.py
  1. import numpy
  2. import matplotlib.pyplot as plt
  3. import pandas
  4. import math
  5. from keras.models import Sequential
  6. from keras.layers import Dense
  7. from keras.layers import LSTM
  8. from keras.layers import Activation
  9. from sklearn.preprocessing import MinMaxScaler
  10. from sklearn.metrics import mean_squared_error
  11. numpy.random.seed(7)
  12. # convert an array of values into a dataset matrix
  13. def create_dataset(dataset, look_back=1):
  14. dataX, dataY = [], []
  15. for i in range(len(dataset)-look_back-1):
  16. a = dataset[i:(i+look_back)]
  17. dataX.append(a)
  18. dataY.append(dataset[i + look_back])
  19. return numpy.array(dataX), numpy.array(dataY)
  20. def predict(src2month):
  21. pkg_num = len(src2month)
  22. training_num = len(src2month['linux'])
  23. trainXdict = dict()
  24. trainYdict = dict()
  25. testXdict = dict()
  26. testYdict = dict()
  27. look_back = 4
  28. # create the LSTM network
  29. model = Sequential()
  30. model.add(LSTM(32, input_dim=look_back, activation ='relu', dropout_W =0.1, dropout_U =0.1))
  31. # model.add(Dense(12, init='uniform', activation='relu'))
  32. # model.add(Dense(8, init='uniform', activation='relu'))
  33. # model.add(Dense(1, init='uniform', activation='sigmoid'))
  34. # model.add(LSTM(4, input_dim=look_back-6, dropout_W = 0.2, dropout_U = 0.1))
  35. model.add(Dense(1))
  36. model.compile(loss='mean_squared_error', optimizer='adam')
  37. scaler = MinMaxScaler(feature_range=(0, 1))
  38. flag = True
  39. ###################################################################################################
  40. for pkg_name in src2month:
  41. # for pkg_name in ['icedove', 'mysql', 'xulrunner', 'wireshark', 'firefox', 'openjdk', 'php5', 'iceape', 'wordpress', 'xen', 'openssl', 'chromium-browser', 'linux']:
  42. # for pkg_name in ['linux']:
  43. pkg_num = len(src2month)
  44. dataset = src2month[pkg_name]
  45. if sum(dataset)>20:
  46. dataset = pandas.rolling_mean(dataset, window=12)
  47. dataset = dataset[12:]
  48. # normalize the dataset
  49. dataset = scaler.fit_transform(dataset)
  50. train_size = int(len(dataset) * 0.80)
  51. test_size = len(dataset) - train_size
  52. train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
  53. print(len(train), len(test))
  54. # reshape into X=t and Y=t+1
  55. trainX, trainY = create_dataset(train, look_back)
  56. testX, testY = create_dataset(test, look_back)
  57. # reshape input to be [samples, time steps, features]
  58. trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
  59. testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
  60. # save to dict for later
  61. trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
  62. testXdict[pkg_name], testYdict[pkg_name] = testX, testY
  63. # fit the LSTM network
  64. model.fit(trainX, trainY, nb_epoch=10, batch_size=1, verbose=2)
  65. ###################################################################################################
  66. model.save('all_packages_test.h5')
  67. for pkg_name in ['icedove', 'mysql', 'xulrunner', 'wireshark', 'firefox', 'openjdk', 'php5', 'iceape', 'wordpress', 'xen', 'openssl', 'chromium-browser', 'linux']:
  68. trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
  69. testX, testY = testXdict[pkg_name], testYdict[pkg_name]
  70. dataset = src2month[pkg_name]
  71. dataset = pandas.rolling_mean(dataset, window=12)
  72. dataset = dataset[12:]
  73. # normalize the dataset
  74. dataset = scaler.fit_transform(dataset)
  75. # make predictions
  76. trainPredict = model.predict(trainX)
  77. testPredict = model.predict(testX)
  78. # invert predictions
  79. trainPredict = scaler.inverse_transform(trainPredict)
  80. trainY = scaler.inverse_transform([trainY])
  81. testPredict = scaler.inverse_transform(testPredict)
  82. testY = scaler.inverse_transform([testY])
  83. # calculate root mean squared error
  84. print('Package: ' + pkg_name)
  85. trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
  86. print('Train Score: %.2f RMSE' % (trainScore))
  87. testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
  88. print('Test Score: %.2f RMSE' % (testScore))
  89. # shift train predictions for plotting
  90. trainPredictPlot = numpy.empty_like(dataset)
  91. trainPredictPlot[:] = numpy.nan
  92. trainPredictPlot[look_back:len(trainPredict)+look_back] = trainPredict[:, 0]
  93. # shift test predictions for plotting
  94. testPredictPlot = numpy.empty_like(dataset)
  95. testPredictPlot[:] = numpy.nan
  96. testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1] = testPredict[:, 0]
  97. # plot baseline and predictions
  98. plt.plot(scaler.inverse_transform(dataset))
  99. plt.plot(trainPredictPlot)
  100. plt.plot(testPredictPlot)
  101. plt.show()