import math

import numpy
import pandas
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

numpy.random.seed(7)


# convert an array of values into a dataset matrix of sliding windows
def create_dataset(dataset, meta, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back - 1):
        a = []
        for j in range(i, i + look_back):
            # a.append([dataset[j]] + meta)
            a.append([dataset[j]])
        dataX.append(a)
        dataY.append(dataset[i + look_back])
    return numpy.array(dataX), numpy.array(dataY)


def predict(src2month, src2sloccount, src2pop, src2deps):
    ## Number of features
    feat_num = 1
    ## Number of steps in the future to predict - also affects smoothing
    num_steps = 6

    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 8

    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()

    look_back = 3
    train_size = int(len(src2month['linux']) - 20)
    test_size = len(src2month['linux']) - train_size
    batch_num = train_size - num_steps - look_back - 9
    print("batch_num:")
    print(batch_num)

    # create the stateful LSTM network (dropout_W and nb_epoch below are
    # Keras 1.x-era argument names)
    model = Sequential()
    model.add(LSTM(20, batch_input_shape=(batch_num, look_back, feat_num),
                   activation='relu', dropout_W=0.5, stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # one scaler per data source: vulnerability counts, popularity, SLOC
    # (fitting on 1-D arrays relies on older scikit-learn behaviour)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler2 = MinMaxScaler(feature_range=(0, 1))
    scaler3 = MinMaxScaler(feature_range=(0, 1))

    test_scale = []
    for i in src2month:
        test_scale = numpy.concatenate((test_scale, src2month[i]))
        # for j in src2month[i]:
        #     test_scale.append(src2month[i][j])
    scaler.fit(test_scale)

    test_scale = []
    for i in src2pop:
        test_scale.append(src2pop[i][1])
    scaler2.fit(test_scale)

    test_scale = []
    for i in src2sloccount:
        test_scale.append(src2sloccount[i][0])
    scaler3.fit(test_scale)

    total_trainX = []
    total_trainY = []
    flag = True

    ###################################################################################################
    for i in range(0, 1):
        # for pkg_name in src2month:
        # for pkg_name in ['chromium-browser']:
        for pkg_name in ['linux']:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            dataset = dataset[:len(dataset) - 8]
            print(len(dataset))
            # if sum(dataset) > 50:
            # dataset = pandas.rolling_mean(dataset, window=num_steps)  ## omit for rolling mean
            dataset = dataset[num_steps:]
            # normalize the dataset
            dataset = scaler.transform(dataset)
            train_size = len(dataset) - 20
            test_size = len(dataset) - train_size
            train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
            print(len(train), len(test))

            # get metadata (popularity votes and total SLOC, scaled to [0, 1])
            meta = []
            try:
                pop_vote = src2pop[pkg_name][1]
            except KeyError:
                pop_vote = 0
            try:
                slocs_total = src2sloccount[pkg_name][0]
            except KeyError:
                slocs_total = 0

            pop_vote = scaler2.transform([[pop_vote]])
            slocs_total = scaler3.transform([[slocs_total]])

            meta.append(pop_vote)
            meta.append(slocs_total)

            # reshape into X=t and Y=t+1
            trainX, trainY = create_dataset(train, meta, look_back)
            testX, testY = create_dataset(test, meta, look_back)
            # reshape input to be [samples, time steps, features]
            trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
            testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
            print(numpy.shape(trainX), numpy.shape(testX))
            # if len(total_trainX) == 0:
            #     total_trainX = trainX
            #     total_trainY = trainY
            # else:
            #     total_trainX = numpy.concatenate((total_trainX, trainX))
            #     total_trainY = numpy.concatenate((total_trainY, trainY))

            # save to dict for later
            trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
            testXdict[pkg_name], testYdict[pkg_name] = testX, testY

            # fit the stateful LSTM network one epoch at a time, resetting
            # the hidden state between passes over the training batch
            for epoch in range(3000):
                model.fit([trainX], [trainY], nb_epoch=1, batch_size=len(trainX),
                          verbose=2, shuffle=False)
                model.reset_states()

    ###################################################################################################
    # fit the LSTM network
    # model.fit([total_trainX], [total_trainY], nb_epoch=50, batch_size=1000, verbose=2)

    model.save('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test' + str(num_steps) + '-' + str(feat_num) + '.h5')

    target = open('output5-3-lb' + str(look_back) + '.txt', 'w')
    target2 = open('REAL_LSTM_results' + str(look_back) + '.txt', 'w')

    # for pkg_name in ['chromium-browser']:
    # for pkg_name in src2month:
    for pkg_name in ['linux']:
        model.reset_states()
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset) - 8]
        if sum(dataset) > 50:
            trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            print(numpy.shape(trainX), numpy.shape(trainY))
            # dataset = pandas.rolling_mean(dataset, window=num_steps)
            dataset = dataset[num_steps:]
            # normalize the dataset
            dataset = scaler.transform(dataset)

            # make predictions on the training windows
            trainPredict = model.predict(trainX, batch_size=batch_num)
            model.reset_states()

            # roll the prediction window over the test set: the first pass
            # pads the batch with the tail of the training windows, later
            # passes feed the latest predictions back in as input
            new = []
            test_batch_complete = []
            for i in range(len(testX)):
                if new == []:
                    test_batch_complete = numpy.concatenate((trainX[len(testX):], testX))
                    testPredict = model.predict(test_batch_complete, batch_size=batch_num)
                    new = testPredict[len(testPredict) - 3:len(testPredict)]
                    print(new)
                    model.reset_states()
                    testPredict = testPredict[len(trainX) - len(testX):]
                    print("shape: trainPredict, shape: testPredict")
                    print(numpy.shape(trainPredict), numpy.shape(testPredict))
                else:
                    test_batch_complete = test_batch_complete[1:]
                    print(numpy.shape(new))
                    print(numpy.shape(test_batch_complete))
                    test_batch_complete = numpy.append(test_batch_complete, [new], axis=0)
                    testPredict = model.predict(test_batch_complete, batch_size=batch_num)
                    new = testPredict[len(testPredict) - 3:len(testPredict)]
                    model.reset_states()
                    testPredict = testPredict[len(trainX) - len(testX):]
                    print("shape: trainPredict, shape: testPredict")
                    print(numpy.shape(trainPredict), numpy.shape(testPredict))

            # invert predictions back to vulnerability counts
            trainPredict = scaler.inverse_transform(trainPredict)
            trainY = scaler.inverse_transform([trainY])
            testPredict = scaler.inverse_transform(testPredict)
            testY = scaler.inverse_transform([testY])

            # calculate root mean squared error
            print('Package: ' + pkg_name)
            # print(trainY)
            # print(trainPredict)
            trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:, 0]))
            print('Train Score: %.2f RMSE' % (trainScore))
            testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:, 0]))
            print('Test Score: %.2f RMSE' % (testScore))

            # shift train predictions for plotting
            trainPredictPlot = numpy.empty_like(dataset)
            trainPredictPlot[:] = numpy.nan
            trainPredictPlot[look_back:len(trainPredict) + look_back] = trainPredict[:, 0]
            # shift test predictions for plotting
            testPredictPlot = numpy.empty_like(dataset)
            testPredictPlot[:] = numpy.nan
            testPredictPlot[len(trainPredict) + (look_back * 2) + 1:len(dataset) - 1] = testPredict[:, 0]
            # plot baseline and predictions
            plt.plot(scaler.inverse_transform(dataset))
            plt.plot(trainPredictPlot)
            plt.plot(testPredictPlot)
            plt.show()
            ## Calculate average for comparison
            # raw_av = src2month[pkg_name]
            # raw_av = raw_av[:len(dataset)-8]
            # i = 0
            # while(i
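

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original script). The dictionaries
# below are toy stand-ins invented here only to illustrate the shapes the
# code above appears to expect: src2month maps a source package to its
# monthly vulnerability counts, src2sloccount[pkg][0] is a total SLOC
# figure, and src2pop[pkg][1] is a popularity vote count. Real inputs come
# from the surrounding project; the call to predict() is left commented out
# because it would build and train the full LSTM.
if __name__ == '__main__':
    toy_counts = numpy.random.poisson(lam=2.0, size=60).astype(float)
    toy_src2month = {'linux': toy_counts}        # package -> monthly counts
    toy_src2sloccount = {'linux': (15000000,)}   # package -> (total SLOC, ...), assumed layout
    toy_src2pop = {'linux': (0, 90000)}          # package -> (..., vote count), assumed layout
    toy_src2deps = {}                            # unused by the code above

    # Demonstrate the sliding-window reshaping on its own: 60 monthly counts
    # with look_back=3 yield 56 windows of shape (3, 1) and 56 targets.
    X, y = create_dataset(toy_counts, meta=[], look_back=3)
    print(X.shape, y.shape)   # (56, 3, 1) (56,)

    # predict(toy_src2month, toy_src2sloccount, toy_src2pop, toy_src2deps)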