123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265 |
import numpy
import matplotlib.pyplot as plt
import pandas
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# NOTE(review): Activation, Dropout and load_model are not used in this view
# of the file — they may be used elsewhere, so they are kept.
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Fix numpy's global RNG so runs (weight init, any numpy sampling) are reproducible.
numpy.random.seed(7)
# convert an array of values into a dataset matrix
def create_dataset(dataset, meta, look_back=1):
    """Turn a 1-D series into (window, next value) supervised-learning pairs.

    For each position i, the input sample is the `look_back` consecutive
    values dataset[i:i+look_back] (each wrapped in its own single-feature
    list) and the target is dataset[i+look_back].

    Note: `meta` is accepted but currently unused (the append of metadata
    per timestep is disabled).  The loop bound deliberately stops one
    window early (`- look_back - 1`); downstream batch-size constants in
    this file depend on that exact sample count.

    Returns a pair of numpy arrays: X with shape
    (len(dataset) - look_back - 1, look_back, 1) and the matching 1-D y.
    """
    windows = []
    targets = []
    sample_count = len(dataset) - look_back - 1
    for start in range(sample_count):
        # One single-feature column per timestep in the window.
        window = [[value] for value in dataset[start:start + look_back]]
        windows.append(window)
        targets.append(dataset[start + look_back])
    return numpy.array(windows), numpy.array(targets)
def predict(src2month, src2sloccount, src2pop, src2deps):
    """Train a stateful LSTM on per-month vulnerability counts and plot predictions.

    Parameters (shapes inferred from usage here — confirm against callers):
      src2month     -- dict: package name -> per-month count series
                       (numpy-array-like; concatenated/sliced below)
      src2sloccount -- dict: package name -> sequence whose [0] item is a
                       total-SLOC figure
      src2pop       -- dict: package name -> sequence whose [1] item is a
                       popularity "vote" figure
      src2deps      -- unused in this function

    Side effects: fits a Keras model, saves it to an .h5 file, opens two
    output text files, and shows one matplotlib plot per processed package.
    Returns None.
    """

    ## Number of features per timestep fed to the LSTM (metadata append is
    ## disabled inside create_dataset, so only the raw count is used).
    feat_num = 1

    ## Number of steps in the future to predict - also affects smoothing
    num_steps = 6
    pkg_num = len(src2month)
    # NOTE(review): likely meant len(src2month['linux']) - 8. As written the
    # "- 8" is inside len(), so (for a numpy array) 8 is subtracted
    # elementwise and the length is unchanged. training_num is never used
    # below, so the misplaced paren is currently harmless.
    training_num = len(src2month['linux']-8)
    trainXdict = dict()
    trainYdict = dict()
    testXdict = dict()
    testYdict = dict()
    # Sliding-window length handed to create_dataset.
    look_back = 3
    # Split sizes derived from the 'linux' series length; presumably all
    # series share that length — TODO confirm.
    train_size = int(len(src2month['linux']) - 20)
    test_size = len(src2month['linux']) - train_size
    # Fixed batch size for the stateful LSTM. The constants are arranged so
    # this equals the number of windows create_dataset emits for the
    # training split (after the -8 trim and the rolling-mean trim below).
    batch_num = train_size - num_steps - look_back - 9
    print("batch_num:")
    print(batch_num)

    # create the LSTM network
    # NOTE(review): dropout_W and nb_epoch (below) are Keras 1.x API names;
    # Keras 2 renamed them to dropout and epochs.
    model = Sequential()
    model.add(LSTM(20, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout_W=0.5, stateful=True))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # scaler: fitted over every package's count series; scaler2/scaler3:
    # fitted over the popularity and SLOC metadata respectively.
    scaler = MinMaxScaler(feature_range=(0, 1))

    scaler2 = MinMaxScaler(feature_range=(0,1))
    scaler3 = MinMaxScaler(feature_range=(0,1))

    test_scale = []
    for i in src2month:
        test_scale = numpy.concatenate((test_scale, src2month[i]))
        # for j in src2month[i]:
        # test_scale.append(src2month[i][j])
    # NOTE(review): modern scikit-learn rejects 1-D input to fit/transform;
    # it expects shape (n_samples, n_features).
    scaler.fit(test_scale)

    test_scale = []
    for i in src2pop:
        test_scale.append(src2pop[i][1])
    scaler2.fit(test_scale)
    test_scale = []

    for i in src2sloccount:
        test_scale.append(src2sloccount[i][0])

    scaler3.fit(test_scale)
    # Accumulators for a disabled "train on all packages at once" path.
    total_trainX = []
    total_trainY = []
    flag = True  # NOTE(review): never read below.
    ###################################################################################################
    # Training pass: build windowed datasets per package and fit the model.
    for i in range(0,1):
        # for pkg_name in src2month:
        # for pkg_name in ['chromium-browser']:
        for pkg_name in ['linux']:
            pkg_num = len(src2month)
            dataset = src2month[pkg_name]
            # Drop the last 8 months (presumably incomplete data — confirm).
            dataset = dataset[:len(dataset)-8]
            print(len(dataset))
            #
            # Only model packages with a non-trivial vulnerability history.
            if sum(dataset)>50:
                #
                # NOTE(review): pandas.rolling_mean was removed in pandas 0.18;
                # modern spelling: pandas.Series(dataset).rolling(num_steps).mean().
                dataset = pandas.rolling_mean(dataset, window=num_steps)
                ## ommit for rolling mean
                # Drop the leading NaNs the rolling mean produces.
                dataset = dataset[num_steps:]
                # normalize the dataset
                dataset = scaler.transform(dataset)
                train_size = len(dataset) - 20
                test_size = len(dataset) - train_size
                train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
                print(len(train), len(test))
                # get metadata (missing packages fall back to 0)
                meta = []
                try:
                    pop_vote = src2pop[pkg_name][1]
                except KeyError:
                    pop_vote = 0
                try:
                    slocs_total = src2sloccount[pkg_name][0]
                except KeyError:
                    slocs_total = 0

                pop_vote = scaler2.transform([[pop_vote]])
                slocs_total = scaler3.transform([[slocs_total]])
                # NOTE(review): meta is passed to create_dataset but unused there.
                meta.append(pop_vote)
                meta.append(slocs_total)
                # reshape into X=t and Y=t+1
                trainX, trainY = create_dataset(train, meta, look_back)
                testX, testY = create_dataset(test, meta, look_back)
                # reshape input to be [samples, time steps, features]
                trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
                testX = numpy.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
                print(numpy.shape(trainX), numpy.shape(testX))
                #if len(total_trainX) == 0:
                # total_trainX = trainX
                # total_trainY = trainY
                #else:
                # total_trainX = numpy.concatenate((total_trainX, trainX))
                # total_trainY = numpy.concatenate((total_trainY, trainY))
                # save to dict for later
                trainXdict[pkg_name], trainYdict[pkg_name] = trainX, trainY
                testXdict[pkg_name], testYdict[pkg_name] = testX, testY

                # fit the LSTM network
                # Stateful training: one epoch at a time with the whole training
                # set as a single batch, resetting LSTM state between epochs.
                # NOTE(review): this inner `i` shadows the outer loop variable.
                for i in range (3000):
                    model.fit([trainX], [trainY], nb_epoch=1, batch_size=len(trainX), verbose=2, shuffle=False)
                    model.reset_states()

    ###################################################################################################
    # fit the LSTM network
    # model.fit([total_trainX], [total_trainY], nb_epoch=50, batch_size=1000, verbose=2)
    model.save('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    # model = load_model('all_packages_test'+str(num_steps)+ '-' + str(feat_num) + '.h5')
    # NOTE(review): both files are opened but never written to or closed in
    # this function (the writer code at the bottom is commented out).
    target = open('output5-3-lb' + str(look_back) + '.txt','w')
    target2 = open('REAL_LSTM_results' + str(look_back) + '.txt','w')
    # Evaluation pass: predict, invert scaling, score, and plot.
    # for pkg_name in ['chromium-browser']:
    for pkg_name in ['linux']:
        # for pkg_name in src2month:

        model.reset_states()
        dataset = src2month[pkg_name]
        dataset = dataset[:len(dataset)-8]
        if sum(dataset)>50:

            trainX, trainY = trainXdict[pkg_name], trainYdict[pkg_name]
            testX, testY = testXdict[pkg_name], testYdict[pkg_name]
            print(numpy.shape(trainX), numpy.shape(trainY))
            #
            # Same smoothing/normalisation as in the training pass.
            dataset = pandas.rolling_mean(dataset, window=num_steps)
            dataset = dataset[num_steps:]
            # normalize the dataset
            dataset = scaler.transform(dataset)
            # # make predictions
            trainPredict = model.predict(trainX, batch_size = batch_num)
            model.reset_states()
            # Walk-forward prediction over the test span: because the stateful
            # model demands a fixed batch of batch_num windows, the batch is
            # padded in front with the tail of the training windows, then slid
            # one step per iteration, appending the model's own most recent
            # outputs as the newest input window.
            new = []
            test_batch_complete = []
            for i in range(len(testX)):
                if new == []:
                    # First iteration: build the initial full-size batch.
                    test_batch_complete = numpy.concatenate((trainX[len(testX):],testX))
                    testPredict = model.predict(test_batch_complete, batch_size = batch_num)
                    # Keep the last 3 outputs (matches look_back == 3) to roll forward.
                    new = testPredict[len(testPredict)-3:len(testPredict)]
                    print(new)
                    model.reset_states()
                    # Strip the training-window padding from the front.
                    testPredict = testPredict[len(trainX)-len(testX):]
                    print("shape: trainPredict, shape: testPredict")
                    print(numpy.shape(trainPredict),numpy.shape(testPredict))
                else:
                    # Slide the batch one step; feed the model its own last
                    # predictions as the newest window.
                    test_batch_complete = test_batch_complete[1:]
                    print(numpy.shape(new))
                    print(numpy.shape(test_batch_complete))
                    test_batch_complete = numpy.append(test_batch_complete, [new], axis=0)
                    testPredict = model.predict(test_batch_complete, batch_size = batch_num)
                    new = testPredict[len(testPredict)-3:len(testPredict)]
                    model.reset_states()
                    testPredict = testPredict[len(trainX)-len(testX):]
                    print("shape: trainPredict, shape: testPredict")
                    print(numpy.shape(trainPredict),numpy.shape(testPredict))



            # invert predictions
            trainPredict = scaler.inverse_transform(trainPredict)
            trainY = scaler.inverse_transform([trainY])
            testPredict = scaler.inverse_transform(testPredict)
            testY = scaler.inverse_transform([testY])


            # calculate root mean squared error
            print('Package: ' + pkg_name)
            #print(trainY)
            #print(trainPredict)
            trainScore = math.sqrt(mean_squared_error(trainY[0], trainPredict[:,0]))
            print('Train Score: %.2f RMSE' % (trainScore))
            testScore = math.sqrt(mean_squared_error(testY[0], testPredict[:,0]))
            print('Test Score: %.2f RMSE' % (testScore))
            # shift train predictions for plotting
            trainPredictPlot = numpy.empty_like(dataset)
            trainPredictPlot[:] = numpy.nan
            trainPredictPlot[look_back:len(trainPredict)+look_back] = trainPredict[:, 0]
            # shift test predictions for plotting
            testPredictPlot = numpy.empty_like(dataset)
            testPredictPlot[:] = numpy.nan
            testPredictPlot[len(trainPredict)+(look_back*2)+1:len(dataset)-1] = testPredict[:, 0]
            # plot baseline and predictions
            plt.plot(scaler.inverse_transform(dataset))
            plt.plot(trainPredictPlot)
            plt.plot(testPredictPlot)
            plt.show()


            ## Calculate average for comparison
            # raw_av = src2month[pkg_name]
            # raw_av = raw_av[:len(dataset)-8]
            # i = 0

            # while(i<len(raw_av) and raw_av[i]<1 ):
            # i = i + 1
            #
            # raw_av = raw_av[i:]
            # if(len(raw_av) == 0):
            # average = 0
            # else:
            # average = num_steps * int(round(sum(raw_av) / float(len(raw_av))))
            #
            # ## Write to file
            # NOTE(review): `actual` and `predicted` are not defined anywhere in
            # this function — this block would raise NameError if re-enabled as-is.
            # if(actual != 0):
            # norm_diff = abs(actual-predicted)/actual
            # else:
            # norm_diff = actual-predicted
            # target2.write(pkg_name + ',' + str(actual) + ',' + str(predicted) + ',' + str(average) + ',' + str(norm_diff) + '\n')
|