import numpy as np
import matplotlib.pyplot as plt
import pandas
import math
import sys
from keras import backend as K
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import GRU
from keras import optimizers as opt
from keras.layers import Dense
from keras.layers import Activation, Dropout
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import copy
import random
import arimaPred as arim
import paper_plots as carlosplt

np.random.seed(7)

# Tolerance used by test_model to cross-check the inverse-transformed target
# series against the raw (inverse-transformed) dataset.
EPSILON = 10 ** -12


def moving_average(a, n):
    """Return the n-point trailing moving average of `a`.

    The first n-1 output positions are zero-padded so the result has the
    same length as the input.  (Cumulative-sum technique; code from Jaime.)

    :param a: 1-D array-like of numbers.
    :param n: window length (n >= 1).
    :returns: numpy array, same length as `a`.
    """
    ret = np.cumsum(a, dtype=float)
    # Difference of cumulative sums gives each window's sum in O(len(a)).
    ret[n:] = ret[n:] - ret[:-n]
    # Zero out the first n-1 slots where no full window exists yet.
    ret = np.concatenate((np.zeros(n - 1), ret[n - 1:]))
    return ret / n


# convert an array of values into a dataset matrix
# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
def create_dataset(original_dataset, dataset, num_steps, look_back=1, feat_num=1, extra_features=[]):
    """Build (X, y) training pairs for the sequence model.

    Each X sample is `look_back` consecutive points of `original_dataset`
    (optionally concatenated with `extra_features` when feat_num > 1); each
    y value is the mean of the `num_steps` points that follow the window.

    NOTE(review): when feat_num > 1, `extra_features` must be a NumPy array
    (``.tolist()`` is called on it), not the default list — confirm callers.

    :param original_dataset: raw (unsmoothed) series, indexable sequence.
    :param dataset: smoothed series; only its length drives the loop bound.
    :param num_steps: horizon (months) averaged into each target value.
    :param look_back: window length per sample.
    :param feat_num: total feature count; >1 enables `extra_features`.
    :param extra_features: static features appended to every timestep.
    :returns: tuple (np.array X, np.array y).
    """
    dataX, dataY = [], []
    print('Making training length :', len(original_dataset), ' ; ', len(dataset))
    for i in range(len(dataset) - look_back - num_steps + 1):
        a = []
        for j in range(i, i + look_back):
            if feat_num > 1:
                a.append([original_dataset[j]] + extra_features.tolist())
            else:
                a.append([original_dataset[j]])
        dataX.append(a)
        # Target: mean of the next num_steps raw values after the window.
        mean = 0
        for j in range(num_steps):
            mean += original_dataset[i + look_back + j]
        dataY.append(mean / num_steps)
    return np.array(dataX), np.array(dataY)


def test_model(pkg_name, original_dataset, dataset, scaler, num_steps, smoothing, batch_num, look_back, test_size, mixed, feat_num=1, extra_features=[]):
    """Load a saved model and predict over the tail of the series.

    ## mixed is a boolean. When True, the model trained on all the packages
    ## is used. When false, each package has its own model.

    :param pkg_name: package whose per-package model file is loaded
                     (ignored when `mixed` is True).
    :param original_dataset: raw series fed to create_dataset.
    :param dataset: smoothed, scaled series (numpy array — it is reshaped
                    and inverse-transformed below).
    :param scaler: fitted scaler whose inverse_transform undoes the scaling.
    :param num_steps: prediction horizon, also part of the model filename.
    :param smoothing: unused here beyond the call signature — kept for
                      interface compatibility with callers.
    :param batch_num: batch size; only the last `batch_num` samples are
                      predicted.
    :returns: list, zero-padded at the front to len(dataset), whose tail is
              the inverse-transformed model predictions.
    """
    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_features)
    # Release any previous TF graph before loading the next model.
    K.clear_session()
    if mixed:
        new_model = load_model('./models/all_packages' + str(num_steps) + '.h5')
    else:
        new_model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
    new_model.reset_states()
    print(len(totalX))
    # Predict only the final batch_num windows.
    totalPredict = new_model.predict(totalX[len(totalX) - batch_num:], batch_size=batch_num)
    del new_model
    #scaler = scalers[pkg_name]
    totalPredict = scaler.inverse_transform(totalPredict)
    totalPredict = totalPredict.flatten()
    # Left-pad with zeros so the prediction aligns with the series tail.
    total_LSTM = [0] * (len(dataset) - len(totalPredict)) + totalPredict.tolist()
    #print(len(total_LSTM))
    #print(total_LSTM)
    #totalY = totalY[18:]
    totalY = totalY.reshape(-1, 1)
    totalY = scaler.inverse_transform(totalY)
    totalY = totalY.flatten()
    temp = dataset.reshape(-1, 1)
    temp = scaler.inverse_transform(temp)
    temp = temp.flatten()
    # Sanity check: the last target and the last raw value must agree after
    # inverse scaling; a mismatch indicates misaligned data.
    if (math.fabs(totalY[-1] - temp[-1]) > EPSILON):
        print("Possible fault!!", totalY[-1], temp[-1])
    return total_LSTM[:]


# ---------------------------------------------------------------------------
# NOTE(review): the source text is corrupted from this point on.  The body of
# predict_stationary breaks off mid-statement ("... for month in data: if i")
# and the text resumes in the middle of a different, unnamed driver function
# ("50): pkglist.append(pkg)") whose `def` line is lost — the bare `return`
# at the end of the file proves the tail is function-internal.  Several
# helpers referenced in that tail (predict_average, find_best_Lamd,
# load_dataset_smoothed, print_all_pred, print_all, do_training_errors,
# do_testing_errors, normalize_errors, calculate_rmse, calculate_mean,
# print_summary, predict_LSTM, predict_LSTM_all) are also missing from the
# visible text, as is the definition of `src2month`.  Recover the lost span
# from version control; the surviving residue is preserved below, commented
# out and re-indented, so no information is lost.
# ---------------------------------------------------------------------------
# def predict_stationary(original, dataset, num_steps):
#     prediction = dict()
#     for pkg in dataset:
#         data = dataset[pkg]
#         pred = []
#         i = 0
#         for month in data:
#             if i  <-- text breaks off here; corrupted residue continues:
# 50):
#     pkglist.append(pkg)
#     #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'wireshark', 'openjdk-8', 'mysql-transitional', 'php7.0', 'imagemagick', 'tcpdump']
#     #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'openjdk-8']
#     pkglist = ['icedove']
#     first = pkglist[0]
#     #pkglist = [first]
#     # Number of months in the future
#     num_steps = 9
#     smoothing = num_steps
#     # Test dataset size in months
#     test_size = 18
#     real_test_size = 18
#     do_train = False
#     dataset = dict()
#     # Cut out end of 2018
#     for pkg in pkglist:
#         ## This is for training
#         if do_train:
#             src2month[pkg] = src2month[pkg][:-9-real_test_size]
#         ## This is for experiments
#         else:
#             src2month[pkg] = src2month[pkg][test_size:-9]
#     (original,dataset) = load_dataset_smoothed(src2month, pkglist, smoothing)
#     # Each point of dataset is the mean of the same point and the previous smoothing-1 of the original
#     num_packages = len(pkglist)
#     # Print all smoothed time-series
#     #print_all(dataset, pkglist)
#     ## Make simple predictions (stationary, average, waverage)
#     predictions_list = dict()
#     predictions_list['stationary'] = predict_stationary(original, dataset, num_steps)
#     predictions_list['average'] = predict_average(original, dataset, num_steps)
#     predictions_list['Waverage'] = find_best_Lamd(original, dataset, num_steps, test_size)
#     #predictions_list['LSTM'] = predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train)
#     #predictions_list['LSTM_all'] = predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train)
#     #print_all_pred(dataset, predictions_list['LSTM'], pkglist, test_size)
#     #pkglist_new=['linux','firefox-esr', 'chromium-browser', 'icedove']
#     pkglist_new = pkglist
#     print_all_pred(dataset, predictions_list['Waverage'], pkglist_new, test_size)
#     training_errors = dict()  ## Dictionary of training errors e.g. training_errors['LSTM']['linux'] = XXX
#     testing_errors = dict()  ## Same for testing errors
#     new_predictions_list = dict()
#     ## For which packages to compute the error?
#     for method in predictions_list:
#         new_predictions_list[method] = dict()
#         for pkg in predictions_list[method]:
#             if (sum(src2month[pkg])>200):
#                 new_predictions_list[method][pkg] = predictions_list[method][pkg]
#     print(new_predictions_list)
#     do_training_errors(new_predictions_list, training_errors, dataset, test_size)
#     do_testing_errors(new_predictions_list, testing_errors, dataset, test_size)
#     ## Now among the packages again rmse. But first we normalize. Choose whether we want this or not
#     for method in training_errors:
#         normalize_errors(training_errors[method], pkglist, dataset, test_size)
#         normalize_errors(testing_errors[method], pkglist, dataset, test_size)
#     for pkg in training_errors['average']:
#         print('#'*80)
#         print(pkg)
#         print('Training errors:')
#         temp_list = []
#         for method in training_errors:
#             string = method + ': ' + str(training_errors[method][pkg]) + ' , '
#             temp_list.append(string)
#         print(temp_list)
#         temp_list = []
#         for method in training_errors:
#             string = method + ': ' + str(testing_errors[method][pkg])
#             temp_list.append(string)
#         print('Testing errors:')
#         print(temp_list)
#     ## Now it is time for the rmse among the packages
#     for method in testing_errors:
#         testing_errors[method]['rmse'] = calculate_rmse(testing_errors[method])
#         testing_errors[method]['mean'] = calculate_mean(testing_errors[method])
#         training_errors[method]['rmse'] = calculate_rmse(training_errors[method])
#         training_errors[method]['mean'] = calculate_mean(training_errors[method])
#     print_summary(training_errors, testing_errors)
#     return