123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- from __future__ import absolute_import
- from __future__ import division
- from __future__ import print_function
- from statsmodels.tsa.stattools import adfuller
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import statsmodels as sm
- from statsmodels.tsa.stattools import acf, pacf
- from statsmodels.tsa.arima_model import ARIMA
- from statsmodels.tsa.seasonal import seasonal_decompose
- import argparse
- import sys
- # Import data
- import tensorflow as tf
# Module-level placeholder, initialised to None. Nothing in the visible code
# assigns or reads it; presumably it is filled in by an argparse-based entry
# point elsewhere in the file -- TODO confirm.
FLAGS = None
def test_stationarity(timeseries):
    """Plot rolling statistics and print an augmented Dickey-Fuller report.

    Args:
        timeseries: One-dimensional sequence of observations. A pandas
            object is used as-is; anything else is converted to a float
            Series before the rolling statistics are computed.

    Returns:
        None. The function draws the series together with its 12-sample
        rolling mean and rolling standard deviation, shows the figure, and
        prints the test statistic, p-value, lag count, observation count and
        critical values returned by ``adfuller``.
    """
    # pd.rolling_mean / pd.rolling_std no longer exist in pandas; the
    # .rolling() accessor computes the same statistics.
    if isinstance(timeseries, (pd.Series, pd.DataFrame)):
        series = timeseries
    else:
        series = pd.Series(np.asarray(timeseries, dtype=float))
    rolling = series.rolling(window=12)
    rolmean = rolling.mean()
    rolstd = rolling.std()

    # Plot rolling statistics.
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Perform the Dickey-Fuller test.
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(
        dftest[0:4],
        index=['Test Statistic', 'p-value', '#Lags Used',
               'Number of Observations Used'])
    # dftest[4] maps each significance level to its critical value.
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
def weight_variable(shape):
    """Return a ``tf.Variable`` of the given shape with random initial values.

    The initial values come from ``tf.truncated_normal`` with a standard
    deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))
def bias_variable(shape):
    """Return a ``tf.Variable`` of the given shape filled with the value 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))
def conv2d(x, W):
    """Apply ``tf.nn.conv2d`` to ``x`` with filter ``W``.

    Uses a stride of 1 in every dimension and 'SAME' padding.
    """
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
def max_pool_2x2(x):
    """Apply ``tf.nn.max_pool`` to ``x`` with a [1, 2, 2, 1] window.

    The same [1, 2, 2, 1] value is used for both ``ksize`` and ``strides``,
    with 'SAME' padding.
    """
    pool_window = [1, 2, 2, 1]
    pool_strides = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=pool_window, strides=pool_strides,
                          padding='SAME')
def _rolling_mean(values, window):
    """Return the trailing rolling mean of ``values``, keeping its container kind.

    pandas objects come back as pandas objects; any other sequence is
    converted to a float array and a NumPy array is returned. This stands in
    for ``pd.rolling_mean``, which is no longer available in pandas.
    """
    if isinstance(values, (pd.Series, pd.DataFrame)):
        return values.rolling(window=window).mean()
    as_series = pd.Series(np.asarray(values, dtype=float))
    return as_series.rolling(window=window).mean().values


def _plot_correlogram(position, values, band, title):
    """Draw one correlogram panel with dashed guides at 0 and +/- ``band``."""
    plt.subplot(position)
    plt.plot(values)
    for level in (0, -band, band):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(title)


def predict(src2month):
    """Fit a seasonal ARIMA model to the 'linux' series and report a forecast.

    Args:
        src2month: Mapping with a 'linux' entry holding a sequence of numeric
            observations. NOTE(review): the window of 12 and the printed
            messages suggest monthly vulnerability counts -- confirm against
            the caller.

    Returns:
        None. The function prints intermediate series, the model summary and
        a comparison of actual and predicted totals, and shows two figures:
        the ACF/PACF of the differenced series, and the smoothed data
        together with the forecast.
    """
    season = 12  # rolling window, seasonal lag and hold-out length

    data = src2month['linux']
    # Hold out the last `season` observations from the training series.
    past = data[:len(data) - season]
    print(data)
    print(past)

    data_rol = _rolling_mean(data, season)
    rolmean = _rolling_mean(past, season)
    print(rolmean)
    print(len(rolmean))
    # The first `season - 1` rolling means are NaN. NOTE(review): slicing at
    # `season` also discards the first defined value; kept as in the original
    # because the hard-coded offsets below depend on it.
    past = rolmean[season:]

    # np.roll wraps around, so the first `shift` entries of each difference
    # mix the end of the series with its start and are dropped.
    first_difference = (past - np.roll(past, 1))[1:]
    seasonal_first_difference = (
        first_difference - np.roll(first_difference, season))[season:]
    print(len(first_difference))

    lag_acf = acf(seasonal_first_difference, nlags=20)
    lag_pacf = pacf(seasonal_first_difference, nlags=20, method='ols')
    # Approximate 95% band for the correlation estimates.
    band = 1.96 / np.sqrt(len(seasonal_first_difference))
    _plot_correlogram(121, lag_acf, band, 'Autocorrelation Function')
    _plot_correlogram(122, lag_pacf, band, 'Partial Autocorrelation Function')
    plt.tight_layout()
    plt.show()

    mod = sm.tsa.statespace.sarimax.SARIMAX(
        past, trend='n', order=(2, 1, 1), seasonal_order=(1, 1, 1, season))
    results = mod.fit()
    print(results.summary())

    # NOTE(review): the forecast starts at len(past) + 1, not len(past), and
    # runs for a hard-coded 102 steps -- confirm both are intended.
    forecast = results.predict(start=len(past) + 1, end=len(past) + 102,
                               dynamic=True)

    plt.figure(figsize=(12, 8))
    plt.plot(data_rol, color='blue')
    # Left-pad the forecast with 12 + 180 zeros, as the original code does.
    # NOTE(review): these pad lengths and the index ranges below are
    # hard-coded and presumably tied to one specific input length.
    pred = np.concatenate((np.zeros(12), np.zeros(180), forecast))
    plt.plot(pred, color='green')
    print(len(data), len(past), len(pred))

    reality = sum(data[193:205])
    average = sum(data[181:193])
    # `pred` holds 12-sample rolling means, so a yearly total is this value
    # multiplied by 12.
    predicted = pred[203]
    print('Actual vulnerabilities in 2016: ' + str(reality))
    print('Number of vulnerabilities in 2015: ' + str(average))
    print('Predicted vulnerabilities for 2016: ' + str(predicted * 12))
    print('Prediction error: ' + str(reality - predicted * 12))
    print('Difference from previous year: ' + str(reality - average))
    plt.show()
|