machine_learning.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import tensorflow as tf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import acf, adfuller, pacf

FLAGS = None
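
# Plot the rolling mean/std of a series (expected as a pandas Series) and run
# the augmented Dickey-Fuller test to check whether it is stationary.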
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=12).mean()
    rolstd = timeseries.rolling(window=12).std()

    # Plot rolling statistics
    plt.plot(timeseries, color='blue', label='Original')
    plt.plot(rolmean, color='red', label='Rolling Mean')
    plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show()

    # Perform the augmented Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value',
                                             '#Lags Used',
                                             'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
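
# TensorFlow 1.x-style helpers for building a small CNN: truncated-normal
# weight initialisation, constant bias initialisation, stride-1 'SAME'
# convolution, and 2x2 max pooling.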
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
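
# Forecast monthly vulnerability counts for the 'linux' package: smooth the
# series with a 12-month rolling mean, difference it for the diagnostics,
# fit a seasonal ARIMA model, and compare the forecast for the held-out final
# year against the actual counts.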
def predict(src2month):
    df = dict()
    pkg_num = len(src2month)
    training_num = len(src2month['linux']) - 12

    # Monthly vulnerability counts for the 'linux' package; the last 12 months
    # are held out as the evaluation period.
    data = src2month['linux']
    past = data[:len(data) - 12]
    print(data)
    print(past)

    # 12-month rolling means to smooth out monthly noise
    data_rol = pd.Series(data).rolling(window=12).mean().values
    rolmean = pd.Series(past).rolling(window=12).mean().values
    print(rolmean)
    print(len(rolmean))

    # Skip the first 12 entries (the rolling-window warm-up period)
    past = rolmean[12:]
    decomposition = seasonal_decompose(past, period=12)
    # Optional: plot the seasonal decomposition
    # fig = plt.figure()
    # fig = decomposition.plot()
    # fig.set_size_inches(15, 8)
    # print(np.roll(past, 1))

    # First, seasonal, and seasonal-first differences of the smoothed series,
    # trimmed to drop the wrap-around values introduced by np.roll
    df['first_difference'] = past - np.roll(past, 1)
    df['first_difference'] = df['first_difference'][1:]
    df['seasonal_difference'] = past - np.roll(past, 12)
    df['seasonal_difference'] = df['seasonal_difference'][12:]
    df['seasonal_first_difference'] = df['first_difference'] - np.roll(df['first_difference'], 12)
    df['seasonal_first_difference'] = df['seasonal_first_difference'][12:]
    print(len(df['first_difference']))

    # Optional stationarity checks and ACF/PACF diagnostic plots
    # test_stationarity(past)
    # test_stationarity(df['first_difference'])
    # test_stationarity(df['seasonal_difference'])
    # test_stationarity(df['seasonal_first_difference'])
    # fig = plt.figure(figsize=(12, 8))
    # ax1 = fig.add_subplot(211)
    # fig = sm.graphics.tsa.plot_acf(df['seasonal_first_difference'], lags=40, ax=ax1)
    # ax2 = fig.add_subplot(212)
    # fig = sm.graphics.tsa.plot_pacf(df['seasonal_first_difference'], lags=40, ax=ax2)
    # ACF and PACF of the seasonally first-differenced series, with approximate
    # 95% confidence bounds at +/- 1.96 / sqrt(n)
    lag_acf = acf(df['seasonal_first_difference'], nlags=20)
    lag_pacf = pacf(df['seasonal_first_difference'], nlags=20, method='ols')
    conf_bound = 1.96 / np.sqrt(len(df['seasonal_first_difference']))

    # Plot ACF
    plt.subplot(121)
    plt.plot(lag_acf)
    plt.axhline(y=0, linestyle='--', color='gray')
    plt.axhline(y=-conf_bound, linestyle='--', color='gray')
    plt.axhline(y=conf_bound, linestyle='--', color='gray')
    plt.title('Autocorrelation Function')

    # Plot PACF
    plt.subplot(122)
    plt.plot(lag_pacf)
    plt.axhline(y=0, linestyle='--', color='gray')
    plt.axhline(y=-conf_bound, linestyle='--', color='gray')
    plt.axhline(y=conf_bound, linestyle='--', color='gray')
    plt.title('Partial Autocorrelation Function')

    plt.tight_layout()
    plt.show()
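
    # Seasonal ARIMA on the smoothed training series: non-seasonal order
    # (p, d, q) = (2, 1, 1) and seasonal order (P, D, Q, s) = (1, 1, 1, 12).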
    mod = sm.tsa.statespace.SARIMAX(past, trend='n', order=(2, 1, 1),
                                    seasonal_order=(1, 1, 1, 12))
    results = mod.fit()
    print(results.summary())

    # Out-of-sample forecast, zero-padded so it can be plotted on the same
    # axis as the rolling-mean series
    df['forecast'] = results.predict(start=len(past) + 1, end=len(past) + 102, dynamic=True)
    pred = np.concatenate((np.zeros(180), df['forecast']))

    plt.figure(figsize=(12, 8))
    plt.plot(data_rol, color='blue')
    pred = np.concatenate((np.zeros(12), pred))
    plt.plot(pred, color='green')
    print(len(data), len(past), len(pred))

    # Compare the forecast against the held-out year using hard-coded month
    # offsets: data[193:205] covers 2016 and data[181:193] covers 2015.
    reality = sum(data[193:205])
    average = sum(data[181:193])
    # pred holds 12-month rolling means, so one entry * 12 estimates a yearly total
    predicted = pred[203]
    print('Actual vulnerabilities in 2016: ' + str(reality))
    print('Number of vulnerabilities in 2015: ' + str(average))
    print('Predicted vulnerabilities for 2016: ' + str(predicted * 12))
    print('Prediction error: ' + str(reality - predicted * 12))
    print('Difference from previous year: ' + str(reality - average))
    plt.show()