
small changes to machine_learning: add a first TensorFlow model in machine_learning.py and feed it per-package monthly CVE counts (src2month) collected in apt-sec.py

Nikolaos Alexopoulos, 7 years ago
commit 9cd78ac5db
3 changed files with 159 additions and 24 deletions
  1. apt-sec.py (+53 -14)
  2. machine_learning.py (+81 -0)
  3. output.txt (+25 -10)

+ 53 - 14
apt-sec.py

@@ -19,6 +19,7 @@ import numpy as np
 from dateutil import parser
 import plotly.plotly as py
 import plotly.graph_objs as go
+import machine_learning as ml
 
 logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
 ## Increase the recursion limit by much to allow bs to parse large files
@@ -367,7 +368,7 @@ def updateCVETables(myid, dsatable, state, src2dsa, dsa2cve, cvetable, client):
 
 ###############################################################################
 ## Check for updates on Package information
-def aptsec_update(state, config, dsatable, client, src2dsa, dsa2cve, cvetable):
+def aptsec_update(state, config, dsatable, client, src2dsa, dsa2cve, src2month, cvetable):
     args = sys.argv
 #    if not('--offline' in args):
 #        fetchMeta('Packages')
@@ -402,7 +403,7 @@ def aptsec_update(state, config, dsatable, client, src2dsa, dsa2cve, cvetable):
     
     # recompute all pkg statistics
     for srcpkg in src2dsa:
-        processCVEs(srcpkg, now, src2dsa, dsa2cve, cvetable, config)
+        processCVEs(srcpkg, now, src2dsa, dsa2cve, src2month, cvetable, config)
     
     return 0
 ###############################################################################
@@ -423,7 +424,7 @@ def resolvePkg2Src(pkglist, pkg2src):
 ## compute and store MTBF, MTBR and Scores of each src pkg
 ## output: %src2mtbf:
 ##  (srcpkg=> ())
-def processCVEs(pkg, now, src2dsa, dsa2cve, cvetable, config):
+def processCVEs(pkg, now, src2dsa, dsa2cve, src2month, cvetable, config):
     stats = [now, 0, 0, 0, 0, 0, 0]
     mylambda = config['TRUST']['lambda']
     cvestats = dict()
@@ -465,17 +466,53 @@ def processCVEs(pkg, now, src2dsa, dsa2cve, cvetable, config):
     count = sum(cvestats.values())
 
     print(pkg + ' ' + str(count))
-    if pkg == 'chromium-browser':
-        print(src2dsa[pkg])
-        pkg_plot(pkg, cvestats)
+#    if pkg == 'chromium-browser':
+#        print(src2dsa[pkg])
+#        pkg_plot(pkg, cvestats)
+
+    format_data(pkg, cvestats, src2month)
 
-    for date in dates:
-        pass
-        ## Need to do compute value
 
     ##TODO Code to compute trust goes here
 
 
+###############################################################################
+## format vulnerability data into monthly intervals, suitable for tensorflow
+def format_data(pkg, cvestats, src2month):
+    
+    x = []
+    y = []
+    monthyear = []
+    year = []
+
+    # sort the (date, count) pairs chronologically
+    items = list(cvestats.items())
+    items.sort(key=lambda tup: tup[0])
+
+    # split into parsed dates (x) and vulnerability counts (y)
+    for data_dict in items:
+        x.append(parser.parse(data_dict[0]))
+        y.append(int(data_dict[1]))
+
+    # one bucket per month for the years 2000-2016
+    for i in range(2000, 2017):
+        temp = []
+        for j in range(12):
+            temp.append(0)
+        monthyear.append(temp)
+
+    # accumulate each count into its (year, month) bucket
+    for i in range(len(x)):
+        monthyear[x[i].year-2000][x[i].month-1] += y[i]
+
+    # flatten into a single chronological list of monthly totals
+    months_list = [item for sublist in monthyear for item in sublist]
+
+    temp_months = np.zeros(len(months_list))
+    i = 0
+    for element in months_list:
+        temp_months[i] = np.float32(element)
+        i += 1
+
+    src2month[pkg] = temp_months
+    return
+    
+
 ###############################################################################
 ## plot vulnerability time distribution for a single package
 def pkg_plot(pkg, cvestats):
@@ -494,7 +531,7 @@ def pkg_plot(pkg, cvestats):
     monthyear = []
     year = []
     # initialize list
-    for i in range(1995,2017):
+    for i in range(2000,2017):
         temp = []
         for j in range(12):
             temp.append(0)
@@ -502,7 +539,7 @@ def pkg_plot(pkg, cvestats):
 
     for i in range(len(x)):
 #        print(str(x[i].year) + str(x[i].month))
-        monthyear[x[i].year-1995][x[i].month-1] += y[i]
+        monthyear[x[i].year-2000][x[i].month-1] += y[i]
     newx = []
     yearsx = []
     year = []
@@ -534,9 +571,9 @@ def pkg_plot(pkg, cvestats):
 
 
     for i in range(len(year)):
-        yearsx.append(i + 1995)
+        yearsx.append(i + 2000)
 
-    k = 1995
+    k = 2000
     datapoints = []
     for i in range(len(month)):
         datapoints.append(i+1)
@@ -666,6 +703,7 @@ cve_db = client.cvedb
 src2dsa = dict()
 dsa2cve = dict()
 cvetable = dict()
+src2month = dict()
 
 (state, err) = load_state()
 state['vendor'] = 'debian'
@@ -678,10 +716,11 @@ state['vendor'] = 'debian'
 if action == 'update':
     (dsatable, src2dsa, dsa2cve, cvetable) = load_DBs()
 #    loadsha1lists()
-    aptsec_update(state,config, dsatable, client, src2dsa, dsa2cve, cvetable)
+    aptsec_update(state,config, dsatable, client, src2dsa, dsa2cve, src2month, cvetable)
 #    save_sha1lists()
     save_DBs(dsatable, src2dsa, dsa2cve, cvetable)
     save_state(state)
+    ml.predict(src2month)
 elif action == 'status':
     load_DBs or exit(1)
     #handle errors more gracefully

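For reference, a minimal sketch of what the new format_data helper stores per package (the package name and CVE dates below are invented; cvestats is assumed to map date strings to counts, as the parser.parse call suggests). Each package ends up with a flat vector of 17 * 12 = 204 monthly CVE counts covering 2000-2016:

    import numpy as np
    from dateutil import parser

    # hypothetical input: CVE publication date -> number of CVEs on that date
    cvestats = {'2015-01-28': 2, '2016-03-02': 1}

    monthly = np.zeros(17 * 12)          # Jan 2000 .. Dec 2016, one bucket per month
    for date_str, count in cvestats.items():
        d = parser.parse(date_str)
        monthly[(d.year - 2000) * 12 + (d.month - 1)] += count

    src2month = {'openssl': monthly}     # what format_data writes into src2month
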
+ 81 - 0
machine_learning.py

@@ -0,0 +1,81 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+
+import argparse
+import sys
+
+# Import data
+
+import tensorflow as tf
+
+FLAGS = None
+
+def weight_variable(shape):
+    initial = tf.truncated_normal(shape, stddev = 0.1)
+    return tf.Variable(initial)
+
+def bias_variable(shape):
+    initial = tf.constant(0.1, shape=shape)
+    return tf.Variable(initial)
+
+def conv2d(x, W):
+    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
+
+def max_pool_2x2(x):
+    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+
+def predict(src2month):
+#    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
+
+    # one row per package; all but the last 12 months are used for training
+    pkg_num = len(src2month)
+    training_num = len(src2month['linux'])-12
+
+    # features: the first training_num monthly counts of each package
+    # target: the total number of vulnerabilities in the remaining 12 months
+    training_table = np.zeros((pkg_num, training_num))
+    test_values = np.zeros((pkg_num, 1))
+
+    i = 0
+    for key, value in src2month.items():
+        training_table[i] = value[0:training_num]
+        test_values[i, 0] = np.sum(value[training_num:])
+        i += 1
+
+    
+
+
+    # Create the model
+    x = tf.placeholder(tf.float32, [None, training_num])
+    W = tf.Variable(tf.zeros([training_num, 1]))
+    b = tf.Variable(tf.zeros([1]))
+    y = tf.matmul(x, W) + b
+
+    # Define loss and optimizer
+    y_ = tf.placeholder(tf.float32, [None, 1])
+
+    # The raw formulation of cross-entropy,
+    #
+    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
+    #                                 reduction_indices=[1]))
+    #
+    # can be numerically unstable.
+    #
+    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
+    # outputs of 'y', and then average across the batch.
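+    # For example, if the softmax saturates to exactly 0 for some class,
+    # tf.log(0.) is -inf and the resulting 0 * -inf term becomes NaN; the
+    # fused op works on the logits directly and avoids this.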
+    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
+    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
+
+    sess = tf.InteractiveSession()  # needed by the .run() calls below
+    # Train
+    tf.global_variables_initializer().run()
+    for _ in range(1000):
+        sess.run(train_step, feed_dict={x: training_table, y_: test_values})
+#    for _ in range(1000):
+#        batch_xs, batch_ys = mnist.train.next_batch(100)
+#        sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
+    print(sess.run(y, feed_dict={x: training_table}))  # predictions, not just the tensor object
+    # Test trained model
+    correct_prediction = tf.equal(y, y_)
+    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
+    print(sess.run(accuracy, feed_dict={x: training_table, y_: test_values}))

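The apt-sec.py hunk above calls this as ml.predict(src2month) after the update run. A minimal standalone sketch of that contract (the package names and counts are invented; the dictionary must contain a 'linux' entry, since its length defines training_num, and every vector needs more than 12 months of data):

    import numpy as np
    import machine_learning as ml

    months = 17 * 12   # as produced by format_data (Jan 2000 .. Dec 2016)

    # hypothetical monthly CVE counts per source package
    src2month = {
        'linux':   np.random.poisson(3.0, months).astype(np.float32),
        'openssl': np.random.poisson(1.0, months).astype(np.float32),
    }

    # trains on the first months-12 values of each package; the summed count of
    # the last 12 months is used as the regression target
    ml.predict(src2month)
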
+ 25 - 10
output.txt

File diff suppressed because it is too large

