Nikolaos Alexopoulos 6 years ago
parent
commit
b9b2748fc2
2 changed files with 1082 additions and 132 deletions
  1. apt-sec.py  (309 additions, 132 deletions)
  2. prediction.py  (773 additions, 0 deletions)

+ 309 - 132
apt-sec.py

@@ -6,7 +6,7 @@ import sys
 import os
 from pymongo import MongoClient
 #mongodb assumes database at default path
-import logging, sys
+import logging
 import configparser
 import json
 import csv
@@ -19,7 +19,7 @@ import numpy as np
 from dateutil import parser
 import plotly.plotly as py
 import plotly.graph_objs as go
-import lstm_reg as lstm
+#import lstm_reg as lstm
 import metadata as meta
 import deps
 import psycopg2
@@ -27,7 +27,10 @@ import powerlaw as pl
 import DLAmine as dla
 import pickle
 import paper_plots as carlosplt
+import stat_tests as stats
 from matplotlib2tikz import save as tikz_save
+import prediction as pred
+import scipy.stats as stats
 
 logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
 ## Increase the recursion limit by much to allow bs to parse large files
@@ -119,6 +122,7 @@ def load_DBs():
     src2sloccount = dict()
     src2pop = dict()
     src2deps = dict()
+    pkg_with_cvss = dict()
 
     cache = config['DIR']['cache_dir']
     
@@ -166,6 +170,13 @@ def load_DBs():
     except (IOError, ValueError):
         print('read cache src2month failed!! Maybe first run of the system?')
     
+    cache_pkg_with_cvss = cache + 'pkg_with_cvss'
+    try:
+        with open(cache_pkg_with_cvss) as fp:
+            pkg_with_cvss = json.load(fp)
+    except (IOError, ValueError):
+        print('read cache pkg_with_cvss failed!! Maybe first run of the system?')
+    
     cache_src2sloccount = cache + 'src2sloccount'
     try:
         with open(cache_src2sloccount) as fp:
@@ -180,7 +191,7 @@ def load_DBs():
     except (IOError, ValueError):
         print('read cache src2pop failed!! Maybe first run of the system?')
     
-    return(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps)
+    return(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, pkg_with_cvss)
 
 
 ###############################################################################
@@ -192,7 +203,7 @@ def myconverter(o):
         return o.astype(int)
 ###############################################################################
 ## save to files
-def save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, src2sum):
+def save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, src2sum, pkg_with_cvss):
     cache = config['DIR']['cache_dir']
     
     cache_dsatable = cache + 'dsatable'
@@ -276,6 +287,24 @@ def save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src
         print('write cache src2month failed!! Fatal error')
         sys.exit(1)
 
+    cache_pkg_with_cvss = cache + 'pkg_with_cvss'
+    int_list = dict()
+    
+    for element in pkg_with_cvss:
+        for i in range(len(pkg_with_cvss[element])):
+            if element in int_list:
+                int_list[element].append(pkg_with_cvss[element][i])
+            else:
+                int_list[element] = []
+                int_list[element].append(pkg_with_cvss[element][i])
+    try:
+        with open(cache_pkg_with_cvss, 'w') as fp:
+            json.dump(int_list, fp, default = myconverter)
+    except IOError:
+        print('write cache pkg_with_cvss failed!! Fatal error')
+        sys.exit(1)
+
+
 ###############################################################################
 ## Fetch current Packages, Sources and sha1sums files
 ## These are needed to find CVE stats by sha1sums/pkg-names
@@ -518,13 +547,14 @@ def resolvePkg2Src(pkglist, pkg2src):
 ##  (srcpkg=> ())
 def processCVEs(pkg, now, src2dsa, dsa2cve, src2month, cvetable, pkg_with_cvss, config):
     stats = [now, 0, 0, 0, 0, 0, 0]
-    mylambda = config['TRUST']['lambda']
+    #mylambda = config['TRUST']['lambda']
+    mylambda = 0
     cvestats = dict()
     logging.info('Processing package: ' + pkg + '.\n')
 
     ## keep track of the number of low-medium-high severity vulnerabilities
     ## TODO see how cvss affects vulnerability prediction - if some packages show patterns
-    temp_cvss = 0.0
+    temp_cvss = 10.0
     with_cvss = dict()
 
     ## To eliminate duplicate cves
@@ -536,7 +566,7 @@ def processCVEs(pkg, now, src2dsa, dsa2cve, src2month, cvetable, pkg_with_cvss,
             if cve_id in haveseen:
                 continue
             else:
-                haveseen[cve_id]=1
+                haveseen[cve_id] = 1
                 tt = cvetable[cve_id][0]
                 if tt in cvestats:
                     cvestats[tt] += 1
@@ -547,29 +577,39 @@ def processCVEs(pkg, now, src2dsa, dsa2cve, src2month, cvetable, pkg_with_cvss,
     ## Date at the moment taken from CVE? - not sure.
 
     ## with_cvss = (date: number low, number med, number high)
+    haveseen = dict()
     for dsa_id in src2dsa[pkg]:
         for cve_id in dsa2cve[str(dsa_id)]:
-            tt = cvetable[cve_id][0]
-            try: temp_cvss = float(cvetable[cve_id][2])
-            except TypeError:
-                print(cve_id)
+            if cve_id in haveseen:
                 continue
-
-            if tt in with_cvss:
-                if (temp_cvss<4.0):
-                    with_cvss[tt][0] += 1
-                elif (temp_cvss<7.0):
-                    with_cvss[tt][1] += 1
-                else:
-                    with_cvss[tt][2] += 1
             else:
-                with_cvss[tt] = [0, 0, 0]
-                if (temp_cvss<4.0):
-                    with_cvss[tt][0] += 1
-                elif (temp_cvss<7.0):
-                    with_cvss[tt][1] += 1
+                haveseen[cve_id] = 1
+                tt = cvetable[cve_id][0]
+                try: temp_cvss = float(cvetable[cve_id][2])
+                except TypeError:
+                    print(cve_id)
+                    continue
+                if pkg=='linux':
+                    print(tt, temp_cvss)
+                
+                if tt in with_cvss:
+                    if (temp_cvss<4.0):
+                        with_cvss[tt][0] += 1
+                    elif (temp_cvss<7.0):
+                        with_cvss[tt][1] += 1
+                    else:
+                        with_cvss[tt][2] += 1
                 else:
-                    with_cvss[tt][2] += 1
+                    with_cvss[tt] = [0, 0, 0]
+                    if (temp_cvss<4.0):
+                        with_cvss[tt][0] += 1
+                    elif (temp_cvss<7.0):
+                        with_cvss[tt][1] += 1
+                    else:
+                        with_cvss[tt][2] += 1
+
+    if pkg=='linux':
+        print(with_cvss)
 
 
     # Ignore pkgs with less than one incident, should not happen..
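The added loop above buckets each CVE into low, medium, or high severity by its CVSS score (below 4.0, below 7.0, otherwise high) and accumulates the counts per date. A minimal standalone sketch of that bucketing rule, using hypothetical (date, score) pairs:

    with_cvss = {}
    for tt, score in [('2017-03', 3.1), ('2017-03', 9.8), ('2017-04', 5.0)]:
        counts = with_cvss.setdefault(tt, [0, 0, 0])   # [low, medium, high]
        if score < 4.0:
            counts[0] += 1
        elif score < 7.0:
            counts[1] += 1
        else:
            counts[2] += 1
    # with_cvss == {'2017-03': [1, 0, 1], '2017-04': [0, 1, 0]}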
@@ -591,7 +631,7 @@ def processCVEs(pkg, now, src2dsa, dsa2cve, src2month, cvetable, pkg_with_cvss,
 
     print(pkg + ' ' + str(count))
 
-#    pkg_with_cvss[pkg] = with_cvss
+    #pkg_with_cvss[pkg] = with_cvss
     format_data(pkg, with_cvss, pkg_with_cvss, True)
 
     format_data(pkg, cvestats, src2month, False)
@@ -624,7 +664,7 @@ def format_data(pkg, cvestats, src2month, cvss):
     
     items.sort(key=lambda tup: tup[0])
 
-    for i in range(2000, 2018):
+    for i in range(2000, 2019):
         temp = []
         for j in range(12):
             if cvss:
@@ -748,32 +788,45 @@ def pkg_plot(pkg, cvestats):
 ###############################################################################
 ## populate src2sloccount dictionary with number of source lines of code in 
 ## format (total, [ansic, cpp, asm, java, python, perl, sh])
-def getslocs(src2dsa, src2sloccount):
+def getslocs(src2month, src2sloccount):
+    with open('./sloc_report.txt') as f:
+        content = f.readlines()
+        for i in content:
+            (total, ansic, cpp, asm, java, python, perl, sh) = (0, 0, 0, 0, 0, 0, 0, 0)
+            words=i.split()
+            total = int(words[0])
+            pkg = words[1]
+            for w in words[2:]:
+                ww = w.split('=')
+                if ww[0] == 'ansic':
+                    ansic = int(ww[1])
+                if ww[0] == 'cpp':
+                    cpp = int(ww[1])
+                if ww[0] == 'asm':
+                    asm = int(ww[1])
+                if ww[0] == 'java':
+                    java = int(ww[1])
+                if ww[0] == 'python':
+                    python = int(ww[1])
+                if ww[0] == 'perl':
+                    perl = int(ww[1])
+                if ww[0] == 'sh':
+                    sh = int(ww[1])
+            src2sloccount[pkg] = (total, [ansic, cpp, asm, java, python, perl, sh])
 
-    try:
-        conn = psycopg2.connect("dbname = 'debsources' user = 'postgres' host = 'localhost' password = 'nik'")
-    except:
-        print('I am unable to connect to the database')
-    
-    cur = conn.cursor()
 
-    for pkg in src2dsa:
-        if pkg not in src2sloccount:
-            print(pkg)
-            src2sloccount[pkg] = meta.getsloccount(cur, pkg)
 
-    return
 
 ###############################################################################
 ## get popularity contest data in format src_pkg -> (installed, vote, old, recent)
 def getpop(src2dsa, src2pop):
 
-    with open('Debian_pop.csv', newline = '') as csvfile:
+    with open('by_vote.csv', newline = '') as csvfile:
         reader = csv.reader(csvfile, delimiter = ',', quotechar = '|')
         for row in reader:
             try:
-                if row[1] in src2dsa and not (row[1] in src2pop):
-                    src2pop[row[1]] = row[2:6]
+                if row[1] in src2dsa:
+                    src2pop[row[1]] = row[3]
             except IndexError:
                 print(row)
                 continue
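The rewritten getslocs() above now reads sloc_report.txt, where each line starts with the total SLoC count and the source package name, followed by per-language key=value pairs. A hypothetical line and the entry it would produce (values invented for illustration):

    # sloc_report.txt line:
    #   123456 openssl ansic=110000 perl=9000 sh=400
    # resulting entry, with languages ordered [ansic, cpp, asm, java, python, perl, sh]:
    #   src2sloccount['openssl'] == (123456, [110000, 0, 0, 0, 0, 9000, 400])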
@@ -822,27 +875,87 @@ def simulate_stats(pkg, year):
 
 ###############################################################################
 ##TODO Printing functions
-def plot_all(src2month):
+def plot_all(src2month, src2sloccount, pkg_with_cvss):
     ## Sum of vulnerabilities by package
     src2sum = dict()
     src2year = dict()
-    years = 16  # 2001 - 2000 + years
+    src2month_loc=dict()
+    src2lastyears = dict()
+    src2dens = dict()
+
+    src2month_temp = dict()
+
+    for i in pkg_with_cvss:
+        src2month_temp[i]=[]
+        for j in range(len(src2month[i])):
+            #src2month_temp[i].append(pkg_with_cvss[i][j][1]+pkg_with_cvss[i][j][2])
+            src2month_temp[i].append(pkg_with_cvss[i][j][2])
+
+    for i in src2month:
+        src2month_loc[i]=src2month_temp[i][:-12] #cut data for 2018
+
+    years = 17  # 2001 - 2000 + years
     year_sum = [0] * years
     year_num = [0] * years
-    for pkg in src2month:
+    for pkg in src2month_loc:
         for j in range(years):
-            temp =  sum(src2month[pkg][12*(1+j):12*(2+j)])
+            temp =  sum(src2month_loc[pkg][12*(1+j):12*(2+j)])
             if (temp>0):
                 year_num[j] += 1
             year_sum[j] += temp 
-        total = sum(src2month[pkg][12:])
-        last_year = sum(src2month[pkg][-12:])
-        print(pkg + '; ' + str(last_year))
+        ## For last 2 years
+        total = sum(src2month_loc[pkg][:])
+        last_years = sum(src2month_loc[pkg][-24:])
+        print(pkg + '; ' + str(last_years))
         if (total>1):
             src2sum[pkg] = total
+            src2lastyears[pkg] = last_years
     
+    #calc total
+    sum_total = 0
+    one_only=0
+    one_plus=0
+    for p in src2month:
+        sum_part = sum(src2month_loc[p][:])
+        sum_total += sum_part
+        if (sum_part == 1):
+            one_only += 1
+        elif (sum_part>1):
+            one_plus += 1
+
+    print('Total last 2 years = ', sum_total)
+    print('one_only = ', one_only)
+    print('one_plus = ', one_plus)
+
     values = sorted(src2sum.values(),reverse=True)
+    #print(values)
     keys = list(sorted(src2sum, key=src2sum.__getitem__, reverse=True))
+    
+    density = []
+    density_keys=[]
+    size = []
+    size_dens = []
+
+    for pkg in keys:
+        try:
+            size.append(src2sloccount[pkg][0]/1000)
+        except (KeyError):
+            size.append(0)
+
+
+    j=0
+
+    for pkg in keys:
+        try:
+            if (src2sloccount[pkg][0])>0:
+                density.append((values[j]/src2sloccount[pkg][0])*1000)
+                density_keys.append(pkg)
+                src2dens[pkg] = (values[j]/src2sloccount[pkg][0])*1000
+                size_dens.append(src2sloccount[pkg][0])
+        except(KeyError):
+            pass
+        j += 1
+    
     i = 0
     few_keys = []
     #print(keys)
@@ -857,19 +970,85 @@ def plot_all(src2month):
     carlosplt.pre_paper_plot(True)
     #plt.style.use('ggplot')
 
+    print('Spearman correlation: ',stats.spearmanr(values,size))
+    with open('sizes.txt', 'w') as thefile:
+        for item in size:
+            thefile.write("%.3f\n" % item)
+
     plt.figure(figsize=(10,5))
-    plt.plot([1000] + values, color='darkblue', lw = 2)
-    plt.xticks(np.arange(1,len(src2sum),10.0)+1,few_keys, rotation="vertical")
+    plt.plot(values, color='darkblue', lw = 2)
+    #plt.plot(size, 'ro', color='darkred', lw = 2, label='Size in KSLoC')
+    plt.xticks(np.arange(0,len(src2sum),10.0),few_keys, rotation="vertical")
     plt.ylabel('Vulnerabilities')
     plt.yscale('log')
+    plt.grid()
     #plt.xscale('log')
     plt.tight_layout()
+    plt.legend()
     carlosplt.post_paper_plot(True,True,True)
-    tikz_save('line.tex', figureheight='\\figureheight', figurewidth='\\figurewidth')
     plt.show()
 
     print('Yearly vulnerabilites in total' + str(year_sum))
 
+    src2sloc = dict()
+    for pkg in src2sloccount:
+        src2sloc[pkg] = src2sloccount[pkg][0]
+
+    ## Density
+    density = sorted(src2dens.values(),reverse=True)
+    with open('densities.txt', 'w') as thefile:
+        for item in density:
+            thefile.write("%.3f\n" % item)
+    density_keys = list(sorted(src2dens, key=src2dens.__getitem__, reverse=True))
+    density_few_keys =[]
+    for k in density_keys:
+        if (i==0):
+            density_few_keys.append(k)
+        i+=1
+        if (i==10):
+            i = 0
+
+    plt.figure(figsize=(10,5))
+    plt.plot(size_dens, density, 'ro', color='darkblue', lw = 2)
+    plt.xticks(np.arange(0,len(density),10.0),density_few_keys, rotation="vertical")
+    plt.ylabel('Vulnerability density')
+    plt.yscale('log')
+    plt.xscale('log')
+    plt.tight_layout()
+    carlosplt.post_paper_plot(True,True,True)
+    plt.show()
+
+    ## Spearman density size
+    print('Spearman correlation: ',stats.spearmanr(density,size_dens))
+
+
+    ## SLoCs
+    values = sorted(src2sloc.values(),reverse=True)
+    #print(values)
+    keys = list(sorted(src2sloc, key=src2sloc.__getitem__, reverse=True))
+    
+    i = 0
+    few_keys = []
+    for k in keys:
+        if (i==0):
+            few_keys.append(k)
+        i+=1
+        if (i==10):
+            i = 0
+
+    carlosplt.pre_paper_plot(True)
+
+    plt.figure(figsize=(10,5))
+    plt.plot(values, color='darkblue', lw = 2)
+    plt.xticks(np.arange(0,len(src2sloc),10.0),few_keys, rotation="vertical")
+    plt.ylabel('SLoC')
+    plt.yscale('log')
+    plt.xscale('log')
+    plt.tight_layout()
+    carlosplt.post_paper_plot(True,True,True)
+    plt.show()
+
+
     ## Number of affected packages
     n = len(year_sum)
     yearsx = []
@@ -892,9 +1071,9 @@ def plot_all(src2month):
 
     
 
-    print(average_per_year)
+    #print(average_per_year)
     x_values = list(range(1,years+1))
-    print(x_values)
+    #print(x_values)
     slope = np.polyfit(x_values,average_per_year,1)
 
     #slope = np.polyval(slope,x_values)
@@ -918,15 +1097,17 @@ def plot_all(src2month):
     
     quarter_num = years*4 
     
-    pkg = 'php5'
-    quarter_sum = [0] * quarter_num
-    for j in range(quarter_num):
-        temp =  sum(src2month[pkg][12+3*j:12+3*(j+1)])
-        quarter_sum[j] = temp
-    src2quarter[pkg] = quarter_sum
-
-    for pkg in src2quarter:
-        n = len(src2quarter[pkg])
+    # Here for only up to 2016 - let's change that
+    #return(src2sum)
+#    pkg = 'php5'
+#    quarter_sum = [0] * quarter_num
+#    for j in range(quarter_num):
+#        temp =  sum(src2month_loc[pkg][12+3*j:12+3*(j+1)])
+#        quarter_sum[j] = temp
+#    src2quarter[pkg] = quarter_sum
+
+#    for pkg in src2quarter:
+#        n = len(src2quarter[pkg])
     quartersx = []
     for i in range(1,years+1):
         for j in range(1,5):
@@ -937,55 +1118,56 @@ def plot_all(src2month):
 
 
 
-    x = range(quarter_num)
-    width = 1/2
+#    x = range(quarter_num)
+#    width = 1/2
     ## Plot different colors for php
-    before = src2quarter[pkg][:-8] + ([0] * 8)
-    after = ([0] * (len(before)-8)) + src2quarter[pkg][-8:]
-    print(len(src2quarter[pkg]))
+#    before = src2quarter[pkg][:-8] + ([0] * 8)
+#    after = ([0] * (len(before)-8)) + src2quarter[pkg][-8:]
+#    print(len(src2quarter[pkg]))
 #
-    bar1 = plt.bar(x[:-26], before[24:-2], width, color='darkblue', label='before php7', edgecolor='black')
-    bar2 = plt.bar(x[:-26], after[24:-2], width, color='darkred', label='after php7', edgecolor='black')
-    plt.legend(handles=[bar1, bar2])
+#    bar1 = plt.bar(x[:-26], before[24:-2], width, color='darkblue', label='before php7', edgecolor='black')
+#    bar2 = plt.bar(x[:-26], after[24:-2], width, color='darkred', label='after php7', edgecolor='black')
+#    plt.legend(handles=[bar1, bar2])
 #
-    print('PHP Sum before: ' + str(sum(before)))
-    print('PHP Sum after: ' + str(sum(after)))
+#    print('PHP Sum before: ' + str(sum(before)))
+#    print('PHP Sum after: ' + str(sum(after)))
    
-    plt.xticks(np.arange(0,n-26),quartersx[24:-2], rotation="vertical")
-    plt.ylabel('Vulnerabilities per quarter of package ' + pkg)
-    plt.xlabel('Quarter')
-    carlosplt.post_paper_plot(True,True,True)
-    plt.show()
+#    plt.xticks(np.arange(0,n-26),quartersx[24:-2], rotation="vertical")
+#    plt.ylabel('Vulnerabilities per quarter of package ' + pkg)
+#    plt.xlabel('Quarter')
+#    carlosplt.post_paper_plot(True,True,True)
+#    plt.show()
 
 #    ## Plot for openjdk-7
-    pkg = 'openjdk-7'
-    quarter_sum = [0] * quarter_num
-    for j in range(quarter_num):
-        temp =  sum(src2month[pkg][12+3*j:12+3*(j+1)])
-        quarter_sum[j] = temp
-    src2quarter[pkg] = quarter_sum
-
-    n = len(src2quarter[pkg])
-    x = range(quarter_num)
-    width = 1/2
+    #pkg = 'openjdk-8'
+    #pkg = 'openjdk-7'
+    #quarter_sum = [0] * quarter_num
+    #for j in range(quarter_num):
+    #    temp =  sum(src2month_loc[pkg][12+3*j:12+3*(j+1)])
+    #    quarter_sum[j] = temp
+    #src2quarter[pkg] = quarter_sum
+
+    #n = len(src2quarter[pkg])
+    #x = range(quarter_num)
+    #width = 1/2
 #    ## Plot different colors for openjdk
-    before = src2quarter[pkg][:-10] + ([0] * 10)
-    after = ([0] * (len(before)-10)) + src2quarter[pkg][-10:]
-    print(len(src2quarter[pkg]))
+    #before = src2quarter[pkg][:-10] + ([0] * 10)
+    #after = ([0] * (len(before)-10)) + src2quarter[pkg][-10:]
+    #print(len(src2quarter[pkg]))
 
-    bar1 = plt.bar(x[:-48], before[48:], width, color='darkblue', label='before openjdk-8', edgecolor='black')
-    bar2 = plt.bar(x[:-48], after[48:], width, color='darkred', label='after openjdk-8', edgecolor='black')
-    plt.legend(handles=[bar1, bar2])
+    #bar1 = plt.bar(x[:-48], before[48:], width, color='darkblue', label='before openjdk-8', edgecolor='black')
+    #bar2 = plt.bar(x[:-48], after[48:], width, color='darkred', label='after openjdk-8', edgecolor='black')
+    #plt.legend(handles=[bar1, bar2])
     
-    print('OpenJDK Sum before: ' + str(sum(before)))
-    print('OpenJDK Sum after: ' + str(sum(after)))
+    #print('OpenJDK Sum before: ' + str(sum(before)))
+    #print('OpenJDK Sum after: ' + str(sum(after)))
         
     #plt.bar(x, src2quarter[pkg], width, color='red')
-    plt.xticks(np.arange(0,n-48),quartersx[48:], rotation="vertical")
-    plt.ylabel('Vulnerabilities per quarter of package ' + pkg)
-    plt.xlabel('Quarter')
-    carlosplt.post_paper_plot(True,True,True)
-    plt.show()
+    #plt.xticks(np.arange(0,n-48),quartersx[48:], rotation="vertical")
+    #plt.ylabel('Vulnerabilities per quarter of package ' + pkg)
+    #plt.xlabel('Quarter')
+    #carlosplt.post_paper_plot(True,True,True)
+    #plt.show()
 
 
 
@@ -1002,8 +1184,8 @@ def plot_all(src2month):
     carlosplt.post_paper_plot(True,True,True)
     plt.show()
 
-    sumall = sum(values)
-    print(sumall)
+    sum_all = sum(values)
+    print("Total: ", sum_all)
     ###############################################################################################
    
     # Get LTS and plot
@@ -1013,14 +1195,15 @@ def plot_all(src2month):
     except IOError:
         ltslist = dla.getDLAs()
 
+    print(ltslist)
     ## Plot for wheezy
-    
+    quarter_num += 1
     quarter_sum = [0] * quarter_num
     totalLTS = [0] * (14 * 12) + ltslist
     
-    for pkg in src2month:
+    for pkg in src2month_loc:
         for j in range(quarter_num):
-            temp =  sum(src2month[pkg][12+(3*j):12+3*(j+1)])
+            temp =  sum(src2month_loc[pkg][12+(3*j):12+3*(j+1)])
             quarter_sum[j] += temp 
 
     LTS_quarter = []
@@ -1029,6 +1212,8 @@ def plot_all(src2month):
         temp = sum(totalLTS[12+(3*j):12+3*(j+1)])
         LTS_quarter.append(temp)
 
+    quartersx.append("Q1'18")
+
     ## Print all LTS
     cut = 12*4+1
     n = len(quarter_sum)
@@ -1047,25 +1232,23 @@ def plot_all(src2month):
     
     
     ## Filter only wheezy:
-    quarter_sum_regular = [0] * (12*4+1) + quarter_sum[12*4+1:12*4+9] + [0] * 11
-    quarter_sum_errors = [0] * (12*4 + 9) + quarter_sum[12*4+9:12*4+9+5] + [0] * 6
+    quarter_sum_regular = [0] * (12*4+1) + quarter_sum[12*4+1:12*4+9] + [0] * 12
+    quarter_sum_errors = [0] * (12*4 + 9) + quarter_sum[12*4+9:12*4+9+5] + [0] * 7
     LTS_quarter = [0] * (15*4+2) + LTS_quarter[15*4+2:]
 
-    print(quarter_sum_errors)
+    #print(quarter_sum_errors)
 
     cut = 12*4+1
     n = len(quarter_sum) - cut
     x = range(quarter_num-cut)
     width = 1/2
     
-    print(len(LTS_quarter))
+    #print(len(LTS_quarter))
 
     bar1 = plt.bar(x, quarter_sum_regular[cut:], width, color='darkblue', label='regular', edgecolor='black')
     bar12 = plt.bar(x, quarter_sum_errors[cut:], width, color='darkorange', label='regular*', edgecolor='black')
     bar2 = plt.bar(x, LTS_quarter[cut:], width, color='darkred', label ='long-term', edgecolor='black')
 
-    #bar1 = plt.bar(x[:-48], before[48:], width, color='blue', label='regular support')
-    #bar2 = plt.bar(x[:-48], after[48:], width, color='red', label='long-term support')
     plt.legend(handles=[bar1, bar12, bar2])
     
     plt.xticks(np.arange(0,n),quartersx[cut:], rotation="vertical")
@@ -1132,9 +1315,10 @@ try:
     action = sys.argv[1]
 except IndexError:
     print('No argument given')
-    aptsec_help()
-    sys.exit(0)
-    action = ''
+    action='update'
+    #aptsec_help()
+    #sys.exit(0)
+    #action = ''
 
 
 client = MongoClient()
@@ -1160,41 +1344,34 @@ state['vendor'] = 'debian'
 #    os.makedirs(d)
 
 if action == 'update':
-    (dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps) = load_DBs()
+    (dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, pkg_with_cvss) = load_DBs()
 #    loadsha1lists()
     aptsec_update(state,config, dsatable, client, src2dsa, dsa2cve, src2month, cvetable, pkg_with_cvss)
 #    save_sha1lists()
-#    getslocs(src2dsa, src2sloccount)
+#    getslocs(src2month, src2sloccount)
 #    getpop(src2dsa, src2pop)
 #    getdeps(src2dsa, src2deps)
-    save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, src2sum)
+    save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, src2sum, pkg_with_cvss)
     save_state(state)
+#    stats.test(src2month, src2pop, src2sloccount)
 #    lstm.predict(src2month, src2sloccount, src2pop, src2deps)
+    pred.predict(src2month, 0)
 #    print(pkg_with_cvss['linux'])
     
     low = []
     med = []
     high = []
     
-    for item in pkg_with_cvss['firefox-esr']:
-        low.append(item[0])
-        med.append(item[1])
-        high.append(item[2])
-
-#    plt.plot(low, color = 'green')
-#    plt.plot(med, color = 'orange')
-#    plt.plot(high, color = 'red')
-#    plt.show()
-
 elif action == 'status':
-    (dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps) = load_DBs()
+    (dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, pkg_with_cvss) = load_DBs()
     aptsec_status(sys.argv[2])
 elif action == 'show':
-    (dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps) = load_DBs()
-    src2sum = plot_all(src2month)
-    save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, src2sum)
+    (dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, pkg_with_cvss) = load_DBs()
+    src2sum = plot_all(src2month, src2sloccount, pkg_with_cvss)
+    save_DBs(dsatable, src2dsa, dsa2cve, cvetable, src2month, src2sloccount, src2pop, src2deps, src2sum, pkg_with_cvss)
 else:
     aptsec_help()
 
 
+
 save_state(state)
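For reference, pkg_with_cvss as loaded and saved above maps a source package to one [low, medium, high] triple per month, on the same monthly grid as src2month, and plot_all() keeps only the high-severity column. A small sketch of that selection, assuming the structure produced by format_data():

    # hypothetical: three months of [low, med, high] counts for one package
    pkg_with_cvss = {'openssl': [[1, 0, 0], [0, 2, 1], [0, 0, 3]]}
    high_only = [month[2] for month in pkg_with_cvss['openssl']]   # -> [0, 1, 3]
    # mirrors src2month_temp[i].append(pkg_with_cvss[i][j][2]) in plot_all()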

+ 773 - 0
prediction.py

@@ -0,0 +1,773 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas
+import math
+import sys
+from keras import backend as K
+from keras.models import Sequential
+from keras.layers import LSTM
+from keras.layers import GRU
+from keras import optimizers as opt
+from keras.layers import Dense
+from keras.layers import Activation, Dropout
+from keras.models import load_model
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.metrics import mean_squared_error
+import copy
+import random
+import arimaPred as arim
+import paper_plots as carlosplt
+
+
+np.random.seed(7)
+EPSILON = 10 ** -12
+
+def moving_average(a, n) :
+    ## code from Jaime
+    ret = np.cumsum(a, dtype=float)
+    ret[n:] = ret[n:] - ret[:-n]
+    ret = np.concatenate((np.zeros(n-1), ret[n-1:]))
+    return ret / n
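A quick worked check of the zero-padded moving average above, using hypothetical values:

    moving_average(np.array([1, 2, 3, 4, 5]), n=3)
    # returns array([0., 0., 2., 3., 4.]); the first n-1 positions are zero
    # padding, the remaining entries are trailing 3-point means.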
+
+
+# convert an array of values into a dataset matrix
+# ATTENTION: THIS FUNCTION CHANGES SIZE OF INPUT
+def create_dataset(original_dataset, dataset, num_steps, look_back=1, feat_num=1, extra_features=[]):
+    dataX, dataY = [], []
+    print('Making training length :', len(original_dataset), ' ; ', len(dataset))
+    for i in range(len(dataset)-look_back-num_steps +1):
+                a = []
+                for j in range(i, i+look_back):
+                    if feat_num>1:
+                        a.append([original_dataset[j]]+extra_features.tolist())
+                    else:
+                        a.append([original_dataset[j]])
+                dataX.append(a)
+                mean = 0
+                for j in range(num_steps):
+                    mean += original_dataset[i+look_back+j]
+
+
+                dataY.append(mean/num_steps)
+    return np.array(dataX), np.array(dataY)
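A minimal sketch of the windowing above with hypothetical values and the default feat_num=1 (one feature per time step):

    X, Y = create_dataset([1, 2, 3, 4], [1, 2, 3, 4], num_steps=1, look_back=2)
    # X has shape (2, 2, 1): [[[1], [2]], [[2], [3]]]
    # Y is [3., 4.]: the mean of the next num_steps values after each window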
+
+def test_model(pkg_name, original_dataset, dataset, scaler, num_steps, smoothing, batch_num, look_back, test_size, mixed, feat_num=1, extra_features=[]):
+    ## mixed is a boolean. When True, the model trained on all the packages is used. When false, each package has its own model.
+    
+    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_features)
+    K.clear_session()
+
+    if mixed:
+        new_model = load_model('./models/all_packages'+str(num_steps) + '.h5')
+    else:
+        new_model = load_model('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
+    
+    new_model.reset_states()
+    print(len(totalX))
+    totalPredict = new_model.predict(totalX[len(totalX)-batch_num:], batch_size = batch_num)
+    del new_model
+    
+    #scaler = scalers[pkg_name]
+    totalPredict = scaler.inverse_transform(totalPredict)
+    totalPredict = totalPredict.flatten()
+    total_LSTM = [0] * (len(dataset)-len(totalPredict)) + totalPredict.tolist()
+    #print(len(total_LSTM))
+    #print(total_LSTM)
+
+    #totalY = totalY[18:]
+    totalY = totalY.reshape(-1,1)
+    totalY = scaler.inverse_transform(totalY)
+    totalY = totalY.flatten()
+
+    temp = dataset.reshape(-1,1)
+    temp = scaler.inverse_transform(temp)
+    temp = temp.flatten()
+    
+    if (math.fabs(totalY[-1]-temp[-1])>EPSILON):
+        print("Possible fault!!", totalY[-1], temp[-1])
+
+
+    return total_LSTM[:]
+
+def predict_stationary(original, dataset, num_steps):
+    prediction = dict()
+    for pkg in dataset:
+        data = dataset[pkg]
+        pred = []
+        i = 0
+        for month in data:
+            if i<num_steps:
+                pred.append(0)
+            else:
+                pred.append(data[i-num_steps])
+            i+=1
+        prediction[pkg] = pred[:]
+    return prediction
+
+def predict_average(original, dataset, num_steps):
+    prediction = dict()
+    for pkg in dataset:
+        o_data = dataset[pkg]
+        average_list = []
+        i = 0
+        j = i
+        flag = True
+        for month in o_data:
+            if(month==0 and flag):
+                average_list.append(0)
+                i+=1
+                j=i
+            else:
+                flag = False
+                if(len(o_data[i:j])<=num_steps):
+                    average_list.append(0)
+                else:
+                    # average up until before the time
+                    average = sum(o_data[i:j-num_steps])/len(o_data[i:j-num_steps])
+                    average_list.append(average)
+                j += 1
+        prediction[pkg]=average_list[:]
+    return prediction
+
+def predict_Waverage(original, dataset, Lamd, num_steps):
+    prediction = dict()
+    for pkg in dataset:
+        o_data = dataset[pkg]
+        waverage_list = []
+        i = 0
+        j = i
+        flag = True
+        for month in o_data:
+            if(month == 0 and flag):
+                waverage_list.append(0)
+                i += 1
+                j = i
+            else:
+                flag = False
+                if(len(o_data[i:j])<=num_steps):
+                    waverage_list.append(0)
+                else:
+                    w_average = 0
+                    weights = 0
+                    jj = 0
+                    local = o_data[i:j-num_steps]
+                    for k in local:
+                        w_average += k * math.exp(-(len(local) - jj - 1)/Lamd)
+                        weights +=  math.exp(-(len(local) - jj - 1)/Lamd)
+                        jj += 1
+                    if (weights==0):
+                        w_average = 0
+                    else:
+                        w_average = w_average/weights
+                    waverage_list.append(w_average)
+                j += 1
+            prediction[pkg] = waverage_list[:]
+    return prediction
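The exponential weights above favour recent months; a small numeric illustration, assuming Lamd = 4 and a local history of length 3:

    # weights on the oldest .. newest observations:
    #   exp(-2/4), exp(-1/4), exp(-0/4)  ≈  0.61, 0.78, 1.00
    # so the most recent month counts most and older months decay at rate 1/Lamd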
+
+def train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler):
+
+    model.set_weights(Wsave)
+    ## Set the initial weights - remove if we want one model for all the packages
+    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
+    train, test = data[0:train_size], data[train_size:len(data)]
+    if (len(original_data) != len(data)):
+        return(1)
+    print('Training and test size in months: ', len(train), ' ; ', len(test))
+
+    trainX, trainY = create_dataset(train_original, train, num_steps, look_back)
+    testX, testY = create_dataset(test_original, test, num_steps, look_back)
+    
+    # reshape input to be [samples, time steps, features]
+    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
+    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
+
+    print(len(trainX), len(testX))
+    print(len(trainY), len(testY))
+    
+    trainY.reshape(-1,1)
+    testY.reshape(-1,1)
+    
+    fig = plt.gcf()
+    fig.clf()
+    fig.show()
+    fig.canvas.draw()
+
+    training_steps = 400*25
+    trainerror = []
+    valerror=[]
+    
+    for j in range (training_steps):
+        model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
+        model.reset_states()
+        if(j%25==0):
+            calc_errors(model, batch_num, original_data, data, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, [])
+            plt.plot(trainerror, color='blue')
+            plt.plot(valerror, color='red')
+            plt.title(pkg_name)
+            fig.canvas.draw()
+    try:
+        model.save('./models/' + pkg_name + '-' + str(num_steps) + 'smoothing' + '.h5')
+        #model.save('./models/all_packages'+str(num_steps) + '.h5')
+        del model
+    except OSError:
+        model.save('./models/unknown-' + str(num_steps) + '.h5')
+        del model
+
+def train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg_name, scaler, extra_features):
+
+    train_original, test_original = original_data[0:train_size], original_data[train_size:len(original_data)]
+    train, test = data[0:train_size], data[train_size:len(data)]
+    if (len(original_data) != len(data)):
+        return(1)
+    print('Training and test size in months: ', len(train), ' ; ', len(test))
+
+    trainX, trainY = create_dataset(train_original, train, num_steps, look_back, feat_num, extra_features)
+    testX, testY = create_dataset(test_original, test, num_steps, look_back, feat_num, extra_features)
+    
+    # reshape input to be [samples, time steps, features]
+    trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], feat_num))
+    testX = np.reshape(testX, (testX.shape[0], testX.shape[1], feat_num))
+
+    trainY.reshape(-1,1)
+    testY.reshape(-1,1)
+    
+    trainerror = []
+    valerror = []
+    
+    model.fit(trainX, trainY, epochs=1, batch_size=len(trainX), verbose=2, shuffle=False)
+    model.reset_states()
+
+    return 0
+
+def calc_errors(model, batch_num, original_dataset, dataset, num_steps, look_back, test_size, pkg_name, trainerror, valerror, scaler, feat_num, extra_feature):
+
+    totalX, totalY = create_dataset(original_dataset, dataset, num_steps, look_back, feat_num, extra_feature)
+
+    totalX = totalX[len(totalX)-batch_num:]
+    totalY = totalY[len(totalY)-batch_num:]
+
+    model.reset_states()
+    totalPredict = model.predict(totalX, batch_size = batch_num)
+
+    totalPredict = scaler.inverse_transform(totalPredict)
+    totalPredict = totalPredict.flatten()
+    
+    totalY = totalY.reshape(-1,1)
+    totalY = scaler.inverse_transform(totalY)
+    totalY = totalY.flatten()
+    
+    trainerror.append(mean_squared_error(totalPredict[50:-test_size], totalY[50:-test_size]))
+    valerror.append(mean_squared_error(totalPredict[-test_size:], totalY[-test_size:]))
+
+    return 0
+
+
+def calc_errors_all(model, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features):
+    temp1 = 0
+    temp2 = 0
+    train_temp = []
+    test_temp = []
+
+    for pkg in data_in:
+        scaler = scalers[pkg]
+        calc_errors(model, batch_num, original_data_in[pkg], data_in[pkg], num_steps, look_back, test_size, pkg, train_temp, test_temp, scaler, feat_num, extra_features[pkg])
+
+    for errors in train_temp:
+        temp1 += errors ** 2
+    
+    for errors in test_temp:
+        temp2 += errors ** 2
+
+    trainerror.append(math.sqrt(temp1)/len(data_in))
+    valerror.append(math.sqrt(temp2)/len(data_in))
+
+    return 0
+
+def predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train):
+    # Model parameters
+    # Do testing?
+    do_test = True
+    # Number of different models to train for
+    models_num = 5
+    # Look back steps
+    look_back = 9
+    # Number of lstm neurons
+    num_neurons = look_back
+    num_neurons2 = look_back
+    # in case we want to add more features in the future
+    feat_num = 1
+    
+    data_size = len(dataset[first])
+    print(data_size)
+    train_size = int(data_size - test_size)
+    batch_num = train_size - num_steps - look_back + 1
+    print("batch_num: ", batch_num)
+
+    ## Create the NN with Keras
+    model = Sequential()
+    #model.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.4, stateful=True, return_sequences=True))
+    model.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.5, stateful=True))
+    #model.add(LSTM(num_neurons2, activation ='relu', dropout=0.4, stateful=True))
+    model.add(Dense(1))
+    Adam = opt.Adam(lr=0.001)
+    model.compile(loss='mean_squared_error', optimizer=Adam)
+    Wsave = model.get_weights()
+
+
+    pred_LSTM = dict()
+    data_in = dict()
+    original_data_in = dict()
+    scalers = dict()
+    
+    ## Let's start preparing our data
+    for pkg in dataset:
+        data = dataset[pkg]
+        data = data.reshape(-1,1)
+        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
+        scalers[pkg].fit(data)
+        ## We use a separate scaler for each package
+        data = scalers[pkg].transform(data)
+        data = data.flatten()
+        original_data = original[pkg]
+        original_data = original_data.reshape(-1,1)
+        original_data = scalers[pkg].transform(original_data)
+        original_data = original_data.flatten()
+        data_in[pkg] = data
+        original_data_in[pkg] = original_data
+
+        ## Compute the total number of reported vulnerabilities in case we need it later
+        total_sum = sum(original[pkg])
+
+    ## Let's start with the training - if we want to train ofc...
+    if do_train:
+        ## Just a test to have one LSTM for all packages
+        for i in range(1):
+            for pkg in dataset:
+                ## random selection for mixed training
+                ## CHANGE for dedicated models
+                #pkg = random.choice(list(dataset.keys()))
+                data = data_in[pkg]
+                original_data = original_data_in[pkg]
+                train_LSTM(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model, Wsave, feat_num, look_back, pkg, scalers[pkg])
+        
+    if do_test:
+        for pkg in dataset:
+            data = data_in[pkg]
+            original_data = original_data_in[pkg]
+            pred_LSTM[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, False)
+    
+    return pred_LSTM
+
+
+def predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train):
+    # Model parameters
+    do_test = True
+    # Look back steps
+    look_back = 9
+    feat_num = 1+len(dataset)
+    # Number of lstm neurons
+    num_neurons = feat_num + 10
+    num_neurons2 = look_back
+    # in case we want to add more features in the future
+    extra_features = dict()
+    ## Training steps
+    training_steps = 600*25
+
+    ## Use one extra feature to signal package identity
+    i = 0
+    for pkg in dataset:
+        extra_features[pkg] = np.asarray([0]*i + [1] + [0]*(len(dataset)-i-1))
+        extra_features[pkg].reshape(-1,1)
+
+
+
+
+    
+    data_size = len(dataset[first])
+    print(data_size)
+    train_size = int(data_size - test_size)
+    batch_num = train_size - num_steps - look_back + 1
+    print("batch_num: ", batch_num)
+
+    ## Create the NN with Keras
+    model2 = Sequential()
+    #model2.add(Dense(units=num_neurons, activation='relu', batch_input_shape=(batch_num, look_back,feat_num)))
+    model2.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.4, stateful=True, return_sequences=True))
+    #model2.add(LSTM(num_neurons, batch_input_shape = (batch_num, look_back, feat_num) , activation ='relu', dropout=0.1, stateful=True))
+    model2.add(LSTM(num_neurons, activation ='relu', dropout=0.1, stateful=True))
+    #model2.add(Dense(num_neurons))
+    model2.add(Dense(1))
+    Adam = opt.Adam(lr=0.001)
+    model2.compile(loss='mean_squared_error', optimizer=Adam)
+    Wsave = model2.get_weights()
+
+
+    pred_LSTM_all = dict()
+    data_in = dict()
+    original_data_in = dict()
+    scalers = dict()
+    
+    ## Let's start preparing our data
+    for pkg in dataset:
+        data = dataset[pkg]
+        data = data.reshape(-1,1)
+        scalers[pkg] = MinMaxScaler(feature_range=(0, 1))
+        scalers[pkg].fit(data)
+        ## We use a separate scaler for each package
+        data = scalers[pkg].transform(data)
+        data = data.flatten()
+        original_data = original[pkg]
+        original_data = original_data.reshape(-1,1)
+        original_data = scalers[pkg].transform(original_data)
+        original_data = original_data.flatten()
+        data_in[pkg] = data
+        original_data_in[pkg] = original_data
+
+        ## Compute the total number of reported vulnerabilities in case we need it later
+        total_sum = sum(original[pkg])
+
+    ## Let's start with the training - if we want to train ofc...
+    if do_train:
+        ## Just a test to have one LSTM for all packages
+        fig = plt.gcf()
+        fig.clf()
+        fig.show()
+        fig.canvas.draw()
+        trainerror = []
+        valerror=[]
+        for i in range(training_steps):
+            ## random selection for mixed training
+            pkg = random.choice(list(dataset.keys()))
+            data = data_in[pkg]
+            original_data = original_data_in[pkg]
+            train_LSTM_all(original_data, data, num_steps, data_size, train_size, test_size, batch_num, model2, Wsave, feat_num, look_back, pkg, scalers[pkg], extra_features[pkg])
+            if(i%25==0):
+                calc_errors_all(model2, batch_num, original_data_in, data_in, num_steps, look_back, test_size, trainerror, valerror, scalers, feat_num, extra_features)
+                plt.plot(trainerror, color='blue')
+                plt.plot(valerror, color='red')
+                plt.title('Mixed')
+                fig.canvas.draw()
+        try:
+            model2.save('./models/all_packages'+str(num_steps) + '.h5')
+            del model2
+        except OSError:
+            model2.save('./models/unknown-' + str(num_steps) + '.h5')
+        
+    if do_test:
+        for pkg in dataset:
+            data = data_in[pkg]
+            original_data = original_data_in[pkg]
+            pred_LSTM_all[pkg] = test_model(pkg, original_data, data, scalers[pkg], num_steps, smoothing, batch_num, look_back, test_size, True, feat_num, extra_features[pkg])
+
+    return pred_LSTM_all
+    
+
+def load_dataset_smoothed(src2month, pkglist, smoothing):
+    temp = dict()
+    original = dict()
+    for pkg in pkglist:
+        temp1 = np.asarray(src2month[pkg])
+        temp2 = temp1
+        # Smooth the time-series with a moving average
+        temp1 = moving_average(temp1, n=smoothing)
+        temp1 = temp1[smoothing:]
+        temp2 = temp2[smoothing:]
+        ## Cut off leading part
+        temp[pkg] = temp1
+        original[pkg] = temp2
+    return (original, temp)
+
+
+def print_all(data, pkglist):
+    plt.figure(1)
+    i = 1
+    for pkg in pkglist:
+        plt.subplot(2,5,i)
+        plt.plot(data[pkg], label = pkg)
+        plt.legend()
+        i += 1
+    plt.show()
+
+def print_pred(data, pred):
+    plt.plot(data, color='blue', label='reality')
+    plt.plot(pred, color='red', label='model')
+    plt.legend()
+    plt.show()
+
+def print_all_pred(data, pred, pkglist, test_size):
+    carlosplt.pre_paper_plot(True)
+    plt.figure(1)
+    i = 1
+
+    ## Build x axis
+    quartersx = []
+
+    for y in range(2,19):
+        start = 1
+        end = 5
+        if y == 2:
+            start = 2
+        elif y == 19:
+            end = 2
+        for j in range(start,end):
+            if j==1 and y%2==1:
+                quartersx.append('\''+str(y).zfill(2))
+            else:
+                quartersx.append(' ')
+
+    for pkg in pkglist:
+        data_local = data[pkg]
+        data_local = data_local.flatten()
+        ax = plt.subplot(1,5,i)
+        data_train = data_local[:-test_size]
+        data_test = data_local[-test_size:]
+        if pkg == 'firefox-esr':
+            pkg_label = 'firefox'
+        elif pkg == 'chromium-browser':
+            pkg_label = 'chromium'
+        elif pkg == 'openjdk-8':
+            pkg_label = 'openjdk'
+        else:
+            pkg_label = pkg
+        x_axis_train = []
+        x_axis_test = []
+        for j in range(len(data_train)):
+            x_axis_train.append(j)
+        for j in range(len(data_test)):
+            x_axis_test.append(j+len(data_train))
+        x_axis_all = x_axis_train + x_axis_test
+        
+        if i==5:
+            train=ax.plot(x_axis_train, data_train, color = 'grey', label='real-tr')
+            test=ax.plot(x_axis_test, data_test, color = 'coral', label = 'real-te')
+            model=ax.plot(x_axis_all,pred[pkg], color ='blue', label='model')
+            ax.legend()
+        else:
+            ax.plot(x_axis_train, data_train, color = 'grey')
+            ax.plot(x_axis_test, data_test, color = 'coral')
+            ax.plot(x_axis_all,pred[pkg], color ='blue')
+        ax.spines['right'].set_visible(False)
+        ax.spines['top'].set_visible(False)
+        ax.set_title(pkg_label)
+        plt.xticks(np.arange(1,len(pred[pkg]),3.0)+1,quartersx, rotation="vertical")
+        i += 1
+    carlosplt.post_paper_plot(True,True,True)
+    plt.show()
+
+def normalize_errors(errors_list, pkglist, dataset, test_size):
+    for pkg in errors_list:
+        maximum = np.amax(dataset[pkg][-test_size:])
+        minimum = np.amin(dataset[pkg][-test_size:])
+        norm = maximum-minimum
+        if norm<0.1:
+            norm = 0.1
+        for d in [errors_list]:
+            d[pkg] = d[pkg]/norm
+
+def compute_errors_perpackage(prediction, dataset, test_size):
+    temp_errors = dict()
+    for pkg in prediction:
+        temp_errors[pkg] = math.sqrt(mean_squared_error(prediction[pkg][-test_size:], dataset[pkg][-test_size:]))
+    return temp_errors
+
+def compute_errors_train_perpackage(prediction, dataset, test_size):
+    temp_errors = dict()
+    for pkg in prediction:
+        temp_errors[pkg] = math.sqrt(mean_squared_error(prediction[pkg][:-test_size], dataset[pkg][:-test_size]))
+    return temp_errors
+
+def find_best_Lamd(original, dataset, num_steps, test_size):
+    pred_Waverage_temp = dict()
+    pred_Waverage = dict()
+    errors = dict()
+    best_errors = dict()
+    best_lamdas = dict()
+    dataset_temp = dict()
+
+    pred_Waverage = predict_Waverage(original, dataset, 1, num_steps)
+    errors[1] = compute_errors_train_perpackage(pred_Waverage, dataset, test_size)
+    best_errors = errors[1]
+    
+    for pkg in dataset:
+        best_lamdas[pkg] = 1
+    
+    for Lamd in range(1,100):
+        pred_Waverage_temp = predict_Waverage(original, dataset, Lamd, num_steps)
+        ## To compute best lamda
+        errors[Lamd] = compute_errors_train_perpackage(pred_Waverage_temp, dataset, test_size)
+        for pkg in pred_Waverage_temp:
+            print(errors[Lamd][pkg])
+            if errors[Lamd][pkg] < best_errors[pkg]:
+                best_errors[pkg] = errors[Lamd][pkg]
+                best_lamdas[pkg] = Lamd
+
+    for pkg in dataset:
+        pred_Waverage[pkg] = predict_Waverage(original, dataset, best_lamdas[pkg], num_steps)[pkg]
+
+    print(best_lamdas)
+
+
+    return pred_Waverage
+
+def do_training_errors(predictions_list, errors, dataset, test_size):
+    ## First for each package
+
+    for method in predictions_list:
+        temp_dict = dict()
+        try:
+            temp_dict = compute_errors_train_perpackage(predictions_list[method], dataset, test_size)
+            errors[method] = temp_dict
+        except:
+            print('Predictions missing')
+
+
+def do_testing_errors(predictions_list, errors, dataset, test_size):
+    
+    for method in predictions_list:
+        temp_dict = dict()
+        try:
+            temp_dict = compute_errors_perpackage(predictions_list[method], dataset, test_size)
+            errors[method] = temp_dict
+        except:
+            print('Predictions missing')
+
+def calculate_rmse(errors):
+    temp = 0
+    for pkg in errors:
+        temp += errors[pkg]*errors[pkg]
+
+    temp = math.sqrt(temp)/len(errors)
+
+    return temp
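For reference, the aggregate figure computed above is the square root of the sum of squared per-package errors, divided by the number of packages. For example, per-package errors of 3 and 4 over two packages give:

    # sqrt(3**2 + 4**2) / 2  ==  5 / 2  ==  2.5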
+
+def calculate_mean(errors):
+    temp = 0
+    for pkg in errors:
+        temp += errors[pkg]
+
+    temp = temp/len(errors)
+
+    return temp
+
+def print_summary(training_errors, testing_errors):
+    print('#'*80)
+    print('***** REPORT *****')
+    for method in training_errors:
+        print(method)
+        print('Training Errors rmse: ', '%.3f' % training_errors[method]['rmse'])
+        print('Testing Errors rmse: ', '%.3f' % testing_errors[method]['rmse'])
+        print('Training Errors mean: ', '%.3f' %training_errors[method]['mean'])
+        print('Testing Errors mean: ', '%.3f' %testing_errors[method]['mean'])
+        print('#'*80)
+    
+    return 0
+
+def predict(src2month, k):
+
+    pkglist=[]
+
+    for pkg in src2month:
+        if (sum(src2month[pkg])>50):
+            pkglist.append(pkg)
+
+    
+    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'wireshark', 'openjdk-8', 'mysql-transitional', 'php7.0', 'imagemagick', 'tcpdump']
+    #pkglist = ['linux', 'firefox-esr', 'chromium-browser', 'icedove', 'openjdk-8']
+    pkglist = ['icedove']
+    first = pkglist[0]
+    #pkglist = [first]
+    # Number of months in the future
+    num_steps = 9
+    smoothing = num_steps
+    # Test dataset size in months
+    test_size = 18
+    real_test_size = 18
+    do_train = False
+
+    dataset = dict()
+    # Cut out end of 2018
+    for pkg in pkglist:
+        ## This is for training
+        if do_train:
+            src2month[pkg] = src2month[pkg][:-9-real_test_size]
+        ## This is for experiments
+        else:
+            src2month[pkg] = src2month[pkg][test_size:-9]
+
+
+    (original,dataset) = load_dataset_smoothed(src2month, pkglist, smoothing)
+    # Each point of dataset is the mean of the same point and the previous smoothing-1 of the original
+    num_packages = len(pkglist)
+
+    # Print all smoothed time-series
+    #print_all(dataset, pkglist)
+    
+    ## Make simple predictions (stationary, average, waverage)
+    predictions_list = dict()
+    predictions_list['stationary'] = predict_stationary(original, dataset, num_steps)
+    predictions_list['average'] = predict_average(original, dataset, num_steps)
+    predictions_list['Waverage'] = find_best_Lamd(original, dataset, num_steps, test_size)
+    #predictions_list['LSTM'] = predict_LSTM(original, dataset, num_steps, test_size, smoothing, first, do_train)
+    #predictions_list['LSTM_all'] = predict_LSTM_all(original, dataset, num_steps, test_size, smoothing, first, do_train)
+    
+    #print_all_pred(dataset, predictions_list['LSTM'], pkglist, test_size)
+    #pkglist_new=['linux','firefox-esr', 'chromium-browser', 'icedove']
+    pkglist_new = pkglist
+    print_all_pred(dataset, predictions_list['Waverage'], pkglist_new, test_size)
+
+    training_errors = dict()
+    ## Dictionary of training errors e.g. training_errors['LSTM']['linux'] = XXX
+    testing_errors = dict()
+    ## Same for testing errors
+
+    new_predictions_list = dict()
+
+    ## For which packages to compute the error?
+    for method in predictions_list:
+        new_predictions_list[method] = dict()
+        for pkg in predictions_list[method]:
+            if (sum(src2month[pkg])>200):
+                new_predictions_list[method][pkg] = predictions_list[method][pkg]
+
+
+    print(new_predictions_list)
+    
+    do_training_errors(new_predictions_list, training_errors, dataset, test_size)
+    do_testing_errors(new_predictions_list, testing_errors, dataset, test_size)
+
+    ## Now among the packages again rmse. But first we normalize. Choose whether we want this or not
+
+    for method in training_errors:
+        normalize_errors(training_errors[method], pkglist, dataset, test_size)
+        normalize_errors(testing_errors[method], pkglist, dataset, test_size)
+
+    for pkg in training_errors['average']:
+        print('#'*80)
+        print(pkg)
+        print('Training errors:')
+        temp_list = []
+        for method in training_errors:
+            string = method + ': ' + str(training_errors[method][pkg]) + ' , '
+            temp_list.append(string)
+
+        print(temp_list)
+        
+        temp_list = []
+        for method in training_errors:
+            string = method + ': ' + str(testing_errors[method][pkg])
+            temp_list.append(string)
+
+        print('Testing errors:')
+        print(temp_list)
+
+    ## Now it is time for the rmse among the packages
+    for method in testing_errors:
+        testing_errors[method]['rmse'] = calculate_rmse(testing_errors[method])
+        testing_errors[method]['mean'] = calculate_mean(testing_errors[method])
+        training_errors[method]['rmse'] = calculate_rmse(training_errors[method])
+        training_errors[method]['mean'] = calculate_mean(training_errors[method])
+
+    
+    print_summary(training_errors, testing_errors)
+    
+    return
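As wired up in apt-sec.py above, predict() takes the src2month dictionary (source package name mapped to its list of monthly vulnerability counts since 2000) and, with do_train left False, smooths each series with a 9-month moving average, builds the stationary, average, and weighted-average forecasts, plots them, and prints the normalized training and testing error report. A minimal usage sketch, assuming src2month has already been loaded from the cache (note that the hard-coded pkglist above currently restricts the run to 'icedove'):

    import prediction as pred
    pred.predict(src2month, 0)   # the second argument is not used in the body above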