Browse Source

Degree Statistics Rework

Marcel 6 years ago
parent
commit
bdba04b670

+ 117 - 205
code/ID2TLib/Statistics.py

@@ -546,106 +546,6 @@ class Statistics:
         else:
             return None
 
-    def get_in_degree(self):
-        """
-        determines the in-degree for each ipAddress, i.e. for every IP the count of ipAddresses it has received packets from
-        :return: a list, each entry consists of one IPAddress and its associated in-degree
-        """
-
-        in_degree_raw = self.stats_db._process_user_defined_query(
-                "SELECT ipAddressA, Count(DISTINCT ipAddressB) FROM ip_ports JOIN conv_statistics_stateless ON ipAddress = ipAddressA WHERE portDirection=\'in\' AND portNumber = portA GROUP BY ipAddress " +
-                "UNION " +
-                "SELECT ipAddressB, Count(DISTINCT ipAddressA) FROM ip_ports JOIN conv_statistics_stateless ON ipAddress = ipAddressB WHERE portDirection=\'in\' AND portNumber = portB GROUP BY ipAddress")
-
-        #Because of the structure of the database, there could be 2 entries for the same IP Address, therefore accumulate their sums
-        in_degree = self.filter_multiples(in_degree_raw)
-
-        return in_degree
-
-    def get_out_degree(self):
-        """
-        determines the out-degree for each ipAddress, i.e. for every IP the count of ipAddresses it has sent packets to
-        :return: a list, each entry consists of one IPAddress and its associated out-degree
-        """
-        
-        out_degree_raw = self.stats_db._process_user_defined_query(
-                "SELECT ipAddressA, Count(DISTINCT ipAddressB) FROM ip_ports JOIN conv_statistics_stateless ON ipAddress = ipAddressA WHERE portDirection=\'out\' AND portNumber = portA GROUP BY ipAddress " +
-                "UNION " +
-                "SELECT ipAddressB, Count(DISTINCT ipAddressA) FROM ip_ports JOIN conv_statistics_stateless ON ipAddress = ipAddressB WHERE portDirection=\'out\' AND portNumber = portB GROUP BY ipAddress")
-
-        #Because of the structure of the database, there could be 2 entries for the same IP Address, therefore accumulate their sums
-        out_degree = self.filter_multiples(out_degree_raw)
-
-        return out_degree
-
-    def get_overall_degree(self):
-        """
-        determines the overall-degree for each ipAddress, i.e. for every IP the count of ipAddresses it has sent packets to
-        :return: a list, each entry consists of one IPAddress and its associated overall-degree
-        """
-
-        out_degrees = self.get_out_degree()
-        in_degrees = self.get_in_degree()
-        overall_degrees = []
-        processed = {} # Dict, taking an IP Address and returning True, if the IP has already been processed and added to overall_degree
-
-        # initialize values of the dict for in_degrees, this is important for error-free checking whether there are not processed IPs
-        # for out_degrees this can be done without an additional loop
-        for inD in in_degrees:
-            processed[inD[0]] = False
-
-        for outD in out_degrees:
-            ip_out = outD[0]
-            processed[ip_out] = False
-
-            # add the sum of degrees for all IPs that appear in both lists
-            for inD in in_degrees:
-                ip_in = inD[0]
-                if ip_out == ip_in:
-                    # same IPAddress -> append sum of degrees
-                    overall_degrees.append((ip_out, outD[1] + inD[1]))
-                    processed[ip_out] = True
-
-            if not processed[ip_out]:
-                # if IP only appears in out_degree list -> just append the value
-                overall_degrees.append(outD)
-                processed[outD[0]] = True
-        
-        # add remaining IPs, which did not appear in out_degree
-        for inD in in_degrees:
-            if not processed[inD[0]]:
-                overall_degrees.append(inD)
-
-        return overall_degrees
-
-    def filter_multiples(self, entries):
-        """
-        helper function, for get_out_degree and get_in_degree
-        filters the given list for duplicate IpAddresses and, if duplciates are present, accumulates their values
-
-        :param entries: list, each entry consists of an ipAddress and a numeric value
-        :return: a filtered list, without duplicate ipAddresses
-        """
-
-        filtered_entries = []
-        done = []
-        for p1 in entries:
-            added = False
-            if p1 in done:
-                continue
-            for p2 in entries:
-                if p1[0] == p2[0] and p1 != p2:
-                    filtered_entries.append((p1[0], p1[1] + p2[1]))
-                    done.append(p1)
-                    done.append(p2)
-                    added = True
-                    break
-
-            if not added:
-                filtered_entries.append(p1)
-
-        return filtered_entries
-
     def get_avg_delay_local_ext(self):
         """
         Calculates the average delay of a packet for external and local communication, based on the tcp handshakes
@@ -692,6 +592,25 @@ class Statistics:
             avg_delay_local = 0.06
         return avg_delay_local, avg_delay_external
 
+    def get_filtered_degree(self, degree_type: str):
+        """
+        gets the desired type of degree statistics from the ip_degrees table and filters out IPs with degree value zero
+
+        :param degree_type: the desired type of degrees, one of the following: inDegree, outDegree, overallDegree
+        :return: a list with the (ipAddress, degree) rows whose degree is greater than zero; empty if the query yields nothing
+        :raises ValueError: if degree_type is not one of the three supported column names
+        """
+
+        # the column name is formatted into the SQL text (identifiers cannot be bound), so only accept known columns
+        if degree_type not in ("inDegree", "outDegree", "overallDegree"):
+            raise ValueError("Unknown degree type: %s" % degree_type)
+
+        degrees_raw = self.stats_db._process_user_defined_query(
+                "SELECT ipAddress, %s FROM ip_degrees" % degree_type)
+
+        # a falsy result (None or empty) is treated as "no rows"
+        return [deg for deg in degrees_raw or [] if int(deg[1]) > 0]
+
     def get_statistics_database(self):
         """
         :return: A reference to the statistics database object
@@ -1083,49 +1002,47 @@ class Statistics:
             plt.gcf().clear()
 
             # retrieve data
-            in_degree = self.get_in_degree()
+            in_degree = self.get_filtered_degree("inDegree")
 
-            if(in_degree):
-                graphx, graphy = [], []
-                for entry in in_degree:
-                    # degree values
-                    graphx.append(entry[1])
-                    # IP labels
-                    graphy.append(entry[0])
+            graphx, graphy = [], []
+            for entry in in_degree:
+                # degree values
+                graphx.append(entry[1])
+                # IP labels
+                graphy.append(entry[0])
 
-                # set labels
-                plt.title("Indegree per IP Address")
-                plt.ylabel('IpAddress')
-                plt.xlabel('Indegree')
+            # set labels
+            plt.title("Indegree per IP Address")
+            plt.ylabel('IpAddress')
+            plt.xlabel('Indegree')
 
-                #set width of the bars
-                width = 0.3
+            #set width of the bars
+            width = 0.3
 
-                # set scalings
-                plt.figure(figsize=(int(len(graphx))/20 + 5, int(len(graphy)/5) + 5))  # these proportions just worked well
+            # set scalings
+            plt.figure(figsize=(int(len(graphx))/20 + 5, int(len(graphy)/5) + 5))  # these proportions just worked well
 
-                #set limits of the axis
-                plt.ylim([0, len(graphy)])
-                plt.xlim([0, max(graphx) + 10])
+            #set limits of the axis
+            plt.ylim([0, len(graphy)])
+            plt.xlim([0, max(graphx) + 10])
 
-                # display numbers at each bar
-                for i, v in enumerate(graphx):
-                    plt.text(v + 1, i + .1, str(v), color='blue', fontweight='bold')
+            # display numbers at each bar
+            for i, v in enumerate(graphx):
+                plt.text(v + 1, i + .1, str(v), color='blue', fontweight='bold')
 
-                # display grid for better visuals
-                plt.grid(True)
+            # display grid for better visuals
+            plt.grid(True)
 
-                # plot the bar
-                labels = graphy
-                graphy = list(range(len(graphx)))
-                plt.barh(graphy, graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
-                plt.yticks(graphy, labels)
-                out = self.pcap_filepath.replace('.pcap', '_in_degree' + file_ending)
-                plt.tight_layout()
-                plt.savefig(out,dpi=500)
-                return out
-            else:
-                print("Error: No statistics Information for plotting out-degrees found")
+            # plot the bar
+            labels = graphy
+            graphy = list(range(len(graphx)))
+            plt.barh(graphy, graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
+            plt.yticks(graphy, labels)
+            out = self.pcap_filepath.replace('.pcap', '_in_degree' + file_ending)
+            plt.tight_layout()
+            plt.savefig(out,dpi=500)
+
+            return out
 
         def plot_out_degree(file_ending: str):
             """
@@ -1138,49 +1055,47 @@ class Statistics:
             plt.gcf().clear()
 
             # retrieve data
-            out_degree = self.get_out_degree()
+            out_degree = self.get_filtered_degree("outDegree")
 
-            if(out_degree):
-                graphx, graphy = [], []
-                for entry in out_degree:
-                    # degree values
-                    graphx.append(entry[1])
-                    # IP labels
-                    graphy.append(entry[0])
+            graphx, graphy = [], []
+            for entry in out_degree:
+                # degree values
+                graphx.append(entry[1])
+                # IP labels
+                graphy.append(entry[0])
 
-                # set labels
-                plt.title("Outdegree per IP Address")
-                plt.ylabel('IpAddress')
-                plt.xlabel('Outdegree')
+            # set labels
+            plt.title("Outdegree per IP Address")
+            plt.ylabel('IpAddress')
+            plt.xlabel('Outdegree')
 
-                #set width of the bars
-                width = 0.3
+            #set width of the bars
+            width = 0.3
 
-                # set scalings
-                plt.figure(figsize=(int(len(graphx))/20 + 5, int(len(graphy)/5) + 5))  # these proportions just worked well
+            # set scalings
+            plt.figure(figsize=(int(len(graphx))/20 + 5, int(len(graphy)/5) + 5))  # these proportions just worked well
 
-                #set limits of the axis
-                plt.ylim([0, len(graphy)])
-                plt.xlim([0, max(graphx) + 10])
+            #set limits of the axis
+            plt.ylim([0, len(graphy)])
+            plt.xlim([0, max(graphx) + 10])
 
-                # display numbers at each bar
-                for i, v in enumerate(graphx):
-                    plt.text(v + 1, i + .1, str(v), color='blue', fontweight='bold')
+            # display numbers at each bar
+            for i, v in enumerate(graphx):
+                plt.text(v + 1, i + .1, str(v), color='blue', fontweight='bold')
 
-                # display grid for better visuals
-                plt.grid(True)
+            # display grid for better visuals
+            plt.grid(True)
 
-                # plot the bar
-                labels = graphy
-                graphy = list(range(len(graphx)))
-                plt.barh(graphy, graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
-                plt.yticks(graphy, labels)
-                out = self.pcap_filepath.replace('.pcap', '_out_degree' + file_ending)
-                plt.tight_layout()
-                plt.savefig(out,dpi=500)
-                return out
-            else:
-                print("Error: No statistics Information for plotting out-degrees found")
+            # plot the bar
+            labels = graphy
+            graphy = list(range(len(graphx)))
+            plt.barh(graphy, graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
+            plt.yticks(graphy, labels)
+            out = self.pcap_filepath.replace('.pcap', '_out_degree' + file_ending)
+            plt.tight_layout()
+            plt.savefig(out,dpi=500)
+
+            return out
 
         def plot_overall_degree(file_ending: str):
             """
@@ -1193,49 +1108,46 @@ class Statistics:
             plt.gcf().clear()
 
             # retrieve data
-            overall_degree = self.get_overall_degree()
+            overall_degree = self.get_filtered_degree("overallDegree")
 
-            if(overall_degree):
-                graphx, graphy = [], []
-                for entry in overall_degree:
-                    # degree values
-                    graphx.append(entry[1])
-                    # IP labels
-                    graphy.append(entry[0])
+            graphx, graphy = [], []
+            for entry in overall_degree:
+                # degree values
+                graphx.append(entry[1])
+                # IP labels
+                graphy.append(entry[0])
 
-                # set labels
-                plt.title("Overalldegree per IP Address")
-                plt.ylabel('IpAddress')
-                plt.xlabel('Overalldegree')
+            # set labels
+            plt.title("Overalldegree per IP Address")
+            plt.ylabel('IpAddress')
+            plt.xlabel('Overalldegree')
 
-                #set width of the bars
-                width = 0.3
+            #set width of the bars
+            width = 0.3
 
-                # set scalings
-                plt.figure(figsize=(int(len(graphx))/20 + 5, int(len(graphy)/5) + 5))  # these proportions just worked well
+            # set scalings
+            plt.figure(figsize=(int(len(graphx))/20 + 5, int(len(graphy)/5) + 5))  # these proportions just worked well
 
-                #set limits of the axis
-                plt.ylim([0, len(graphy)])
-                plt.xlim([0, max(graphx) + 10])
+            #set limits of the axis
+            plt.ylim([0, len(graphy)])
+            plt.xlim([0, max(graphx) + 10])
 
-                # display numbers at each bar
-                for i, v in enumerate(graphx):
-                    plt.text(v + 1, i + .1, str(v), color='blue', fontweight='bold')
+            # display numbers at each bar
+            for i, v in enumerate(graphx):
+                plt.text(v + 1, i + .1, str(v), color='blue', fontweight='bold')
 
-                # display grid for better visuals
-                plt.grid(True)
+            # display grid for better visuals
+            plt.grid(True)
 
-                # plot the bar
-                labels = graphy
-                graphy = list(range(len(graphx)))
-                plt.barh(graphy, graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
-                plt.yticks(graphy, labels)
-                out = self.pcap_filepath.replace('.pcap', '_overall_degree' + file_ending)
-                plt.tight_layout()
-                plt.savefig(out,dpi=500)
-                return out
-            else:
-                print("Error: No statistics Information for plotting overall-degrees found")
+            # plot the bar
+            labels = graphy
+            graphy = list(range(len(graphx)))
+            plt.barh(graphy, graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
+            plt.yticks(graphy, labels)
+            out = self.pcap_filepath.replace('.pcap', '_overall_degree' + file_ending)
+            plt.tight_layout()
+            plt.savefig(out,dpi=500)
+            return out
 
         def plot_big_comm_interval_stat(attr:str, table:str, title:str, xlabel:str, suffix:str):
             """

+ 19 - 0
code_boost/src/cxx/statistics.cpp

@@ -488,6 +488,24 @@ void statistics::addIpStat_packetSent(std::string filePath, std::string ipAddres
     ip_statistics[ipAddressReceiver].kbytes_received += (float(bytesSent) / 1024);
     ip_statistics[ipAddressReceiver].pkts_received++;
     ip_statistics[ipAddressReceiver].pkts_received_timestamp.push_back(timestamp);
+
+    // Increment Degrees for sender and receiver, if Sender sends its first packet to this receiver
+    std::vector<std::string>::iterator found_receiver = std::find(contacted_ips[ipAddressSender].begin(), contacted_ips[ipAddressSender].end(), ipAddressReceiver);
+    if(found_receiver == contacted_ips[ipAddressSender].end()){
+        // Receiver is NOT contained in the List of IPs, that the Sender has contacted, therefore this is the first packet in this direction
+        ip_statistics[ipAddressSender].out_degree++;
+        ip_statistics[ipAddressReceiver].in_degree++;
+
+        // Increment overall_degree only if this is the first packet for the connection (both directions)
+        // Therefore check, whether Receiver has contacted Sender before
+        std::vector<std::string>::iterator sender_contacted = std::find(contacted_ips[ipAddressReceiver].begin(), contacted_ips[ipAddressReceiver].end(), ipAddressSender);
+        if(sender_contacted == contacted_ips[ipAddressReceiver].end()){
+            ip_statistics[ipAddressSender].overall_degree++;
+            ip_statistics[ipAddressReceiver].overall_degree++;
+        }  
+
+        contacted_ips[ipAddressSender].push_back(ipAddressReceiver);
+    }
 }
 
 /**
@@ -714,6 +732,7 @@ void statistics::writeToDatabase(std::string database_path) {
         db.writeStatisticsIP(ip_statistics);
         db.writeStatisticsTTL(ttl_distribution);
         db.writeStatisticsIpMac(ip_mac_mapping);
+        db.writeStatisticsDegree(ip_statistics);
         db.writeStatisticsPorts(ip_ports);
         db.writeStatisticsProtocols(protocol_distribution);
         db.writeStatisticsMSS(mss_distribution);

+ 6 - 0
code_boost/src/cxx/statistics.h

@@ -150,6 +150,9 @@ struct entry_ipStat {
     float kbytes_received;
     float kbytes_sent;
     std::string ip_class;
+    int in_degree;
+    int out_degree;
+    int overall_degree;
     // Collects statstics over time interval
     std::vector<float> interval_pkt_rate;
     float max_interval_pkt_rate;
@@ -565,6 +568,9 @@ private:
     // {IP Address, Protocol, count}
     std::unordered_map<ipAddress_protocol, int> protocol_distribution;
 
+    //{IP Address, contacted IP Addresses}
+    std::unordered_map<std::string, std::vector<std::string>> contacted_ips;
+
     // {IP Address,  #received packets, #sent packets, Data received in kbytes, Data sent in kbytes}
     std::unordered_map<std::string, entry_ipStat> ip_statistics;
 

+ 33 - 0
code_boost/src/cxx/statistics_db.cpp

@@ -57,6 +57,39 @@ void statistics_db::writeStatisticsIP(std::unordered_map<std::string, entry_ipSt
     }
 }
 
+/**
+ * Writes the in-, out- and overall degree of every IP address into the table ip_degrees (dropped and recreated first).
+ * @param ipStatistics The IP statistics from class statistics. Degree Statistics are supposed to be integrated into the ip_statistics table later on,
+ *        therefore they use the same parameter. But for now they are inserted into their own table.
+ */
+void statistics_db::writeStatisticsDegree(std::unordered_map<std::string, entry_ipStat> ipStatistics){
+    try {
+        db->exec("DROP TABLE IF EXISTS ip_degrees");
+        SQLite::Transaction transaction(*db);
+        const char *createTable = "CREATE TABLE ip_degrees ( "
+                "ipAddress TEXT, "
+                "inDegree INTEGER, "
+                "outDegree INTEGER, "
+                "overallDegree INTEGER, "
+                "PRIMARY KEY(ipAddress));";
+        db->exec(createTable);
+        SQLite::Statement query(*db, "INSERT INTO ip_degrees VALUES (?, ?, ?, ?)");
+        for (auto it = ipStatistics.begin(); it != ipStatistics.end(); ++it) {
+            const entry_ipStat &e = it->second;  // reference: a copy would duplicate the entry's vector members for every IP
+            query.bind(1, it->first);
+            query.bind(2, e.in_degree);
+            query.bind(3, e.out_degree);
+            query.bind(4, e.overall_degree);
+            query.exec();
+            query.reset();
+        }
+        transaction.commit();
+    }
+    catch (std::exception &e) {
+        std::cout << "Exception in statistics_db: " << e.what() << std::endl;  // reported on stdout, not rethrown
+    }
+}
+
 /**
  * Writes the TTL distribution into the database.
  * @param ttlDistribution The TTL distribution from class statistics.

+ 2 - 0
code_boost/src/cxx/statistics_db.h

@@ -23,6 +23,8 @@ public:
      */
     void writeStatisticsIP(std::unordered_map<std::string, entry_ipStat> ipStatistics);
 
+    /**
+     * Writes the in-, out- and overall degree of every IP address into the database.
+     * @param ipStatistics The IP statistics from class statistics.
+     */
+    void writeStatisticsDegree(std::unordered_map<std::string, entry_ipStat> ipStatistics);
+
     void writeStatisticsTTL(std::unordered_map<ipAddress_ttl, int> ttlDistribution);
 
     void writeStatisticsMSS(std::unordered_map<ipAddress_mss, int> mssDistribution);