Prechádzať zdrojové kódy

Add plots/statistics

Add plots/statistics for average interval time for one connection
and overall communication time (last-first) for a connection.
Note: requires rebuild
dustin.born 7 rokov pred
rodič
commit
b7b0c8347a

+ 58 - 8
code/ID2TLib/Statistics.py

@@ -5,6 +5,7 @@ import os
 import time
 import ID2TLib.libpcapreader as pr
 import matplotlib
+import numpy
 
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
@@ -597,7 +598,7 @@ class Statistics:
                     filtered_entries.append((p1[0], p1[1] + p2[1]))
                     done.append(p1)
                     done.append(p2)
-                    print("duplicate found:", p1, " and ", p2)
+                    # print("duplicate found:", p1, " and ", p2)
                     added = True
                     break
 
@@ -1168,6 +1169,7 @@ class Statistics:
 
                 # compute plot data
                 for i, row in enumerate(result):
+                    print(row)
                     addr1, addr2 = "%s:%d" % (row[0], row[1]), "%s:%d" % (row[2], row[3])
                     # adjust the justification of strings to improve appearance
                     len_max = max(len(addr1), len(addr2))
@@ -1177,16 +1179,35 @@ class Statistics:
                     graphy.append("%s\n%s" % (addr1, addr2))
                     graphx.append(row[4])
 
-            # compute plot height in inches
-            dist_mult_height, dist_mult_width = 0.55, 0.07  # these values turned out to work well
-            plt_height, plt_width = len(graphy) * dist_mult_height, max(graphx) * dist_mult_width
-            title_distance = 1 + 0.012*52.8/plt_height  # orginally, a good title distance turned out to be 1.012 with a plot height of 52.8
 
             # have x axis and its label appear at the top (instead of bottom)
             fig, ax = plt.subplots()
             ax.xaxis.tick_top()
             ax.xaxis.set_label_position("top")
 
+            # compute plot height in inches for scaling the plot
+            dist_mult_height, dist_mult_width = 0.55, 0.07  # these values turned out to work well
+
+            # use static scale along the conversation axis, if there are too few entries to use dynamic scaling numbers
+            if len(graphy) < 10:
+                plt_height = 7.5
+            # otherwise use the numbers above
+            else:
+                plt_height = len(graphy) * dist_mult_height
+
+            # use static scale along the x axis, if all x values are small (maximum below 200)
+            if max(graphx) < 200:  
+                plt_width = 7.5  # 7.5 as static width worked well
+                if max(graphx) == 0:
+                    ax.set_xlim(0, 10)
+            # otherwise use the numbers above
+            else:
+                plt_width = max(graphx) * dist_mult_width
+
+            title_distance = 1 + 0.012*52.8/plt_height  # originally, a good title distance turned out to be 1.012 with a plot height of 52.8
+
+            plt.gcf().set_size_inches(plt_width, plt_height)  # set plot size
+
             # set additional plot parameters
             plt.title(title, y=title_distance)
             plt.xlabel(xlabel)
@@ -1194,16 +1215,15 @@ class Statistics:
             width = 0.5
             plt.grid(True)
             plt.gca().margins(y=0)  # removes the space between data and x-axis within the plot
-            plt.gcf().set_size_inches(plt_width, plt_height)  # set plot size
 
             # plot the above data, first use plain numbers as graphy to maintain sorting
-            plt.barh(range(len(graphy)), graphx, width, align='center', linewidth=1, color='red', edgecolor='red')
+            plt.barh(range(len(graphy)), graphx, width, align='center', linewidth=0.5, color='red', edgecolor='red')
             # now change the y numbers to the respective address labels
             plt.yticks(range(len(graphy)), graphy)
             # try to use tight layout to cut off unnecessary space
             try:
                 plt.tight_layout(pad=4)
-            except ValueError:
+            except (ValueError, numpy.linalg.linalg.LinAlgError):
                 pass
 
             # save created figure
@@ -1253,6 +1273,34 @@ class Statistics:
             # plot data and return outpath
             return plot_big_comm_interval_stat("avgTimeBetweenIntervals", "comm_interval_statistics", title, 'Average time between intervals', suffix)
 
+        def plot_avg_comm_interval_time(file_ending: str):
+            """
+            Plots the average duration of a communication interval of every connection. 
+
+            :param file_ending: The file extension for the output file containing the plot
+            :return: A filepath to the file containing the created plot
+            """
+
+            title = 'Average duration of a communication interval in seconds'
+            suffix = '_plot-Avg Duration Communication Interval Distribution' + file_ending
+
+            # plot data and return outpath
+            return plot_big_comm_interval_stat("avgIntervalTime", "comm_interval_statistics", title, 'Average interval time', suffix)
+
+        def plot_total_comm_duration(file_ending: str):
+            """
+            Plots the total communication duration of every connection. 
+
+            :param file_ending: The file extension for the output file containing the plot
+            :return: A filepath to the file containing the created plot
+            """
+
+            title = 'Total communication duration in seconds'
+            suffix = '_plot-Total Communication Duration Distribution' + file_ending
+
+            # plot data and return outpath
+            return plot_big_comm_interval_stat("totalCommDuration", "comm_interval_statistics", title, 'Duration', suffix)
+
 
         ttl_out_path = plot_ttl('.' + format)
         mss_out_path = plot_mss('.' + format)
@@ -1274,6 +1322,8 @@ class Statistics:
         plot_in_degree = plot_in_degree('.' + format)
         plot_avg_pkts_per_comm_interval_out = plot_avg_pkts_per_comm_interval('.' + format)
         plot_avg_time_between_comm_interval_out = plot_avg_time_between_comm_interval('.' + format)
+        plot_avg_comm_interval_time_out = plot_avg_comm_interval_time("." + format)
+        plot_total_comm_duration_out = plot_total_comm_duration("." + format)
 
         ## Time consuming plot
         # port_out_path = plot_port('.' + format)

+ 9 - 2
code_boost/src/cxx/statistics.cpp

@@ -289,6 +289,7 @@ void statistics::addConvStatStateless(std::string ipAddressSender,int sport,std:
  * @param dport The destination port.
  * @param timestamp The timestamp of the packet.
  */
+
 void statistics::addCommInterval(std::string ipAddressSender,int sport,std::string ipAddressReceiver,int dport, std::chrono::microseconds timestamp){
     conv f1 = {ipAddressReceiver, dport, ipAddressSender, sport};
     conv f2 = {ipAddressSender, sport, ipAddressReceiver, dport};
@@ -328,6 +329,7 @@ void statistics::addCommInterval(std::string ipAddressSender,int sport,std::stri
 /**
  * Aggregate the collected information about all communication intervals of every conversation.
  * Do this by computing the average packet rate per interval and the average time between intervals.
+ * Also compute average interval duration and total communication duration (i.e. last_msg.time - first_msg.time)
  * Note: here and within the function, conversation refers to a stateless conversation.
  */
 void statistics::createCommIntervalStats(){    
@@ -338,22 +340,27 @@ void statistics::createCommIntervalStats(){
 
         // if there is only one interval, the time between intervals cannot be computed and is therefore set to 0
         if (intervals.size() == 1){
-            entry_commIntervalStat e = {(double) intervals[0].pkts_count, (double) 0};
+            double interval_duration = (double) (intervals[0].end - intervals[0].start).count() / (double) 1e6;
+            entry_commIntervalStat e = {(double) intervals[0].pkts_count, (double) 0, interval_duration, interval_duration};
             comm_interval_statistics[cur_conv] = e;
         }
         // If there is more than one interval, compute the specified averages
         else if (intervals.size() > 1){
             long summed_pkts_count = intervals[0].pkts_count;
             std::chrono::microseconds time_between_ints_sum = (std::chrono::microseconds) 0;
+            std::chrono::microseconds summed_int_duration = intervals[0].end - intervals[0].start;
 
             for (int i = 1; i < intervals.size(); i++) {
                 summed_pkts_count += intervals[i].pkts_count;
+                summed_int_duration += intervals[i].end - intervals[i].start;
                 time_between_ints_sum += intervals[i].start - intervals[i - 1].end;
             }
 
             double avg_pkts_count = summed_pkts_count / ((double) intervals.size());
             double avg_time_betw_ints = (time_between_ints_sum.count() / (double) (intervals.size() - 1)) / (double) 1e6;
-            entry_commIntervalStat e = {avg_pkts_count, avg_time_betw_ints};
+            double avg_interval_time = (summed_int_duration.count() / (double) intervals.size()) / (double) 1e6;
+            double total_comm_duration = (double) (intervals.back().end - intervals.front().start).count() / (double) 1e6;
+            entry_commIntervalStat e = {avg_pkts_count, avg_time_betw_ints, avg_interval_time, total_comm_duration};
             comm_interval_statistics[cur_conv] = e;
         }
     }

+ 8 - 1
code_boost/src/cxx/statistics.h

@@ -287,14 +287,21 @@ struct commInterval{
  * Struct used to represent for the communication intervals of two hosts:
  * - Average time between intervals
  * - The average count of packets within an interval
+ * - The average duration of a communication interval
+ * - The overall communication time, i.e. last_msg.time - first_msg.time 
+ * Note: total_comm_duration != sum of all interval durations
  */
 struct entry_commIntervalStat{
     double avg_pkts_count;
     double avg_time_between;
+    double avg_interval_time;
+    double total_comm_duration;
 
     bool operator==(const entry_commIntervalStat &other) const {
         return avg_pkts_count == other.avg_pkts_count
-               && avg_time_between == other.avg_time_between;
+               && avg_time_between == other.avg_time_between
+               && avg_interval_time == other.avg_interval_time
+               && total_comm_duration == other.total_comm_duration;
     }    
 };
 

+ 5 - 1
code_boost/src/cxx/statistics_db.cpp

@@ -538,9 +538,11 @@ void statistics_db::writeCommIntervalStats(std::unordered_map<conv, entry_commIn
                 "portB INTEGER,"
                 "avgPktCount REAL,"
                 "avgTimeBetweenIntervals REAL,"
+                "avgIntervalTime REAL,"
+                "totalCommDuration REAL,"
                 "PRIMARY KEY(ipAddressA,portA,ipAddressB,portB));";
         db->exec(createTable);
-        SQLite::Statement query(*db, "INSERT INTO comm_interval_statistics VALUES (?, ?, ?, ?, ?, ?)");
+        SQLite::Statement query(*db, "INSERT INTO comm_interval_statistics VALUES (?, ?, ?, ?, ?, ?, ?, ?)");
 
         // iterate over every conversation and interval aggregation pair and store the respective values in the database
         for (auto it = commIntervalStatistics.begin(); it != commIntervalStatistics.end(); ++it) {
@@ -553,6 +555,8 @@ void statistics_db::writeCommIntervalStats(std::unordered_map<conv, entry_commIn
                 query.bind(4, f.portB);
                 query.bind(5, e.avg_pkts_count);
                 query.bind(6, e.avg_time_between);
+                query.bind(7, e.avg_interval_time);
+                query.bind(8, e.total_comm_duration);
                 
                 query.exec();
                 query.reset();