Browse Source

Refactor CSV and XML parsing

Move the parsing of input CSV files to C++, because Python took
way too long. For consistency, also move the parsing of XML files
to C++. Here the speed improvement seems to be almost negligible.
dustin.born 7 years ago
parent
commit
2c8ba4bd5a

+ 25 - 18
code/Attack/MembersMgmtCommAttack.py

@@ -6,11 +6,10 @@ from lea import Lea
 from datetime import datetime
 import os
 
+import ID2TLib.libbotnet as lb
 from Attack import BaseAttack
 from Attack.AttackParameters import Parameter as Param
 from Attack.AttackParameters import ParameterTypes
-# from ID2TLib import PcapFile
-# from ID2TLib.PcapFile import PcapFile
 from ID2TLib.Ports import PortSelectors
 
 class MessageType(Enum):
@@ -470,23 +469,26 @@ class MembersMgmtCommAttack(BaseAttack.BaseAttack):
         filepath_xml = self.get_param_value(Param.FILE_XML)
         filepath_csv = self.get_param_value(Param.FILE_CSV)
 
-        # prefer XML input over CSV input (in case both are given)
+        # use C++ communication processor for faster interval finding
+        cpp_comm_proc = lb.botnet_comm_processor();
+
+        # only use CSV input if the XML path is the default one
+        # --> prefer XML input over CSV input (in case both are given)
         if filepath_csv and filepath_xml == self.DEFAULT_XML_PATH:
-            filepath_xml = FileUtils.parse_csv_to_xml(filepath_csv)
+            filename = os.path.splitext(filepath_csv)[0]
+            cpp_comm_proc.parse_csv(filepath_csv)
+            filepath_xml = cpp_comm_proc.write_xml(filename)
             filepath_xml = move_xml_to_outdir(filepath_xml)
-
-
-        abstract_packets = FileUtils.parse_xml(filepath_xml)
+        else:
+            cpp_comm_proc.parse_xml(filepath_xml)
 
         # find a good communication mapping in the input file that matches the users parameters
         duration = self.get_param_value(Param.ATTACK_DURATION)
         number_init_bots = self.get_param_value(Param.NUMBER_INITIATOR_BOTS)
-        nat = self.get_param_value(Param.NAT_PRESENT)
-        comm_proc = CommunicationProcessor(abstract_packets, self.msg_types, nat)
 
-        comm_intervals = comm_proc.find_interval_most_comm(number_init_bots, duration)
+        comm_intervals = cpp_comm_proc.find_interval(number_init_bots, duration)
         if comm_intervals == []:
-            print("Error: There is no interval in the given CSV/XML that has enough communication initiating bots.")
+            print("Error: There is no interval in the given CSV/XML that has enough communicating initiating bots.")
             return []
         comm_interval = comm_intervals[randrange(0, len(comm_intervals))]
 
@@ -497,8 +499,17 @@ class MembersMgmtCommAttack(BaseAttack.BaseAttack):
             rm_idx = randrange(0, len(mapped_ids))
             del mapped_ids[rm_idx]
 
-        # assign the communication processor this mapping for further processing
-        comm_proc.set_mapping(abstract_packets[packet_start_idx:packet_end_idx+1], mapped_ids)
+        # get the messages contained in the chosen interval
+        abstract_packets = cpp_comm_proc.get_messages(packet_start_idx, packet_end_idx);
+        nat = self.get_param_value(Param.NAT_PRESENT)
+        # create a communication processor responsible for assigning roles and localities of IDs
+        comm_proc = CommunicationProcessor([], self.msg_types, nat)
+        comm_proc.set_mapping(abstract_packets, mapped_ids)
+        # determine ID roles and select the messages that are to be mapped into the PCAP
+        messages = comm_proc.det_id_roles_and_msgs()
+        # use the previously determined roles to assign the locality of all IDs
+        local_ids, external_ids = comm_proc.det_ext_and_local_ids()
+
         # print start and end time of mapped interval
         # print(abstract_packets[packet_start_idx]["Time"])
         # print(abstract_packets[packet_end_idx]["Time"])
@@ -511,15 +522,11 @@ class MembersMgmtCommAttack(BaseAttack.BaseAttack):
         reuse_count_external = int(reuse_percent_total * reuse_percent_external * len(mapped_ids))
         reuse_count_local = int(reuse_percent_total * reuse_percent_local * len(mapped_ids))
 
-        # create locality, IP and MAC configurations for the IDs/Bots
+        # create IP and MAC configurations for the IDs/Bots
         ipgen = Generator.IPGenerator()
         pcapops = PcapAddressOperations(self.statistics)
         router_mac = pcapops.get_probable_router_mac()
         bot_configs = {}
-        # determine the roles of the IDs in the mapping communication-{initiator, responder}
-        local_init_ids, external_init_ids, respnd_ids, messages = comm_proc.det_id_roles_and_msgs()
-        # use these roles to determine which IDs are to be local and which external
-        local_ids, external_ids = comm_proc.det_ext_and_local_ids()
 
         # retrieve and assign the IPs and MACs for the bots with respect to the given parameters
         # (IDs are always added to bot_configs in the same order under a given seed)

+ 3 - 17
code/ID2TLib/CommunicationProcessor.py

@@ -1,7 +1,6 @@
 from lea import Lea
 from Attack.MembersMgmtCommAttack import MessageType
 from Attack.MembersMgmtCommAttack import Message
-import ID2TLib.libbotnet as bcp
 
 # needed because of machine imprecision. E.g. a time difference of 0.1s is stored as >0.1s
 EPS_TOLERANCE = 1e-13  # works for a difference of 0.1, no less
@@ -39,26 +38,13 @@ class CommunicationProcessor():
         self.packets = packets
         self.local_init_ids = set(mapped_ids)
 
-    def find_interval_most_comm(self, number_ids: int, max_int_time: float):
-        botproc = bcp.botnet_comm_processor(self.packets)
-        cpp_intervals = botproc.find_interval(number_ids, max_int_time)
-        intervals = []
-        for cpp_interval in cpp_intervals:
-            ids = []
-            for id_ in cpp_interval[0]:
-                ids.append(str(id_))
-            interval = {"IDs": ids, "Start": cpp_interval[1], "End": cpp_interval[2]}
-            intervals.append(interval)
-
-        return intervals
-
     def det_id_roles_and_msgs(self):
         """
         Determine the role of every mapped ID. The role can be initiator, responder or both.
         On the side also connect corresponding messages together to quickly find out
         which reply belongs to which request and vice versa.
 
-        :return: a triple as (initiator IDs, responder IDs, messages)
+        :return: the selected messages
         """
 
         mtypes = self.mtypes
@@ -144,8 +130,8 @@ class CommunicationProcessor():
         self.external_init_ids = sorted(external_init_ids)
         self.messages = msgs
 
-        # return the retrieved information
-        return self.local_init_ids, self.external_init_ids, self.respnd_ids, self.messages
+        # return the selected messages
+        return self.messages
 
     def det_ext_and_local_ids(self, prob_rspnd_local: int=0):
         """

+ 1 - 1
code_boost/src/CMakeLists.txt

@@ -25,7 +25,7 @@ SET(CMAKE_CXX_STANDARD_REQUIRED ON)
 # Add the library source files
 SET(SOURCE_FILES cxx/pcap_processor.cpp cxx/pcap_processor.h cxx/statistics.cpp cxx/statistics.h cxx/statistics_db.cpp cxx/statistics_db.h cxx/utilities.h cxx/utilities.cpp)
 
-# Add botnet communication processor source files
+# Add botnet comm processor source files
 SET(BOT_COMM_PROC_SOURCE cxx/botnet_comm_processor.h cxx/botnet_comm_processor.cpp)
 
 # Include SQLiteCpp library and build it

+ 179 - 26
code_boost/src/cxx/botnet_comm_processor.cpp

@@ -1,6 +1,5 @@
 #include "botnet_comm_processor.h"
 
-// Use references instead of values to save time?
 
 /**
  * Creates a new botnet_comm_processor object. 
@@ -8,18 +7,177 @@
  * @param messages_pyboost The abstract communication messages 
  *    represented as (python) list containing (python) dicts.
  */
-botnet_comm_processor::botnet_comm_processor(py::list messages_pyboost){
+botnet_comm_processor::botnet_comm_processor(const py::list &messages_pyboost){
     for (int i = 0; i < len(messages_pyboost); i++){
         py::dict msg_pyboost = py::extract<py::dict>(messages_pyboost[i]);
         unsigned int src_id = std::stoi(py::extract<std::string>(msg_pyboost["Src"]));
         unsigned int dst_id = std::stoi(py::extract<std::string>(msg_pyboost["Dst"]));
         unsigned short type = (unsigned short) std::stoi(py::extract<std::string>(msg_pyboost["Type"]));
         double time = std::stod(py::extract<std::string>(msg_pyboost["Time"]));
-        abstract_msg msg = {src_id, dst_id, type, time};
+        int line_no = std::stoi(py::extract<std::string>(msg_pyboost["LineNumber"]));
+        abstract_msg msg = {src_id, dst_id, type, time, line_no};
         messages.push_back(msg);
     }
 }
 
+/**
+ * Default constructor: creates a botnet_comm_processor that does not hold any messages yet.
+ * Messages are supplied afterwards, e.g. via parse_csv() or parse_xml().
+ */
+botnet_comm_processor::botnet_comm_processor() = default;
+
+/**
+ * Processes an XML attribute assignment. The result is reflected in the respective change of the given message.
+ * Words that do not contain a '=' are ignored, as are assignments where nothing follows the '='.
+ * @param msg The message this attribute refers to.
+ * @param assignment The XML attribute assignment in notation: attribute="value"
+ */
+void botnet_comm_processor::process_xml_attrib_assign(abstract_msg &msg, const std::string &assignment) {
+    // keep the unsigned type returned by find(), so that the comparison with npos is exact
+    const std::string::size_type split_pos = assignment.find('=');
+    if (split_pos == std::string::npos)
+        return;
+
+    // the value is expected to be enclosed in double quotes: skip the '=' and the opening quote ...
+    const std::string::size_type value_start = split_pos + 2;
+    if (value_start > assignment.length())
+        return;  // nothing follows the '=', substr() would throw std::out_of_range
+
+    std::string value = assignment.substr(value_start);
+    // ... and drop the closing quote (the second parameter of substr() is a count, not an end index)
+    if (!value.empty() && value.back() == '"')
+        value.pop_back();
+
+    process_kv(msg, assignment.substr(0, split_pos), value);
+}
+
+/**
+ * Stores one attribute, given as key-value pair, in the respective member of the given message.
+ * The keys "Src", "Dst", "Type", "Time" and "LineNumber" are recognized; every other key is ignored.
+ * @param msg The message this kv pair refers to.
+ * @param key The key of the attribute.
+ * @param value The value of the attribute in textual form (converted with std::stoi or std::stod).
+ */
+void botnet_comm_processor::process_kv(abstract_msg &msg, const std::string &key, const std::string &value){
+    if (key == "Src"){
+        msg.src = std::stoi(value);
+        return;
+    }
+    if (key == "Dst"){
+        msg.dst = std::stoi(value);
+        return;
+    }
+    if (key == "Type"){
+        msg.type = (unsigned short) std::stoi(value);
+        return;
+    }
+    if (key == "Time"){
+        msg.time = std::stod(value);
+        return;
+    }
+    if (key == "LineNumber"){
+        msg.line_no = std::stoi(value);
+        return;
+    }
+    // unknown keys are skipped on purpose
+}
+
+/**
+ * Parses the packets contained in the given CSV to program structure (the class member 'messages').
+ * Every line is treated as one message that consists of comma-separated key:value entries;
+ * all spaces are removed from an entry before it is split. Messages held before the call are discarded.
+ * If nothing can be read from the file, no messages are stored and 0 is returned.
+ * @param filepath The filepath where the CSV is located.
+ * @return The number of messages (or lines) contained in the CSV file.
+ */
+unsigned int botnet_comm_processor::parse_csv(const std::string &filepath){
+    std::ifstream input(filepath);
+    int line_no = 0;
+
+    messages.clear();
+    // iterate over every line; the zero-based index of the line is stored as line number of the message
+    for (std::string line; std::getline(input, line); line_no++){
+        std::istringstream line_stream(line);
+        abstract_msg cur_msg;
+        cur_msg.line_no = line_no;
+        // iterate over every key:value entry
+        for (std::string pair; std::getline(line_stream, pair, ','); ){
+            boost::replace_all(pair, " ", "");
+            // keep the unsigned type returned by find(), so that the comparison with npos is exact
+            const std::string::size_type split_pos = pair.find(':');
+            if (split_pos == std::string::npos)
+                continue;  // an entry without separator carries no attribute
+            // substr(pos) yields everything behind the separator
+            process_kv(cur_msg, pair.substr(0, split_pos), pair.substr(split_pos + 1));
+        }
+        messages.push_back(cur_msg);
+    }
+    return messages.size();
+}
+
+/**
+ * Parses the packets contained in the given XML to program structure (the class member 'messages').
+ * This is a minimal hand-written scanner and not a general XML parser: it looks for tags that start with
+ * "<packet" followed by whitespace, reads their whitespace-separated attribute="value" assignments and
+ * stores the message as soon as the tag is closed after a '/' has been seen.
+ * Messages held before the call are discarded.
+ * @param filepath The filepath where the XML is located.
+ * @return The number of messages contained in the XML file.
+ */
+unsigned int botnet_comm_processor::parse_xml(const std::string &filepath){
+    std::ifstream input(filepath);
+    std::string cur_word = "";
+    abstract_msg cur_msg;
+    char c;
+    bool read_packet_open = false, read_slash = false;
+
+    messages.clear();
+    // iterate over every character
+    while (input.get(c)){
+        if (c == '/')  // hints ending of tag
+            read_slash = true;
+        else if (c == '>'){  // definitely closes tag
+            if (read_packet_open && read_slash){  // handle outstanding attribute
+                process_xml_attrib_assign(cur_msg, cur_word);
+                messages.push_back(cur_msg);
+                // start the next packet from default values, so that no attribute leaks into it
+                cur_msg = abstract_msg();
+                read_packet_open = false;
+            }
+            // a slash only refers to the tag it was read in
+            read_slash = false;
+            cur_word = "";
+        }
+        else if (c == ' ' || c == '\t' || c == '\n' || c == '\r'){
+            // every kind of whitespace separates words, otherwise a "<packet" that directly
+            // follows a line break would not be recognized
+            if (read_packet_open && cur_word != ""){  // handle new attribute
+                process_xml_attrib_assign(cur_msg, cur_word);
+            }
+            else if (cur_word == "<packet")
+                read_packet_open = true;
+
+            cur_word = "";
+        }
+        else
+            cur_word += c;
+    }
+    return messages.size();
+}
+
+/**
+ * Writes all messages held in the class member 'messages' into an XML file. The file consists of one
+ * <trace> element (named after the given filename) that contains one <packet /> element per message.
+ * @param filename The name the file should have (without extension); ".xml" is appended to it.
+ * @return The filepath of the written XML file.
+ */
+std::string botnet_comm_processor::write_xml(const std::string &filename){
+    const std::string filepath = filename + ".xml";
+    std::ofstream xml_file(filepath);
+
+    // floating point values (i.e. the time) are written with 11 digits after the dot
+    xml_file << std::fixed << std::setprecision(11);
+
+    xml_file << "<trace name=\"" << filename << "\">";
+    for (const auto &msg : messages){
+        xml_file << "<packet "
+                 << "Src=\"" << msg.src << "\" Dst=\"" << msg.dst << "\" "
+                 << "Type=\"" << msg.type << "\" Time=\"" << msg.time << "\" "
+                 << "LineNumber=\"" << msg.line_no << "\" />";
+    }
+    xml_file << "</trace>";
+
+    xml_file.close();
+    return filepath;
+}
+
+/**
+ * Retrieves all messages contained in the interval between start_idx and end_idx in Python representation.
+ * Indices beyond the last message are cut off (like a Python slice does), so the result is empty
+ * if there are no messages or if start_idx lies behind the (clamped) end_idx.
+ * @param start_idx The inclusive first index of the interval.
+ * @param end_idx The inclusive last index of the interval.
+ * @return A (Python) list of (Python) dicts with the keys "Src", "Dst", "Type", "Time" and "LineNumber".
+ */
+py::list botnet_comm_processor::get_messages(unsigned int start_idx, unsigned int end_idx){
+    py::list py_messages;
+    if (messages.empty())
+        return py_messages;
+
+    // never index past the last message; an unchecked end_idx would read out of bounds
+    const std::size_t last_idx = end_idx < messages.size() ? end_idx : messages.size() - 1;
+    // unsigned index type avoids the signed/unsigned comparison with the boundary
+    for (std::size_t i = start_idx; i <= last_idx; i++){
+        py::dict py_msg;
+        py_msg["Src"] = messages[i].src;
+        py_msg["Dst"] = messages[i].dst;
+        py_msg["Type"] = messages[i].type;
+        py_msg["Time"] = messages[i].time;
+        py_msg["LineNumber"] = messages[i].line_no;
+        py_messages.append(py_msg);
+    }
+    return py_messages;
+}
+
 /**
  * Finds the time interval(s) of the given seconds with the most overall communication
  * (i.e. requests and responses) that has at least number_ids communicating initiators in it. 
@@ -71,7 +229,7 @@ py::list botnet_comm_processor::find_interval(int number_ids, double max_int_tim
     }
 
     // return the result converted into python data structures
-    return convert_to_py_repr(possible_intervals);
+    return convert_intervals_to_py_repr(possible_intervals);
 }
 
 /**
@@ -81,7 +239,7 @@ py::list botnet_comm_processor::find_interval(int number_ids, double max_int_tim
  * @param number_ids The number of initiator IDs that have to exist in the interval(s).
  * @param max_int_time The maximum time period of the interval.
  * @param start_idx The index of the first message to process with respect to the class member 'messages'.
- * @param end_idx The upper index boundary where the search is stopped at (i.e. idx_low does not cross this boundary).
+ * @param end_idx The upper index boundary where the search is stopped at (i.e. exclusive index).
  */
 void botnet_comm_processor::find_interval_helper(std::promise<std::vector<comm_interval> > && p, int number_ids, double max_int_time, int start_idx, int end_idx){
     // setup initial variables
@@ -178,41 +336,30 @@ int botnet_comm_processor::msgtype_is_response(unsigned short mtype){
     return mtype == SALITY_HELLO_REPLY || mtype == SALITY_NL_REPLY;
 }
 
-// py::list botnet_comm_processor::std_vector_to_py_list(const std::vector<comm_interval> &intervals){
-//     py::object get_iter = py::iterator<std::vector<comm_interval> >();
-//     py::object iter = get_iter(intervals);
-//     py::list l(iter);
-//     return l;
-// }
-
-// py::list botnet_comm_processor::st_unorderedmap_to_py_dict(const std::vector<comm_interval> &intervals){
-//     py::object get_iter = py::iterator<std::vector<comm_interval> >();
-//     py::object iter = get_iter(intervals);
-//     py::list l(iter);
-//     return l;
-// }
-
 /**
  * Converts the given vector of communication intervals to a python representation 
  * using (python) lists and (python) dicts.
  * @param intervals The communication intervals to convert.
- * @return A boost::python::list containing the same information using boost::python::tuples for each interval.
+ * @return A boost::python::list containing the same information using boost::python::dict for each interval.
  */
-py::list botnet_comm_processor::convert_to_py_repr(const std::vector<comm_interval> &intervals){
+py::list botnet_comm_processor::convert_intervals_to_py_repr(const std::vector<comm_interval> &intervals){
     py::list py_intervals;
     for (const auto &interval : intervals){
         py::list py_ids;
         for (const auto &id : interval.ids){
             py_ids.append(id);
         }
-        py::tuple py_interval = py::make_tuple(py_ids, interval.start_idx, interval.end_idx);
+        py::dict py_interval;
+        py_interval["IDs"] = py_ids;
+        py_interval["Start"] = interval.start_idx;
+        py_interval["End"] = interval.end_idx;
         py_intervals.append(py_interval);
     }
     return py_intervals;
 }
 
-// void botnet_comm_processor::print_message(abstract_msg message){
-//     std::cout << "Src: " << message.src << "   Dst: " << message.dst << "   Type: " << message.type << "   Time: " << message.time << std::endl;
+// void botnet_comm_processor::print_message(const abstract_msg &message){
+//     std::cout << "Src: " << message.src << "   Dst: " << message.dst << "   Type: " << message.type << "   Time: " << message.time << "   LineNumber: " << message.line_no << std::endl;
 // }
 
 
@@ -224,6 +371,12 @@ py::list botnet_comm_processor::convert_to_py_repr(const std::vector<comm_interv
 using namespace boost::python;
 
 BOOST_PYTHON_MODULE (libbotnet) {
-    class_<botnet_comm_processor>("botnet_comm_processor", init<list>())
-            .def("find_interval", &botnet_comm_processor::find_interval);
+    class_<botnet_comm_processor>("botnet_comm_processor")
+            .def(init<list>())
+            .def(init<>())
+            .def("find_interval", &botnet_comm_processor::find_interval)
+            .def("parse_csv", &botnet_comm_processor::parse_csv)
+            .def("parse_xml", &botnet_comm_processor::parse_xml)
+            .def("get_messages", &botnet_comm_processor::get_messages)
+            .def("write_xml", &botnet_comm_processor::write_xml);
 }

+ 47 - 11
code_boost/src/cxx/botnet_comm_processor.h

@@ -8,12 +8,19 @@
 
 #include <iostream>
 #include <boost/python.hpp>
+#include <boost/algorithm/string/replace.hpp>
+#include <boost/property_tree/ptree.hpp>
+#include <boost/property_tree/xml_parser.hpp>
 #include <vector>
 #include <chrono>
 #include <thread>
 #include <deque>
 #include <set>
 #include <future>
+#include <fstream>
+#include <string>
+#include <istream>
+
 
 /*
  * Botnet communication types (equal to the ones contained in the MessageType class in MembersMgmtCommAttack.py)
@@ -33,6 +40,7 @@
  * For quick usage
  */
 namespace py = boost::python;
+namespace pt = boost::property_tree;
 
 /*
  * Definition of structs
@@ -46,10 +54,18 @@ namespace py = boost::python;
  * - Time of message
  */
 struct abstract_msg {
-    unsigned int src;
-    unsigned int dst;
-    unsigned short type; 
-    double time;
+    // necessary constructors to have default values
+    abstract_msg (unsigned int src, unsigned int dst, unsigned short type, double time, int line_no) :
+    src(src), dst(dst), type(type), time(time), line_no(line_no) {}
+
+    abstract_msg () {}
+
+    // members
+    unsigned int src = 0;
+    unsigned int dst = 0;
+    unsigned short type = 0; 
+    double time = 0.0;
+    int line_no = -1;
 };
 
 /*
@@ -76,30 +92,50 @@ int greater_than(double a, double b){
     return b - a < -1 * EPS_TOLERANCE;
 }
 
+
 class botnet_comm_processor {
 
 public:
     /*
     * Class constructor
     */
-    botnet_comm_processor(py::list packets);
-    
+    botnet_comm_processor();
+
+    botnet_comm_processor(const py::list &messages_pyboost);
+
     /*
      * Methods
      */
     py::list find_interval(int number_ids, double max_int_time);
 
+    py::list get_messages(unsigned int start_idx, unsigned int end_idx);
+
+    unsigned int parse_csv(const std::string &);
+
+    unsigned int parse_xml(const std::string &);
+
+    std::string write_xml(const std::string &);
+
 private:
     /*
      * Methods
      */
-    void print_message(abstract_msg packet);
+    py::list convert_intervals_to_py_repr(const std::vector<comm_interval>& intervals);
+
+    void find_interval_helper(std::promise<std::vector<comm_interval> > && p, int number_ids, double max_int_time, int start_idx, int end_idx);
+
     int msgtype_is_request(unsigned short mtype);
+
     int msgtype_is_response(unsigned short mtype);
-    // py::list std_vector_to_py_list(std::vector<comm_interval> intervals)
-    py::list convert_to_py_repr(const std::vector<comm_interval>& intervals);
-    void find_interval_helper(std::promise<std::vector<comm_interval> > && p, int number_ids, double max_int_time, int start_idx, int end_idx);
-    
+
+    // void print_message(const abstract_msg &message);
+
+    void process_csv_attrib(abstract_msg &msg, const std::string &cur_word);
+
+    void process_kv(abstract_msg &msg, const std::string &key, const std::string &value);
+
+    void process_xml_attrib_assign(abstract_msg &msg, const std::string &cur_word);
+
     /*
      * Attributes
      */