botnet_comm_processor.cpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  #include "botnet_comm_processor.h"

  #include <algorithm>
  #include <cstddef>
  2. /**
  3. * Creates a new botnet_comm_processor object.
  4. * The abstract python messages are converted to easier-to-handle C++ data structures.
  5. * @param messages_pyboost The abstract communication messages
  6. * represented as (python) list containing (python) dicts.
  7. */
  8. botnet_comm_processor::botnet_comm_processor(const py::list &messages_pyboost){
  9. set_messages(messages_pyboost);
  10. }
  11. /**
  12. * Creates a new and empty botnet_comm_processor object.
  13. */
  14. botnet_comm_processor::botnet_comm_processor(){
  15. }
  16. /**
  17. * Set the messages of this communication processor.
  18. * @param messages_pyboost The abstract communication messages
  19. * represented as (python) list containing (python) dicts.
  20. */
  21. void botnet_comm_processor::set_messages(const py::list &messages_pyboost){
  22. messages.clear();
  23. for (int i = 0; i < len(messages_pyboost); i++){
  24. py::dict msg_pyboost = py::extract<py::dict>(messages_pyboost[i]);
  25. unsigned int src_id = std::stoi(py::extract<std::string>(msg_pyboost["Src"]));
  26. unsigned int dst_id = std::stoi(py::extract<std::string>(msg_pyboost["Dst"]));
  27. unsigned short type = (unsigned short) std::stoi(py::extract<std::string>(msg_pyboost["Type"]));
  28. double time = std::stod(py::extract<std::string>(msg_pyboost["Time"]));
  29. int line_no = std::stoi(py::extract<std::string>(msg_pyboost.get("LineNumber", "-1")));
  30. abstract_msg msg = {src_id, dst_id, type, time, line_no};
  31. messages.push_back(std::move(msg));
  32. }
  33. }
  34. /**
  35. * Retrieve input information about message count.
  36. * @return the number of existing messages.
  37. */
  38. int botnet_comm_processor::get_message_count(){
  39. return messages.size();
  40. }
  41. /**
  42. * Processes an XML attribute assignment. The result is reflected in the respective change of the given message.
  43. * @param msg The message this attribute refers to.
  44. * @param assignment The XML attribute assignment in notation: attribute="value"
  45. */
  46. void botnet_comm_processor::process_xml_attrib_assign(abstract_msg &msg, const std::string &assignment) {
  47. int split_pos = assignment.find("=");
  48. if (split_pos != std::string::npos){
  49. std::string key = assignment.substr(0, split_pos);
  50. std::string value = assignment.substr(split_pos + 2, assignment.length() - 1);
  51. process_kv(msg, key, value);
  52. }
  53. }
  54. /**
  55. * Processes a key-value pair. The result is reflected in the respective change of the given message.
  56. * @param msg The message this kv pair refers to.
  57. * @param key The key of the attribute.
  58. * @param value The value of the attribute.
  59. */
  60. void botnet_comm_processor::process_kv(abstract_msg &msg, const std::string &key, const std::string &value){
  61. if (key == "Src")
  62. msg.src = std::stoi(value);
  63. else if (key == "Dst")
  64. msg.dst = std::stoi(value);
  65. else if (key == "Type")
  66. msg.type = (unsigned short) std::stoi(value);
  67. else if (key == "Time")
  68. msg.time = std::stod(value);
  69. else if (key == "LineNumber")
  70. msg.line_no = std::stoi(value);
  71. }
  72. /**
  73. * Parses the packets contained in the given CSV to program structure.
  74. * @param filepath The filepath where the CSV is located.
  75. * @return The number of messages (or lines) contained in the CSV file.
  76. */
  77. unsigned int botnet_comm_processor::parse_csv(const std::string &filepath){
  78. std::ifstream input(filepath);
  79. int line_no = 0;
  80. messages.clear();
  81. // iterate over every line
  82. for (std::string line; std::getline(input, line); ){
  83. std::istringstream line_stream(line);
  84. abstract_msg cur_msg;
  85. cur_msg.line_no = line_no;
  86. // iterate over every key:value entry
  87. for (std::string pair; std::getline(line_stream, pair, ','); ){
  88. boost::replace_all(pair, " ", "");
  89. int split_pos = pair.find(":");
  90. if (split_pos != std::string::npos){
  91. std::string key = pair.substr(0, split_pos);
  92. std::string value = pair.substr(split_pos + 1, pair.length());
  93. process_kv(cur_msg, key, value);
  94. }
  95. }
  96. messages.push_back(std::move(cur_msg));
  97. line_no++;
  98. }
  99. return messages.size();
  100. }
  101. /**
  102. * Parses the packets contained in the given XML to program structure.
  103. * @param filepath The filepath where the XML is located.
  104. * @return The number of messages contained in the XML file.
  105. */
  106. unsigned int botnet_comm_processor::parse_xml(const std::string &filepath){
  107. std::ifstream input(filepath);
  108. std::string cur_word = "";
  109. abstract_msg cur_msg;
  110. char c;
  111. int read_packet_open = 0, read_slash = 0;
  112. messages.clear();
  113. // iterate over every character
  114. while (input.get(c)){
  115. if(c == '/') // hints ending of tag
  116. read_slash = 1;
  117. else if (c == '>'){ // definitely closes tag
  118. if (read_packet_open && read_slash){ // handle oustanding attribute
  119. read_slash = 0;
  120. process_xml_attrib_assign(cur_msg, cur_word);
  121. messages.push_back(cur_msg);
  122. read_packet_open = 0;
  123. }
  124. cur_word = "";
  125. }
  126. else if (c == ' '){
  127. if (read_packet_open && cur_word != ""){ // handle new attribute
  128. process_xml_attrib_assign(cur_msg, cur_word);
  129. }
  130. else if (cur_word == "<packet")
  131. read_packet_open = 1;
  132. cur_word = "";
  133. }
  134. else
  135. cur_word += c;
  136. }
  137. return messages.size();
  138. }
  139. /**
  140. * Writes the communication messages contained in the class member messages into an XML file (with respective notation).
  141. * @param filename The name the file should have (without extension).
  142. * @return The filepath of the written XML file.
  143. */
  144. std::string botnet_comm_processor::write_xml(const std::string &filename){
  145. std::string filepath = filename + ".xml";
  146. std::ofstream xml_file;
  147. xml_file.open(filepath);
  148. // set number of digits after dot to 11
  149. xml_file << std::fixed << std::setprecision(11);
  150. xml_file << "<trace name=\"" << filename << "\">";
  151. for (const auto &msg : messages){
  152. xml_file << "<packet ";
  153. xml_file << "Src=\"" << msg.src << "\" Dst=\"" << msg.dst << "\" ";
  154. xml_file << "Type=\"" << msg.type << "\" Time=\"" << msg.time << "\" ";
  155. xml_file << "LineNumber=\"" << msg.line_no << "\" />";
  156. }
  157. xml_file << "</trace>";
  158. xml_file.close();
  159. return filepath;
  160. }
  161. /**
  162. * Retrieves all messages contained in the interval between start_idx and end_idx in Python representation.
  163. * @param start_idx The inclusive first index of the interval.
  164. * @param end_idx The inclusive last index of the interval.
  165. * @return A (Python) list of (Python) dicts containing the desired information.
  166. */
  167. py::list botnet_comm_processor::get_messages(unsigned int start_idx, unsigned int end_idx){
  168. py::list py_messages;
  169. for (int i = start_idx; i <= end_idx; i++){
  170. if (i >= messages.size())
  171. break;
  172. py::dict py_msg;
  173. py_msg["Src"] = messages[i].src;
  174. py_msg["Dst"] = messages[i].dst;
  175. py_msg["Type"] = messages[i].type;
  176. py_msg["Time"] = messages[i].time;
  177. py_msg["LineNumber"] = messages[i].line_no;
  178. py_messages.append(py_msg);
  179. }
  180. return py_messages;
  181. }
  182. /**
  183. * Finds the time interval(s) of maximum the given seconds with the most overall communication
  184. * (i.e. requests and responses) that has at least number_ids communicating initiators in it.
  185. * @param number_ids The number of initiator IDs that have to exist in the interval(s).
  186. * @param max_int_time The maximum time period of the interval.
  187. * @return A (python) list of (python) dicts, where each dict (keys: 'IDs', Start', 'End') represents an interval with its
  188. * list of initiator IDs, a start index and an end index. The indices are with respect to the first abstract message.
  189. */
  190. py::list botnet_comm_processor::find_optimal_interval(int number_ids, double max_int_time){
  191. unsigned int logical_thread_count = std::thread::hardware_concurrency();
  192. std::vector<std::thread> threads;
  193. std::vector<std::future<std::vector<comm_interval> > > futures;
  194. // create as many threads as can run concurrently and assign them respective sections
  195. for (int i = 0; i < logical_thread_count; i++){
  196. unsigned int start_idx = (i * messages.size() / logical_thread_count);
  197. unsigned int end_idx = (i + 1) * messages.size() / logical_thread_count;
  198. std::promise<std::vector<comm_interval> > p; // use promises to retrieve return values
  199. futures.push_back(p.get_future());
  200. threads.push_back(std::thread(&botnet_comm_processor::find_optimal_interval_helper, this, std::move(p), number_ids, max_int_time, start_idx, end_idx));
  201. }
  202. // synchronize all threads
  203. for (auto &t : threads){
  204. t.join();
  205. }
  206. // accumulate results
  207. std::vector<std::vector<comm_interval> > acc_possible_intervals;
  208. for (auto &f : futures){
  209. acc_possible_intervals.push_back(f.get());
  210. }
  211. // find overall most communicative interval
  212. std::vector<comm_interval> possible_intervals;
  213. unsigned int cur_highest_sum = 0;
  214. for (const auto &single_poss_interval : acc_possible_intervals){
  215. if (single_poss_interval.size() > 0 && single_poss_interval[0].comm_sum >= cur_highest_sum){
  216. // if there is more than one interval, all of them have the same comm_sum
  217. if (single_poss_interval[0].comm_sum > cur_highest_sum){
  218. cur_highest_sum = single_poss_interval[0].comm_sum;
  219. possible_intervals.clear();
  220. }
  221. for (const auto &interval : single_poss_interval){
  222. possible_intervals.push_back(std::move(interval));
  223. }
  224. }
  225. }
  226. // return the result converted into python data structures
  227. return convert_intervals_to_py_repr(possible_intervals);
  228. }
  229. /**
  230. * Finds the time interval(s) of maximum the given seconds within the given start and end index having the most
  231. * overall communication (i.e. requests and responses) as well as at least number_ids communicating initiators in it.
  232. * @param p An rvalue to a promise to return the found intervals.
  233. * @param number_ids The number of initiator IDs that have to exist in the interval(s).
  234. * @param max_int_time The maximum time period of the interval.
  235. * @param start_idx The index of the first message to process with respect to the class member 'messages'.
  236. * @param end_idx The upper index boundary where the search is stopped at (i.e. exclusive index).
  237. */
  238. void botnet_comm_processor::find_optimal_interval_helper(std::promise<std::vector<comm_interval> > && p, int number_ids, double max_int_time, int start_idx, int end_idx){
  239. // setup initial variables
  240. unsigned int idx_low = start_idx, idx_high = start_idx; // the indices spanning the interval
  241. unsigned int comm_sum = 0; // the communication sum of the current interval
  242. unsigned int cur_highest_sum = 0; // the highest communication sum seen so far
  243. double cur_int_time = 0; // the time of the current interval
  244. std::deque<unsigned int> init_ids; // the initiator IDs seen in the current interval in order of appearance
  245. std::vector<comm_interval> possible_intervals; // all intervals that have cur_highest_sum of communication and contain enough IDs
  246. // Iterate over all messages from start to finish and process the info of each message.
  247. // Similar to a Sliding Window approach.
  248. while (1){
  249. if (idx_high < messages.size())
  250. cur_int_time = messages[idx_high].time - messages[idx_low].time;
  251. // if current interval time exceeds maximum time period or all messages have been processed,
  252. // process information of the current interval
  253. if (greater_than(cur_int_time, max_int_time) || idx_high >= messages.size()){
  254. std::set<unsigned int> interval_ids;
  255. for (int i = 0; i < init_ids.size(); i++)
  256. interval_ids.insert(init_ids[i]);
  257. // if the interval contains enough initiator IDs, add it to possible_intervals
  258. if (interval_ids.size() >= number_ids){
  259. comm_interval interval = {interval_ids, comm_sum, idx_low, idx_high - 1};
  260. // reset possible intervals if new maximum of communication is found
  261. if (comm_sum > cur_highest_sum){
  262. possible_intervals.clear();
  263. possible_intervals.push_back(std::move(interval));
  264. cur_highest_sum = comm_sum;
  265. }
  266. // append otherwise
  267. else if (comm_sum == cur_highest_sum)
  268. possible_intervals.push_back(std::move(interval));
  269. }
  270. // stop if all messages have been processed
  271. if (idx_high >= messages.size())
  272. break;
  273. }
  274. // let idx_low "catch up" so that the current interval time fits into the maximum time period again
  275. while (greater_than(cur_int_time, max_int_time)){
  276. if (idx_low >= end_idx)
  277. goto end;
  278. abstract_msg &cur_msg = messages[idx_low];
  279. // if message was not a timeout, delete the first appearance of the initiator ID
  280. // of this message from the initiator list and update comm_sum
  281. if (cur_msg.type != TIMEOUT){
  282. comm_sum -= 1;
  283. init_ids.pop_front();
  284. }
  285. idx_low++;
  286. cur_int_time = messages[idx_high].time - messages[idx_low].time;
  287. }
  288. // consume the new message at idx_high and process its information
  289. abstract_msg &cur_msg = messages[idx_high];
  290. // if message is request, add src to initiator list
  291. if (msgtype_is_request(cur_msg.type)){
  292. init_ids.push_back(cur_msg.src);
  293. comm_sum += 1;
  294. }
  295. // if message is response, add dst to initiator list
  296. else if (msgtype_is_response(cur_msg.type)){
  297. init_ids.push_back(cur_msg.dst);
  298. comm_sum += 1;
  299. }
  300. idx_high += 1;
  301. }
  302. end: p.set_value(possible_intervals);
  303. }
  304. /**
  305. * Finds the time interval of maximum the given seconds starting at the given index. If it does not have at least number_ids
  306. * communicating initiators in it or the index is out of bounds, an empty dict is returned.
  307. * @param number_ids The number of initiator IDs that have to exist in the interval.
  308. * @param max_int_time The maximum time period of the interval.
  309. * @return A (python) dict (keys: 'IDs', Start', 'End'), which represents an interval with its list of initiator IDs,
  310. * a start index and an end index. The indices are with respect to the first abstract message.
  311. */
  312. py::dict botnet_comm_processor::find_interval_from_startidx(int start_idx, int number_ids, double max_int_time){
  313. // setup initial variables
  314. unsigned int cur_idx = start_idx; // the current iteration index
  315. double cur_int_time = 0; // the time of the current interval
  316. std::deque<unsigned int> init_ids; // the initiator IDs seen in the current interval in order of appearance
  317. py::dict comm_interval_py; // the communication interval that is returned
  318. if (start_idx >= messages.size()){
  319. return comm_interval_py;
  320. }
  321. // Iterate over all messages starting at start_idx until the duration or the current index exceeds a boundary
  322. while (1){
  323. if (cur_idx < messages.size())
  324. cur_int_time = messages[cur_idx].time - messages[start_idx].time;
  325. // if current interval time exceeds maximum time period or all messages have been processed,
  326. // process information of the current interval
  327. if (greater_than(cur_int_time, max_int_time) || cur_idx >= messages.size()){
  328. std::set<unsigned int> interval_ids;
  329. for (int i = 0; i < init_ids.size(); i++)
  330. interval_ids.insert(init_ids[i]);
  331. // if the interval contains enough initiator IDs, convert it to python representation and return it
  332. if (interval_ids.size() >= number_ids){
  333. py::list py_ids;
  334. for (const auto &id : interval_ids){
  335. py_ids.append(id);
  336. }
  337. comm_interval_py["IDs"] = py_ids;
  338. comm_interval_py["Start"] = start_idx;
  339. comm_interval_py["End"] = cur_idx - 1;
  340. return comm_interval_py;
  341. }
  342. else {
  343. return comm_interval_py;
  344. }
  345. }
  346. // consume the new message at cur_idx and process its information
  347. abstract_msg &cur_msg = messages[cur_idx];
  348. // if message is request, add src to initiator list
  349. if (msgtype_is_request(cur_msg.type))
  350. init_ids.push_back(cur_msg.src);
  351. // if message is response, add dst to initiator list
  352. else if (msgtype_is_response(cur_msg.type))
  353. init_ids.push_back(cur_msg.dst);
  354. cur_idx += 1;
  355. }
  356. }
  357. /**
  358. * Finds all initiator IDs contained in the interval spanned by the two indices.
  359. * @param start_idx The start index of the interval.
  360. * @param end_idx The last index of the interval (inclusive).
  361. * @return A (python) list containing all initiator IDs of the interval.
  362. */
  363. py::list botnet_comm_processor::get_interval_init_ids(int start_idx, int end_idx){
  364. // setup initial variables
  365. unsigned int cur_idx = start_idx; // the current iteration index
  366. std::set<unsigned int> interval_ids;
  367. py::list py_ids; // the communication interval that is returned
  368. if (start_idx >= messages.size()){
  369. return py_ids;
  370. }
  371. // Iterate over all messages starting at start_idx until the duration or the current index exceeds a boundary
  372. while (1){
  373. // if messages have been processed
  374. if (cur_idx >= messages.size() || cur_idx > end_idx){
  375. for (const auto &id : interval_ids)
  376. py_ids.append(id);
  377. return py_ids;
  378. }
  379. // consume the new message at cur_idx and process its information
  380. abstract_msg &cur_msg = messages[cur_idx];
  381. // if message is request, add src to initiator list
  382. if (msgtype_is_request(cur_msg.type))
  383. interval_ids.insert(cur_msg.src);
  384. // if message is response, add dst to initiator list
  385. else if (msgtype_is_response(cur_msg.type))
  386. interval_ids.insert(cur_msg.dst);
  387. cur_idx += 1;
  388. }
  389. }
  390. /**
  391. * Checks whether the given message type corresponds to a request.
  392. * @param mtype The message type to check.
  393. * @return true(1) if the message type is a request, false(0) otherwise.
  394. */
  395. int botnet_comm_processor::msgtype_is_request(unsigned short mtype){
  396. return mtype == SALITY_HELLO || mtype == SALITY_NL_REQUEST;
  397. }
  398. /**
  399. * Checks whether the given message type corresponds to a response.
  400. * @param mtype The message type to check.
  401. * @return true(1) if the message type is a response, false(0) otherwise.
  402. */
  403. int botnet_comm_processor::msgtype_is_response(unsigned short mtype){
  404. return mtype == SALITY_HELLO_REPLY || mtype == SALITY_NL_REPLY;
  405. }
  406. /**
  407. * Converts the given vector of communication intervals to a python representation
  408. * using (python) lists and (python) tuples.
  409. * @param intervals The communication intervals to convert.
  410. * @return A boost::python::list containing the same interval information using boost::python::dict for each interval.
  411. */
  412. py::list botnet_comm_processor::convert_intervals_to_py_repr(const std::vector<comm_interval> &intervals){
  413. py::list py_intervals;
  414. for (const auto &interval : intervals){
  415. py::list py_ids;
  416. for (const auto &id : interval.ids){
  417. py_ids.append(id);
  418. }
  419. py::dict py_interval;
  420. py_interval["IDs"] = py_ids;
  421. py_interval["Start"] = interval.start_idx;
  422. py_interval["End"] = interval.end_idx;
  423. py_intervals.append(py_interval);
  424. }
  425. return py_intervals;
  426. }
  // void botnet_comm_processor::print_message(const abstract_msg &message){
  //     std::cout << "Src: " << message.src << " Dst: " << message.dst << " Type: " << message.type << " Time: " << message.time << " LineNumber: " << message.line_no << std::endl;
  // }
  430. /*
  431. * Comment out if executable should be build & run
  432. * Comment in if library should be build
  433. */
  434. using namespace boost::python;
  435. BOOST_PYTHON_MODULE (libbotnetcomm) {
  436. class_<botnet_comm_processor>("botnet_comm_processor")
  437. .def(init<list>())
  438. .def(init<>())
  439. .def("find_interval_from_startidx", &botnet_comm_processor::find_interval_from_startidx)
  440. .def("find_optimal_interval", &botnet_comm_processor::find_optimal_interval)
  441. .def("get_interval_init_ids", &botnet_comm_processor::get_interval_init_ids)
  442. .def("get_messages", &botnet_comm_processor::get_messages)
  443. .def("get_message_count", &botnet_comm_processor::get_message_count)
  444. .def("parse_csv", &botnet_comm_processor::parse_csv)
  445. .def("parse_xml", &botnet_comm_processor::parse_xml)
  446. .def("set_messages", &botnet_comm_processor::set_messages)
  447. .def("write_xml", &botnet_comm_processor::write_xml);
  448. }