StatsDatabase.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. import os.path
  2. import re
  3. import sqlite3
  4. import sys
  5. from random import randint
  6. def dict_gen(curs: sqlite3.Cursor):
  7. """
  8. Generates a dictionary of a sqlite3.Cursor object by fetching the query's results.
  9. Taken from Python Essential Reference by David Beazley.
  10. """
  11. field_names = [d[0] for d in curs.description]
  12. while True:
  13. rows = curs.fetchmany()
  14. if not rows:
  15. return
  16. for row in rows:
  17. yield dict(zip(field_names, row))
  18. class StatsDatabase:
  19. def __init__(self, db_path: str):
  20. """
  21. Creates a new StatsDatabase.
  22. :param db_path: The path to the database file
  23. """
  24. self.existing_db = os.path.exists(db_path)
  25. self.database = sqlite3.connect(db_path)
  26. self.cursor = self.database.cursor()
  27. # If DB not existing, create a new DB scheme
  28. if self.existing_db:
  29. print('Located statistics database at: ', db_path)
  30. else:
  31. print('Statistics database not found. Creating new database at: ', db_path)
  32. def get_file_info(self):
  33. """
  34. Retrieves general file statistics from the database. This includes:
  35. - packetCount : Number of packets in the PCAP file
  36. - captureDuration : Duration of the packet capture in seconds
  37. - timestampFirstPacket : Timestamp of the first captured packet
  38. - timestampLastPacket : Timestamp of the last captured packet
  39. - avgPacketRate : Average packet rate
  40. - avgPacketSize : Average packet size
  41. - avgPacketsSentPerHost : Average number of packets sent per host
  42. - avgBandwidthIn : Average incoming bandwidth
  43. - avgBandwidthOut : Average outgoing bandwidth
  44. :return: a dictionary of keys (see above) and their respective values
  45. """
  46. return [r for r in dict_gen(
  47. self.cursor.execute('SELECT * FROM file_statistics'))][0]
  48. def get_db_exists(self):
  49. """
  50. :return: True if the database was already existent, otherwise False
  51. """
  52. return self.existing_db
  53. @staticmethod
  54. def _get_selector_keywords():
  55. """
  56. :return: a list of selector keywords
  57. """
  58. return ['most_used', 'least_used', 'avg', 'all']
  59. @staticmethod
  60. def _get_parametrized_selector_keywords():
  61. """
  62. :return: a list of parameterizable selector keywords
  63. """
  64. return ['ipaddress', 'macaddress']
  65. @staticmethod
  66. def _get_extractor_keywords():
  67. """
  68. :return: a list of extractor keywords
  69. """
  70. return ['random', 'first', 'last']
  71. def get_all_named_query_keywords(self):
  72. """
  73. :return: a list of all named query keywords, used to identify named queries
  74. """
  75. return (
  76. self._get_selector_keywords() + self._get_parametrized_selector_keywords() + self._get_extractor_keywords())
  77. @staticmethod
  78. def get_all_sql_query_keywords():
  79. """
  80. :return: a list of all supported SQL keywords, used to identify SQL queries
  81. """
  82. return ["select", "insert"]
  83. def _process_user_defined_query(self, query_string: str, query_parameters: tuple = None):
  84. """
  85. Takes as input a SQL query query_string and optional a tuple of parameters which are marked by '?' in the query
  86. and later substituted.
  87. :param query_string: The query to execute
  88. :param query_parameters: The tuple of parameters to inject into the query
  89. :return: the results of the query
  90. """
  91. if query_parameters is not None:
  92. self.cursor.execute(query_string, query_parameters)
  93. else:
  94. self.cursor.execute(query_string)
  95. self.database.commit()
  96. return self.cursor.fetchall()
  97. def get_field_types(self, *table_names):
  98. """
  99. Creates a dictionary whose keys are the fields of the given table(s) and whose values are the appropriate field
  100. types, like TEXT for strings and REAL for float numbers.
  101. :param table_names: The name of table(s)
  102. :return: a dictionary of {field_name : field_type} for fields of all tables
  103. """
  104. dic = {}
  105. for table in table_names:
  106. self.cursor.execute("PRAGMA table_info('%s')" % table)
  107. results = self.cursor.fetchall()
  108. for field in results:
  109. dic[field[1].lower()] = field[2]
  110. return dic
  111. def named_query_parameterized(self, keyword: str, param_op_val: list):
  112. """
  113. Executes a parameterizable named query.
  114. :param keyword: The query to be executed, like ipaddress or macadress
  115. :param param_op_val: A list consisting of triples with (parameter, operator, value)
  116. :return: the results of the executed query
  117. """
  118. named_queries = {
  119. "ipaddress": "SELECT DISTINCT ip_statistics.ipAddress from ip_statistics INNER JOIN ip_mac, ip_ttl, ip_ports, ip_protocols ON ip_statistics.ipAddress=ip_mac.ipAddress AND ip_statistics.ipAddress=ip_ttl.ipAddress AND ip_statistics.ipAddress=ip_ports.ipAddress AND ip_statistics.ipAddress=ip_protocols.ipAddress WHERE ",
  120. "macaddress": "SELECT DISTINCT macAddress from ip_mac WHERE "}
  121. query = named_queries.get(keyword)
  122. field_types = self.get_field_types('ip_mac', 'ip_ttl', 'ip_ports', 'ip_protocols', 'ip_statistics', 'ip_mac')
  123. conditions = []
  124. for key, op, value in param_op_val:
  125. # this makes sure that TEXT fields are queried by strings,
  126. # e.g. ipAddress=192.168.178.1 --is-converted-to--> ipAddress='192.168.178.1'
  127. if field_types.get(key) == 'TEXT':
  128. if not str(value).startswith("'") and not str(value).startswith('"'):
  129. value = "'" + value + "'"
  130. # this replacement is required to remove ambiguity in SQL query
  131. if key == 'ipAddress':
  132. key = 'ip_mac.ipAddress'
  133. conditions.append(key + op + str(value))
  134. where_clause = " AND ".join(conditions)
  135. query += where_clause
  136. self.cursor.execute(query)
  137. return self.cursor.fetchall()
  138. def _process_named_query(self, query_param_list):
  139. """
  140. Executes a named query.
  141. :param query_param_list: A query list consisting of (keyword, params), e.g. [(most_used, ipAddress), (random,)]
  142. :return: the result of the query
  143. """
  144. # Definition of SQL queries associated to named queries
  145. named_queries = {
  146. "most_used.ipaddress": "SELECT ipAddress FROM ip_statistics WHERE (pktsSent+pktsReceived) == (SELECT MAX(pktsSent+pktsReceived) from ip_statistics) ORDER BY ipAddress ASC",
  147. "most_used.macaddress": "SELECT macAddress FROM (SELECT macAddress, COUNT(*) as occ from ip_mac GROUP BY macAddress) WHERE occ=(SELECT COUNT(*) as occ from ip_mac GROUP BY macAddress ORDER BY occ DESC LIMIT 1) ORDER BY macAddress ASC",
  148. "most_used.portnumber": "SELECT portNumber FROM ip_ports GROUP BY portNumber HAVING COUNT(portNumber)=(SELECT MAX(cntPort) from (SELECT portNumber, COUNT(portNumber) as cntPort FROM ip_ports GROUP BY portNumber)) ORDER BY portNumber ASC",
  149. "most_used.protocolname": "SELECT protocolName FROM ip_protocols GROUP BY protocolName HAVING COUNT(protocolCount)=(SELECT COUNT(protocolCount) as cnt FROM ip_protocols GROUP BY protocolName ORDER BY cnt DESC LIMIT 1) ORDER BY protocolName ASC",
  150. "most_used.ttlvalue": "SELECT ttlValue FROM (SELECT ttlValue, SUM(ttlCount) as occ FROM ip_ttl GROUP BY ttlValue) WHERE occ=(SELECT SUM(ttlCount) as occ FROM ip_ttl GROUP BY ttlValue ORDER BY occ DESC LIMIT 1) ORDER BY ttlValue ASC",
  151. "most_used.mssvalue": "SELECT mssValue FROM (SELECT mssValue, SUM(mssCount) as occ FROM tcp_mss GROUP BY mssValue) WHERE occ=(SELECT SUM(mssCount) as occ FROM tcp_mss GROUP BY mssValue ORDER BY occ DESC LIMIT 1) ORDER BY mssValue ASC",
  152. "most_used.winsize": "SELECT winSize FROM (SELECT winSize, SUM(winCount) as occ FROM tcp_win GROUP BY winSize) WHERE occ=(SELECT SUM(winCount) as occ FROM tcp_win GROUP BY winSize ORDER BY occ DESC LIMIT 1) ORDER BY winSize ASC",
  153. "most_used.ipclass": "SELECT ipClass FROM (SELECT ipClass, COUNT(*) as occ from ip_statistics GROUP BY ipClass ORDER BY occ DESC) WHERE occ=(SELECT COUNT(*) as occ from ip_statistics GROUP BY ipClass ORDER BY occ DESC LIMIT 1) ORDER BY ipClass ASC",
  154. #FIXME ORDER BY ASC ? check queries for os dependency!!
  155. "least_used.ipaddress": "SELECT ipAddress FROM ip_statistics WHERE (pktsSent+pktsReceived) == (SELECT MIN(pktsSent+pktsReceived) from ip_statistics) ORDER BY ipAddress ASC",
  156. "least_used.macaddress": "SELECT macAddress FROM (SELECT macAddress, COUNT(*) as occ from ip_mac GROUP BY macAddress) WHERE occ=(SELECT COUNT(*) as occ from ip_mac GROUP BY macAddress ORDER BY occ ASC LIMIT 1) ORDER BY macAddress ASC",
  157. "least_used.portnumber": "SELECT portNumber FROM ip_ports GROUP BY portNumber HAVING COUNT(portNumber)=(SELECT MIN(cntPort) from (SELECT portNumber, COUNT(portNumber) as cntPort FROM ip_ports GROUP BY portNumber)) ORDER BY portNumber ASC",
  158. "least_used.protocolname": "SELECT protocolName FROM ip_protocols GROUP BY protocolName HAVING COUNT(protocolCount)=(SELECT COUNT(protocolCount) as cnt FROM ip_protocols GROUP BY protocolName ORDER BY cnt ASC LIMIT 1) ORDER BY protocolName ASC",
  159. "least_used.ttlvalue": "SELECT ttlValue FROM (SELECT ttlValue, SUM(ttlCount) as occ FROM ip_ttl GROUP BY ttlValue) WHERE occ=(SELECT SUM(ttlCount) as occ FROM ip_ttl GROUP BY ttlValue ORDER BY occ ASC LIMIT 1) ORDER BY ttlValue ASC",
  160. "least_used.mssvalue": "SELECT mssValue FROM (SELECT mssValue, SUM(mssCount) as occ FROM tcp_mss GROUP BY mssValue) WHERE occ=(SELECT SUM(mssCount) as occ FROM tcp_mss GROUP BY mssValue ORDER BY occ ASC LIMIT 1) ORDER BY mssValue ASC",
  161. "least_used.winsize": "SELECT winSize FROM (SELECT winSize, SUM(winCount) as occ FROM tcp_win GROUP BY winSize) WHERE occ=(SELECT SUM(winCount) as occ FROM tcp_win GROUP BY winSize ORDER BY occ ASC LIMIT 1) ORDER BY winSize ASC",
  162. "avg.pktsreceived": "SELECT avg(pktsReceived) from ip_statistics",
  163. "avg.pktssent": "SELECT avg(pktsSent) from ip_statistics",
  164. "avg.kbytesreceived": "SELECT avg(kbytesReceived) from ip_statistics",
  165. "avg.kbytessent": "SELECT avg(kbytesSent) from ip_statistics",
  166. "avg.ttlvalue": "SELECT avg(ttlValue) from ip_ttl",
  167. "avg.mss": "SELECT avg(mssValue) from tcp_mss",
  168. "all.ipaddress": "SELECT ipAddress from ip_statistics ORDER BY ipAddress ASC",
  169. "all.ttlvalue": "SELECT DISTINCT ttlValue from ip_ttl ORDER BY ttlValue ASC",
  170. "all.mss": "SELECT DISTINCT mssValue from tcp_mss ORDER BY mssValue ASC",
  171. "all.macaddress": "SELECT DISTINCT macAddress from ip_mac ORDER BY macAddress ASC",
  172. "all.portnumber": "SELECT DISTINCT portNumber from ip_ports ORDER BY portNumber ASC",
  173. "all.protocolname": "SELECT DISTINCT protocolName from ip_protocols ORDER BY protocolName ASC"}
  174. # Retrieve values by selectors, if given, reduce results by extractor
  175. last_result = 0
  176. for q in query_param_list:
  177. # if selector, like avg, ttl, is given
  178. if any(e in q[0] for e in self._get_selector_keywords()):
  179. (keyword, param) = q
  180. query = named_queries.get(keyword + "." + param)
  181. self.cursor.execute(str(query))
  182. last_result = self.cursor.fetchall()
  183. # if selector is parametrized, i.e. ipAddress(mac=AA:BB:CC:DD:EE) or macAddress(ipAddress=192.168.178.1)
  184. elif any(e in q[0] for e in self._get_parametrized_selector_keywords()) and any(
  185. o in q[1] for o in ["<", "=", ">", "<=", ">="]):
  186. (keyword, param) = q
  187. # convert string 'paramName1<operator1>paramValue1,paramName2<operator2>paramValue2,...' into list of triples
  188. param_op_val = [(key, op, value) for (key, op, value) in
  189. [re.split("(<=|>=|>|<|=)", x) for x in param.split(",")]]
  190. last_result = self.named_query_parameterized(keyword, param_op_val)
  191. # if extractor, like random, first, last, is given
  192. elif any(e in q[0] for e in self._get_extractor_keywords()) and (
  193. isinstance(last_result, list) or isinstance(last_result, tuple)):
  194. extractor = q[0]
  195. if extractor == 'random':
  196. index = randint(a=0, b=len(last_result) - 1)
  197. last_result = last_result[index]
  198. elif extractor == 'first':
  199. last_result = last_result[0]
  200. elif extractor == 'last':
  201. last_result = last_result[-1]
  202. return last_result
  203. def process_db_query(self, query_string_in: str, print_results=False, sql_query_parameters: tuple = None):
  204. """
  205. Processes a database query. This can either be a standard SQL query or a named query (predefined query).
  206. :param query_string_in: The string containing the query
  207. :param print_results: Indicated whether the results should be printed to terminal (True) or not (False)
  208. :param sql_query_parameters: Parameters for the SQL query (optional)
  209. :return: the results of the query
  210. """
  211. named_query_keywords = self.get_all_named_query_keywords()
  212. # Clean query_string
  213. query_string = query_string_in.lower().lstrip()
  214. # query_string is a user-defined SQL query
  215. result = None
  216. if sql_query_parameters is not None or query_string.startswith("select") or query_string.startswith("insert"):
  217. result = self._process_user_defined_query(query_string, sql_query_parameters)
  218. # query string is a named query -> parse it and pass it to statisticsDB
  219. elif any(k in query_string for k in named_query_keywords) and all(k in query_string for k in ['(', ')']):
  220. # Clean query_string
  221. query_string = query_string.replace(" ", "")
  222. # Validity check: Brackets
  223. brackets_open, brackets_closed = query_string.count("("), query_string.count(")")
  224. if not (brackets_open == brackets_closed):
  225. sys.stderr.write("Bracketing of given query '" + query_string + "' is incorrect.")
  226. # Parse query string into [ (query_keyword1, query_params1), ... ]
  227. delimiter_start, delimiter_end = "(", ")"
  228. kplist = []
  229. current_word = ""
  230. for char in query_string: # process characters one-by-one
  231. # if char is no delimiter, add char to current_word
  232. if char != delimiter_end and char != delimiter_start:
  233. current_word += char
  234. # if a start delimiter was found and the current_word so far is a keyword, add it to kplist
  235. elif char == delimiter_start:
  236. if current_word in named_query_keywords:
  237. kplist.append((current_word,))
  238. current_word = ""
  239. else:
  240. print("ERROR: Unrecognized keyword '" + current_word + "' found. Ignoring query.")
  241. return
  242. # else if characeter is end delimiter and there were no two directly following ending delimiters,
  243. # the current_word must be the parameters of an earlier given keyword
  244. elif char == delimiter_end and len(current_word) > 0:
  245. kplist[-1] += (current_word,)
  246. current_word = ""
  247. result = self._process_named_query(kplist[::-1])
  248. else:
  249. sys.stderr.write(
  250. "Query invalid. Only named queries and SQL SELECT/INSERT allowed. Please check the query's syntax!\n")
  251. return
  252. # If result is tuple/list with single element, extract value from list
  253. requires_extraction = (isinstance(result, list) or isinstance(result, tuple)) and len(result) == 1 and \
  254. (not isinstance(result[0], tuple) or len(result[0]) == 1)
  255. while requires_extraction:
  256. if isinstance(result, list) or isinstance(result, tuple):
  257. result = result[0]
  258. else:
  259. requires_extraction = False
  260. # If tuple of tuples or list of tuples, each consisting of single element is returned,
  261. # then convert it into list of values, because the returned colum is clearly specified by the given query
  262. if (isinstance(result, tuple) or isinstance(result, list)) and all(len(val) == 1 for val in result):
  263. result = [c for c in result for c in c]
  264. # Print results if option print_results is True
  265. if print_results:
  266. if len(result) == 1 and isinstance(result, list):
  267. result = result[0]
  268. print("Query returned 1 record:\n")
  269. for i in range(0, len(result)):
  270. print(str(self.cursor.description[i][0]) + ": " + str(result[i]))
  271. else:
  272. self._print_query_results(query_string_in, result)
  273. return result
  274. def _print_query_results(self, query_string_in: str, result):
  275. """
  276. Prints the results of a query.
  277. Based on http://stackoverflow.com/a/20383011/3017719.
  278. :param query_string_in: The query the results belong to
  279. :param result: The results of the query
  280. """
  281. # Print number of results according to type of result
  282. if isinstance(result, list):
  283. print("Query returned " + str(len(result)) + " records:\n")
  284. else:
  285. print("Query returned 1 record:\n")
  286. # Print query results
  287. if query_string_in.lstrip().upper().startswith(
  288. "SELECT") and result is not None and self.cursor.description is not None:
  289. widths = []
  290. columns = []
  291. tavnit = '|'
  292. separator = '+'
  293. for cd in self.cursor.description:
  294. widths.append(len(cd) + 10)
  295. columns.append(cd[0])
  296. for w in widths:
  297. tavnit += " %-" + "%ss |" % (w,)
  298. separator += '-' * w + '--+'
  299. print(separator)
  300. print(tavnit % tuple(columns))
  301. print(separator)
  302. if isinstance(result, list):
  303. for row in result:
  304. print(tavnit % row)
  305. else:
  306. print(tavnit % result)
  307. print(separator)
  308. else:
  309. print(result)