#88 Improvements for named queries

Merged
carlos.garcia merged 11 commits from stefan.schmidt/pyparsing_improved into SPIN/master 6 years ago
4 changed files with 130 additions and 21 deletions
  1. 18 2
      code/Core/Controller.py
  2. 59 8
      code/Core/QueryParser.py
  3. 52 10
      code/Core/StatsDatabase.py
  4. 1 1
      code/Test/test_Queries.py

+ 18 - 2
code/Core/Controller.py

@@ -8,6 +8,7 @@ import Core.LabelManager as LabelManager
 import Core.Statistics as Statistics
 import ID2TLib.PcapFile as PcapFile
 import ID2TLib.Utility as Util
+import Core.StatsDatabase as StatsDB
 
 
 class Controller:
@@ -176,7 +177,7 @@ class Controller:
             print()
         elif param == "least_used":
             print("least_used can be used as a selector for the following attributes:")
-            print("ipAddress | macAddress | portNumber | protocolName | ttlValue")
+            print("ipAddress | macAddress | portNumber | protocolName | ttlValue | mssValue | winSize | ipClass")
             print()
         elif param == "avg":
             print("avg can be used as a selector for the following attributes:")
@@ -184,7 +185,7 @@ class Controller:
             print()
         elif param == "all":
             print("all can be used as a selector for the following attributes:")
-            print("ipAddress | ttlValue | mss | macAddress | portNumber | protocolName")
+            print("ipAddress | ttlValue | mss | macAddress | portNumber | protocolName | winSize | ipClass")
             print()
         elif param in ["random", "first", "last"]:
             print("No additional info available for this keyword.")
@@ -197,6 +198,14 @@ class Controller:
                   "macAddress | ttlValue | ttlCount | portDirection | portNumber | portCount | protocolCount\n"
                   "protocolName")
             print()
+            print("The following operators can be used:")
+            print("<= | < | = | >= | > | in")
+            print()
+            print("A value can either be a simple values, a list of simple values separated by commas and enclosed "
+                  "in [] brackets, or another query.")
+            print()
+            print("When VALUE is a list (or a query returning a list), the usage of the 'in' operator is mandatory!")
+            print()
             print("See 'help examples;' for usage examples.")
             print()
         elif param == "macaddress":
@@ -205,6 +214,8 @@ class Controller:
             print("The following parameters can be specified:")
             print("ipAddress")
             print()
+            print("See 'help ipAddress' for information on valid operators and values.")
+            print()
             print("See 'help examples;' for usage examples.")
             print()
         elif param == "examples":
@@ -218,6 +229,8 @@ class Controller:
             print("\tSELECT avg(ttlValue) from ip_ttl;")
             print("Get a random IP address from all addresses that sent and received at least 10 packets:")
             print("\trandom(ipAddress(pktsSent > 10, pktsReceived > 10));")
+            print("Get the IP addresses used with one of the MAC addresses in a list:")
+            print("\tipAddress(macAddress in [08:00:27:a3:83:43, 52:54:00:12:35:02]);")
             print()
         else:
             print("Unknown keyword '" + param + "', try 'help;' to get a list of allowed keywords'")
@@ -291,6 +304,9 @@ class Controller:
                         for i in range(1, e.col):
                             sys.stderr.write(" ")
                         sys.stderr.write("^\n\n")
+                    except StatsDB.QueryExecutionException as e:
+                        sys.stderr.write("An error occured: ")
+                        sys.stderr.write(e.args[0] + "\n")
                 buffer = ""
 
         readline.set_history_length(1000)

+ 59 - 8
code/Core/QueryParser.py

@@ -3,24 +3,75 @@ import pyparsing as pp
 
 class QueryParser:
     def __init__(self):
+        """
+        Constructs a parser for all named queries using PyParsing.
+        """
         extractor = pp.Keyword("random") ^ pp.Keyword("first") ^ pp.Keyword("last")
-        selector = pp.Keyword("most_used") ^ pp.Keyword("least_used") ^ pp.Keyword("avg") ^ pp.Keyword("all")
-        attribute = pp.Keyword("ipaddress") ^ pp.Keyword("macaddress") ^ pp.Keyword("portnumber") ^ pp.Keyword("protocolname") ^ pp.Keyword("ttlvalue") ^ pp.Keyword("mssvalue") ^ pp.Keyword("winsize") ^ pp.Keyword("ipclass") ^ pp.Keyword("pktssent") ^ pp.Keyword("pktsreceived") ^ pp.Keyword("mss") ^ pp.Keyword("kbytesreceived") ^ pp.Keyword("kbytessent")
-        simple_selector_query = selector + pp.Suppress("(") + attribute + pp.Suppress(")")
 
-        param_selectors = pp.Keyword("ipaddress").setParseAction(pp.replaceWith("ipaddress_param")) ^ pp.Keyword("macaddress").setParseAction(pp.replaceWith("macaddress_param"))
-        operators = pp.Literal("=") ^ pp.Literal("<=") ^ pp.Literal("<") ^ pp.Literal(">=") ^ pp.Literal(">")
+        # Valid selectors - except "avg", because not all attributes can be combined with it
+        selector_no_avg = pp.Keyword("most_used") ^ pp.Keyword("least_used") ^ pp.Keyword("all")
+
+        # All attributes that cannot be combined with "avg"
+        attributes_no_avg = pp.Keyword("ipaddress") ^ pp.Keyword("macaddress") ^ pp.Keyword("portnumber") ^\
+                            pp.Keyword("protocolname") ^ pp.Keyword("winsize") ^ pp.Keyword("ipclass")
+
+        # All attributes that can be combined with "avg"
+        attributes_avg = pp.Keyword("ttlvalue") ^ pp.Keyword("mssvalue") ^\
+                         pp.Keyword("pktssent") ^ pp.Keyword("pktsreceived") ^ pp.Keyword("mss") ^\
+                         pp.Keyword("kbytesreceived") ^ pp.Keyword("kbytessent")
+
+        # Collection of all attributes for simpler specification
+        attributes_all = attributes_no_avg ^ attributes_avg
+
+        # Simple selector + attribute query, only allowing "avg" with compatible attributes
+        simple_selector_query = (selector_no_avg + pp.Suppress("(") + attributes_all + pp.Suppress(")")) ^\
+                                (pp.Keyword("avg") + pp.Suppress("(") + attributes_avg + pp.Suppress(")"))
+
+        # Selectors for parameterized queries - they are replaced in the result to avoid ambiguity
+        param_selectors = pp.Keyword("ipaddress").setParseAction(pp.replaceWith("ipaddress_param")) ^\
+                          pp.Keyword("macaddress").setParseAction(pp.replaceWith("macaddress_param"))
+
+        # All operators allowed in parameterized queries
+        operators = pp.Literal("<=") ^ pp.Literal("<") ^ pp.Literal("=") ^\
+                    pp.Literal(">=") ^ pp.Literal(">") ^ pp.CaselessLiteral("in")
+
+        # Placeholder for nesting in parameterized queries
         expr = pp.Forward()
-        comparison = pp.Group(attribute + operators + (pp.Word(pp.alphanums + ".:") ^ expr))
+
+        # Simple values for comparisons inside a parameterized query can be alphanumeric plus dot and colon
+        simple_value = pp.Word(pp.alphanums + ".:")
+
+        # Values in parameterized queries can either be simple values, or a list of them.
+        # If it's a list, we insert a "list"-token to be able to distinguish it
+        parameterized_value = simple_value ^\
+                              (pp.Suppress("[") + pp.Group(pp.Empty().addParseAction(pp.replaceWith('list')) +
+                               pp.delimitedList(simple_value)) + pp.Suppress("]"))
+
+        # One "attribute-operator-value" triplet for parameterized queries
+        comparison = pp.Group(attributes_all + operators + (parameterized_value ^ expr))
+
+        # A full parameterized query, consisting of a parameterized selector and a comma-separated list of comparisons
         parameterized_query = param_selectors + pp.Suppress("(") + pp.Group(pp.delimitedList(comparison)) + pp.Suppress(")")
-        # parameterized_query = param_selectors + pp.Suppress("(") + comparison + pp.Suppress(")")
 
+        # Combination of simple and parameterized queries
         all_selector_queries = (simple_selector_query ^ parameterized_query)
+
+        # All queries can be combined with an extractor
         extractor_selector_query = extractor + pp.Suppress("(") + all_selector_queries + pp.Suppress(")")
 
+        # Queries can be used with an extractor or without
         named_query = (extractor_selector_query ^ all_selector_queries)
+
+        # The placeholder can be replaced with any query
         expr << pp.Group(named_query)
+
+        # Make sure all queries end with a semicolon, and we're done
         self.full_query = named_query + pp.Suppress(";")
 
-    def parse_query(self, querystring):
+    def parse_query(self, querystring: str) -> pp.ParseResults:
+        """
+        Parses the passed query with a pre-constructed parser.
+        :param querystring: The named query to be executed
+        :return: A ParseResults-object, which essentially is a list of tokens
+        """
         return self.full_query.parseString(querystring)

+ 52 - 10
code/Core/StatsDatabase.py

@@ -25,6 +25,10 @@ def dict_gen(curs: sqlite3.Cursor):
             yield dict(zip(field_names, row))
 
 
+class QueryExecutionException(Exception):
+    pass
+
+
 class StatsDatabase:
     def __init__(self, db_path: str):
         """
@@ -168,18 +172,48 @@ class StatsDatabase:
         field_types = self.get_field_types('ip_mac', 'ip_ttl', 'ip_ports', 'ip_protocols', 'ip_statistics', 'ip_mac')
         conditions = []
         for key, op, value in param_op_val:
+            # Check whether the value is not a simple value, but another query (or list)
             if isinstance(value, pp.ParseResults):
-                # If we have another query instead of a direct value, execute and replace it
-                value = self._execute_query_list(value)[0][0]
+                if value[0] == "list":
+                    # We have a list, cut the token off and use the remaining elements
+                    value = value[1:]
+
+                    # Lists can only be used with "in"
+                    if op is not "in":
+                        raise QueryExecutionException("List values require the usage of the 'in' operator!")
+                else:
+                    # If we have another query instead of a direct value, execute and replace it
+                    rvalue = self._execute_query_list(value)
+
+                    # Do we have a comparison operator with a multiple-result query?
+                    if op is not "in" and value[0] in ['most_used', 'least_used', 'all', 'ipaddress_param',
+                                                       'macaddress_param']:
+                        raise QueryExecutionException("The extractor '" + value[0] +
+                                                      "' may return more than one result!")
+
+                    # Make value contain a simple list with the results of the query
+                    value = map(lambda x: str(x[0]), rvalue)
+            else:
+                # Make sure value is a list now to simplify handling
+                value = [value]
+
             # this makes sure that TEXT fields are queried by strings,
             # e.g. ipAddress=192.168.178.1 --is-converted-to--> ipAddress='192.168.178.1'
             if field_types.get(key) == 'TEXT':
-                if not str(value).startswith("'") and not str(value).startswith('"'):
-                    value = "'" + value + "'"
+                def ensure_string(x):
+                    if not str(x).startswith("'") and not str(x).startswith('"'):
+                        return "'" + x + "'"
+                    else:
+                        return x
+                value = map(ensure_string, value)
+
+            # If we have more than one value, join them together, separated by commas
+            value = ",".join(map(str, value))
+
             # this replacement is required to remove ambiguity in SQL query
             if key == 'ipAddress':
                 key = 'ip_mac.ipAddress'
-            conditions.append(key + op + str(value))
+            conditions.append(key + " " + op + " (" + str(value) + ")")
 
         where_clause = " AND ".join(conditions)
         query += where_clause
@@ -230,6 +264,9 @@ class StatsDatabase:
         "least_used.winsize": "SELECT winSize FROM (SELECT winSize, SUM(winCount) as occ FROM tcp_win GROUP BY "
                               "winSize) WHERE occ=(SELECT SUM(winCount) as occ FROM tcp_win GROUP BY winSize "
                               "ORDER BY occ ASC LIMIT 1) ORDER BY winSize ASC",
+        "least_used.ipclass": "SELECT ipClass FROM (SELECT ipClass, COUNT(*) as occ from ip_statistics GROUP BY "
+                             "ipClass ORDER BY occ DESC) WHERE occ=(SELECT COUNT(*) as occ from ip_statistics "
+                             "GROUP BY ipClass ORDER BY occ ASC LIMIT 1) ORDER BY ipClass ASC",
         "avg.pktsreceived": "SELECT avg(pktsReceived) from ip_statistics",
         "avg.pktssent": "SELECT avg(pktsSent) from ip_statistics",
         "avg.kbytesreceived": "SELECT avg(kbytesReceived) from ip_statistics",
@@ -241,27 +278,32 @@ class StatsDatabase:
         "all.mss": "SELECT DISTINCT mssValue from tcp_mss ORDER BY mssValue ASC",
         "all.macaddress": "SELECT DISTINCT macAddress from ip_mac ORDER BY macAddress ASC",
         "all.portnumber": "SELECT DISTINCT portNumber from ip_ports ORDER BY portNumber ASC",
-        "all.protocolname": "SELECT DISTINCT protocolName from ip_protocols ORDER BY protocolName ASC"}
+        "all.protocolname": "SELECT DISTINCT protocolName from ip_protocols ORDER BY protocolName ASC",
+        "all.winsize": "SELECT DISTINCT winSize FROM tcp_win ORDER BY winSize ASC",
+        "all.ipclass": "SELECT DISTINCT ipClass FROM ip_statistics ORDER BY ipClass ASC"}
 
     def _execute_query_list(self, query_list):
         """
         Recursively executes a list of named queries. They are of the following form:
-        ['macaddress_param', [['ipaddress', '=', ['most_used', 'ipaddress']]]]
+        ['macaddress_param', [['ipaddress', 'in', ['most_used', 'ipaddress']]]]
         :param query_list: The query statement list obtained from the query parser
         :return: The result of the query (either a single result or a list).
         """
         if query_list[0] == "random":
-            return rnd.choice(self._execute_query_list(query_list[1:]))
+            return [rnd.choice(self._execute_query_list(query_list[1:]))]
         elif query_list[0] == "first":
-            return self._execute_query_list(query_list[1:])[0]
+            return [self._execute_query_list(query_list[1:])[0]]
         elif query_list[0] == "last":
-            return self._execute_query_list(query_list[1:])[-1]
+            return [self._execute_query_list(query_list[1:])[-1]]
         elif query_list[0] == "macaddress_param":
             return self.named_query_parameterized("macaddress", query_list[1])
         elif query_list[0] == "ipaddress_param":
             return self.named_query_parameterized("ipaddress", query_list[1])
         else:
             query = self.named_queries.get(query_list[0] + "." + query_list[1])
+            if query is None:
+                raise QueryExecutionException("The requested query '" + query_list[0] + "(" + query_list[1] +
+                                              ")' was not found in the internal query list!")
             self.cursor.execute(str(query))
             last_result = self.cursor.fetchall()
             return last_result

+ 1 - 1
code/Test/test_Queries.py

@@ -235,4 +235,4 @@ class TestQueries(unittest.TestCase):
         self.assertEqual(controller.statistics.process_db_query('all(protocolname)'), ['IPv4', 'TCP', 'UDP'])
 
     def test_nested_query(self):
-        self.assertEqual(controller.statistics.process_db_query('macaddress(ipaddress=most_used(ipaddress))'), '08:00:27:a3:83:43')
+        self.assertEqual(controller.statistics.process_db_query('macaddress(ipaddress in most_used(ipaddress))'), '08:00:27:a3:83:43')