@@ -0,0 +1,207 @@
+
+
+import hashlib
+import json
+from datetime import datetime, timedelta
+from random import choice
+
+import click
+import pandas as pd
+from sqlalchemy import delete, func, select, update
+
+from . import app
+from .kshape_filter import KShapeFilter
+from .models import Cluster_result, Nym_info, User_info, db
+from .util import Constants, cnt_cluster
+from .update_ts import fill_ts
+
+class Clustering:
+
+    def __init__(self):
+        self.user_list = {}
+        self.time_series = pd.DataFrame()
+        self.delay_user_list = []
+
+    def get_user(self, df):
+        """Extract the user ids from the received dataframe"""
+        users = df['user_id'].drop_duplicates().values.tolist()
+        return users
+
+    def update_user(self, user_list):
+        """Add new users to user_list"""
+        user = dict(zip(user_list, [0]*len(user_list)))
+        self.user_list.update(user)
+
+    def update_time_series(self, df):
+        """Add a new round of data to the time series"""
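+        # values from the new round take priority; gaps fall back to the existing series, then fill with 0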
+        self.time_series = df.combine_first(self.time_series).fillna(0)
+
+    def rnd_time_series(self, df, users):
+        """Filter out the time series of the users active in this round"""
+        rnd_df = df.loc[users]
+        print(rnd_df.shape)
+        return rnd_df
+
+    def delay_user(self, df, model, cnt_cluster):
+        """
+        Filter out users who are in small clusters
+        :param cnt_cluster: per-cluster user counts
+        :param model: fitted cluster model
+        """
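+        # clusters with at most CLUSTER_SIZE members are treated as too small; their users are delayed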
+        cluster_s = cnt_cluster.loc[cnt_cluster['count'] <= Constants.CLUSTER_SIZE].index.tolist()
+        new_df = pd.DataFrame()
+        for i in cluster_s:
+            filter_df = df[model.labels_ == i]
+            new_df = pd.concat([filter_df, new_df], ignore_index=False)
+        delay_user = new_df.index.tolist()
+        if delay_user:
+            print(f"{len(delay_user)} messages have been delayed and will join the next round")
+        return delay_user
+
+    def ts_cluster(self, data):
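+        """Cluster the time series of this round's active and delayed users with k-Shape"""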
+        users = self.get_user(data)
+        rnd_users = self.delay_user_list + users
+        print(f"Participating users: {len(users)} active, {len(self.delay_user_list)} delayed")
+        df = fill_ts(data)
+        self.update_time_series(df)
+        rnd_df = self.rnd_time_series(self.time_series, rnd_users)
+
+        ks, y_pred = KShapeFilter.k_shape(rnd_df, Constants.CLUSTER_NUM)
+        cnt = cnt_cluster(ks)
+
+        self.delay_user_list = self.delay_user(rnd_df, ks, cnt)
+
+    def anonymity_simulation(self, date, hour, random=True):
+        """
+        Simulate an anonymity attack: select one user (randomly or by a fixed id) as the attack target
+        :param date: date
+        :param hour: hour
+        :return: the targeted user id
+        """
+        print("attack:", date, hour)
+        start_time = datetime.strptime(f"{date} {hour}", "%Y-%m-%d %H")
+
+        if not random:
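+            # fixed target user id used when random selection is disabled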
+            attack_user_target = "537073336dcd41ff4f362d111888907c"
+        else:
+            res = db.session.execute(
+                select(User_info.user_id)
+                .where(User_info.timestamp <= start_time.strftime("%Y-%m-%d %H:%M:%S"))
+                .group_by(User_info.user_id)
+            )
+
+            user_list_json = list(
+                map(
+                    lambda x: {
+                        "user_id": x,
+                    },
+                    res.scalars().all(),
+                )
+            )
+
+            user_list = [item["user_id"] for item in user_list_json]
+            attack_user_target = choice(user_list)
+
+        return attack_user_target
+
+    def clustering(self, date, hour, with_attack=0, with_random=1):
+        """Run one round of clustering for the given date and hour"""
+        print(date, hour)
+        start_time = datetime.strptime(f"{date} {hour}", "%Y-%m-%d %H")
+        end_time = start_time + timedelta(hours=1)
+        res = db.session.execute(
+            select(User_info)
+            .where(User_info.timestamp >= start_time.strftime("%Y-%m-%d %H:%M:%S"))
+            .where(User_info.timestamp < end_time.strftime("%Y-%m-%d %H:%M:%S"))
+        )
+
+        user_list = list(
+            map(
+                lambda x: {
+                    "user_id": x.user_id,
+                    "timestamp": x.timestamp,
+                },
+                res.scalars().all(),
+            )
+        )
+
+        print(f"{len(user_list)} users are selected")
+
+        for user in user_list:
+            user_id = user["user_id"]
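+            # derive a pseudonym (nym) for the user by hashing the user id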
+            nym = hashlib.md5(user_id.encode("utf-8")).hexdigest()
+            user_nym = Nym_info(timestamp=user["timestamp"], nym=nym)
+            db.session.merge(user_nym)
+            db.session.commit()
+
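+        # run the k-Shape filter for this hour: load the users, extract features, and split them into normal / abnormal users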
+        k_filter = KShapeFilter(iter_hour=f"{date} {hour}")
+        k_filter.load_data(user_list)
+        k_filter.feature_extract()
+        normal_user, abnormal_user = k_filter.train_predict()
+
+        if with_attack == 1:
+            target_user = self.anonymity_simulation(date, hour, random=with_random)
+
+            if target_user not in normal_user:
+                print(f"[Warning!] Anonymity attack, user: {target_user}")
+                normal_user.append(target_user)
+                abnormal_user = [item for item in abnormal_user if item != target_user]
+
+            stmt = delete(Nym_info).where(Nym_info.nym.in_(normal_user))
+            db.session.execute(stmt)
+            db.session.commit()
+        else:
+            stmt = delete(Nym_info).where(
+                Nym_info.nym.notin_(normal_user)
+            )
+            db.session.execute(stmt)
+            db.session.commit()
+
+        stmt = (
+            update(Nym_info)
+            .where(Nym_info.nym.in_(abnormal_user))
+        )
+        print(stmt)
+        result = db.session.execute(stmt)
+        db.session.commit()
+
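+        # count the pseudonyms (fictitious users) that remain after the deletions above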
+        nyms_count = db.session.query(
+            func.count(Nym_info.nym)
+        ).scalar()
+        print(f"fictitious user count is: {nyms_count}")
+        cluster_result = Cluster_result(
+            timestamp=start_time,
+            normal_user=json.dumps(normal_user),
+            abnormal_user=json.dumps(abnormal_user),
+            normal_user_count=len(normal_user),
+            abnormal_user_count=len(abnormal_user),
+            nyms_count=nyms_count,
+        )
+
+        db.session.merge(cluster_result)
+        db.session.commit()
+        print(f"{len(abnormal_user)} abnormal users")
+        return {"status": 1}
+
+
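+# Flask CLI entry point. Usage: flask clustering <date> <hour> -a <attack> -r <random>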
+@app.cli.command("clustering")
+@click.argument("date")
+@click.argument("hour")
+@click.option("-a", "--attack", required=True, type=int)
+@click.option("-r", "--random", required=True, type=int)
+def clustering_by_date_hour(date, hour, attack=0, random=1):
+    print(date, hour)
+    print(f"is attack?: {attack}")
+    Clustering().clustering(date, hour, with_attack=attack, with_random=random)
+    return None