import time

import numpy as np
import pandas as pd
from tslearn.clustering import KShape

from clustering.util import Constants


def json2df(json_list):
    """Convert incoming JSON records into a DataFrame for clustering.

    :param json_list: list of JSON records holding the data currently being
        processed; passed unchanged to ``pd.DataFrame``
    :return: ``pd.DataFrame`` built from ``json_list``
    """
    return pd.DataFrame(json_list)


class KShapeFilter:
    """Split the users seen in one hour into normal/abnormal via k-Shape.

    Usage order: ``load_data`` -> ``feature_extract`` -> ``train_predict``.
    A new instance is needed for every round (hour).
    """

    # Length of each per-user series: one slot per second of the hour.
    _SECONDS_PER_HOUR = 3600

    def __init__(self, iter_hour: str):
        """Initialize the filter for a single hour.

        :param iter_hour: string, %Y-%m-%d %H
        """
        self.start_time = time.time()
        self.hour = iter_hour

    def load_data(self, json_list):
        """Load the raw records and tag every row with the hour of this round.

        :param json_list: list of JSON records; ``feature_extract`` later
            reads their ``timestamp`` and ``user_id`` fields
        """
        self.train_data = pd.DataFrame(json_list)
        self.train_data["hour"] = self.hour

    def feature_extract(self):
        """Build one binary per-second activity series for every user.

        Sets ``self.user_list`` (unique user ids in order of first appearance)
        and ``self.train_X`` with shape ``(n_users, 3600, 1)``; an entry is 1.0
        when the user has at least one event in that second of the hour and
        0.0 otherwise. Also adds a ``time`` column to ``self.train_data``.

        Events outside the hour are ignored. Several events falling in the
        same second count once, and timestamps with fractional seconds are
        assigned to the second they fall in.
        """
        n_seconds = self._SECONDS_PER_HOUR

        if self.train_data.empty:
            # Nothing to featurize: expose empty but well-shaped results.
            self.user_list = np.array([], dtype=object)
            self.train_X = np.zeros((0, n_seconds, 1))
            return

        # "timestamp" is interpreted as seconds since the epoch.
        self.train_data["time"] = pd.to_datetime(
            self.train_data["timestamp"], unit="s"
        )
        # store user information
        self.user_list = self.train_data["user_id"].unique()

        # Work on a clean positional index so the masks below align row by row.
        df = self.train_data.reset_index(drop=True)
        window_start = pd.Timestamp(df["hour"].iloc[0] + ":00:00")

        # Offset of every event from the start of the hour, in seconds.
        elapsed = (df["time"] - window_start).dt.total_seconds()
        # NaT timestamps yield NaN, which fails both comparisons and is dropped.
        in_window = (elapsed >= 0) & (elapsed < n_seconds)
        second_of_hour = np.floor(elapsed[in_window]).astype(np.intp)

        features = np.zeros((len(self.user_list), n_seconds))
        row_of_user = {user: row for row, user in enumerate(self.user_list)}
        # Index assignment (rather than a merge) keeps every series at exactly
        # 3600 points even when a user has duplicate timestamps.
        grouped = second_of_hour.groupby(df.loc[in_window, "user_id"], sort=False)
        for user, seconds in grouped:
            features[row_of_user[user], seconds.to_numpy()] = 1.0

        self.train_X = features.reshape(-1, n_seconds, 1)

    def train_predict(self, thresholds=0.05, num_cluster=Constants.CLUSTER_NUM):
        """Cluster the users and flag those that fall into small clusters.

        A cluster is abnormal when its share of all users is less than or
        equal to ``thresholds``. Results are also stored on ``self.res``,
        ``self.normal_user`` and ``self.abnormal_user``.

        :param thresholds: maximum user share for a cluster to be abnormal
        :param num_cluster: number of clusters passed to k-Shape
        :return: tuple ``(normal_user, abnormal_user)`` of user-id lists
        """
        if len(self.user_list) == 0:
            # No users means nothing to cluster; report empty results.
            self.res = pd.DataFrame({"user_id": [], "predict_label": []})
            self.normal_user = []
            self.abnormal_user = []
            return self.normal_user, self.abnormal_user

        _, y_pred = self.k_shape(self.train_X, num_cluster)
        self.res = pd.DataFrame({"user_id": self.user_list})
        self.res["predict_label"] = y_pred

        cluster_count = self.res["predict_label"].value_counts()
        print(cluster_count)

        cluster_share = cluster_count / len(self.user_list)
        abnormal_labels = cluster_share[cluster_share <= thresholds].index.tolist()

        is_abnormal = self.res["predict_label"].isin(abnormal_labels)
        self.abnormal_user = self.res.loc[is_abnormal, "user_id"].tolist()
        self.normal_user = self.res.loc[~is_abnormal, "user_id"].tolist()
        return self.normal_user, self.abnormal_user

    @staticmethod
    def k_shape(data, num_cluster):
        """Run k-Shape clustering.

        :param data: time series dataset handed to ``KShape.fit_predict``
        :param num_cluster: number of clusters
        :return: tuple ``(fitted KShape estimator, cluster labels)``
        """
        # A fixed integer seed keeps runs reproducible. The previous
        # ``np.random.seed(0)`` argument evaluated to None and reseeded
        # NumPy's global RNG as a side effect.
        ks = KShape(n_clusters=num_cluster, verbose=True, random_state=0)
        y_pred = ks.fit_predict(data)
        return ks, y_pred