123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- import time
- import numpy as np
- import pandas as pd
- from tslearn.clustering import KShape
- from clustering.util import Constants
- # json to df
def json2df(json_list):
    """
    Convert a batch of incoming JSON records into a DataFrame for clustering.

    Input:
        json_list: the records of the batch currently being processed; the
            value is handed to ``pd.DataFrame`` unchanged.
    Output:
        The ``pd.DataFrame`` built from ``json_list``.
    """
    frame = pd.DataFrame(data=json_list)
    return frame
class KShapeFilter:
    """
    Cluster users by the shape of their per-second activity within one hour
    (k-Shape) and report users that fall into sparsely populated clusters.

    Call order: ``load_data`` -> ``feature_extract`` -> ``train_predict``.
    A fresh instance has to be created for every processed hour.
    """

    # Number of one-second slots in the one-hour feature window.
    SECONDS_PER_HOUR = 3600

    def __init__(self, iter_hour: str):
        """
        Initialize the filter for one hour of data.

        :param iter_hour: string, %Y-%m-%d %H
        """
        self.start_time = time.time()
        self.hour = iter_hour

    def load_data(self, json_list):
        """
        Build ``self.train_data`` from the raw records and tag every row with
        the hour this instance was created for.

        :param json_list: records accepted by ``pd.DataFrame``;
            ``feature_extract`` later reads their ``timestamp`` and
            ``user_id`` columns
        """
        self.train_data = pd.DataFrame(json_list)
        self.train_data.loc[:, "hour"] = self.hour

    def feature_extract(self):
        """
        Turn the loaded events into one 3600-slot indicator series per user.

        Sets ``self.user_list`` (unique user ids in order of first
        appearance) and ``self.train_X`` with shape
        ``(len(self.user_list), 3600, 1)``: slot ``s`` of a user's series is
        1.0 when that user has at least one event in second ``s`` of the hour
        ``self.hour`` (the value ``load_data`` writes to every row), else 0.0.
        Events outside that hour are ignored. A ``time`` column holding the
        parsed timestamps is added to ``self.train_data``.
        """
        event_time = pd.to_datetime(self.train_data.loc[:, "timestamp"], unit="s")
        self.train_data.loc[:, "time"] = event_time
        # store user information
        self.user_list = self.train_data.loc[:, "user_id"].unique()

        window_start = pd.Timestamp(self.hour + ":00:00")
        elapsed = (event_time - window_start).dt.total_seconds().to_numpy()
        # Floor so that sub-second timestamps still land in their second slot.
        slot = np.floor(elapsed)
        # NaN (unparseable timestamp) compares False and is dropped here too.
        in_window = (slot >= 0) & (slot < self.SECONDS_PER_HOUR)

        # Row of each event in the feature matrix == position of its user in
        # self.user_list.
        user_row = pd.Index(self.user_list).get_indexer(
            self.train_data.loc[:, "user_id"]
        )

        # Writing the indicator directly (instead of merging each user's
        # events onto a time grid) keeps every series exactly 3600 long even
        # when a user has several events within the same second.
        features = np.zeros((len(self.user_list), self.SECONDS_PER_HOUR))
        features[user_row[in_window], slot[in_window].astype(int)] = 1.0
        self.train_X = features.reshape([-1, self.SECONDS_PER_HOUR, 1])

    def train_predict(self, thresholds=0.05, num_cluster=Constants.CLUSTER_NUM):
        """
        Cluster ``self.train_X`` and split users into normal and abnormal.

        A cluster is abnormal when its share of all users is less than or
        equal to ``thresholds``; every user assigned to such a cluster is
        reported as abnormal. The per-cluster user counts are printed.

        :param thresholds: maximum share of users for a cluster to count as
            abnormal
        :param num_cluster: number of clusters passed to k-Shape
        :return: tuple ``(normal_user, abnormal_user)`` of user-id lists
        """
        _, y_pred = self.k_shape(self.train_X, num_cluster)
        self.res = pd.DataFrame({"user_id": self.user_list})
        self.res.loc[:, "predict_label"] = y_pred

        cluster_count = self.res.loc[:, "predict_label"].value_counts()
        print(cluster_count)

        cluster_share = cluster_count / len(self.user_list)
        abnormal_label_list = cluster_share[cluster_share <= thresholds].index.tolist()

        is_abnormal = self.res.loc[:, "predict_label"].isin(abnormal_label_list)
        self.abnormal_user = self.res.loc[is_abnormal, "user_id"].tolist()
        self.normal_user = self.res.loc[~is_abnormal, "user_id"].tolist()
        return self.normal_user, self.abnormal_user

    @staticmethod
    def k_shape(data, num_cluster):
        """
        k-shape clustering

        :param data: time series dataset
        :param num_cluster: number of clusters to fit
        :return: tuple ``(ks, y_pred)`` - the fitted ``KShape`` model and the
            cluster label predicted for each series in ``data``
        """
        # Pass the seed itself: ``np.random.seed(0)`` returns None, so the
        # former ``random_state=np.random.seed(0)`` handed None to KShape and
        # reseeded NumPy's process-wide generator as a side effect.
        ks = KShape(n_clusters=num_cluster, verbose=True, random_state=0)
        y_pred = ks.fit_predict(data)
        return ks, y_pred
|