kshape_filter.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. import time
  2. import numpy as np
  3. import pandas as pd
  4. from tslearn.clustering import KShape
  5. from clustering.util import Constants
  # Convert incoming JSON records to a pandas DataFrame
  def json2df(json_list):
      """
      Build a DataFrame from the incoming JSON data so it can be clustered.

      Input:
      json_list: the data to convert; handed unchanged to ``pd.DataFrame``.
          Presumably a list of JSON-like dicts, one per record -- confirm
          against the caller.
      """
      frame = pd.DataFrame(json_list)
      return frame
  14. class KShapeFilter:
  15. def __init__(self, iter_hour: str):
  16. """
  17. Initialize the KShape algorithm, pass in json to process data, need to re-initialize each round
  18. :param iter_hour: string, %Y-%m-%d %H
  19. """
  20. self.start_time = time.time()
  21. self.hour = iter_hour
  22. def load_data(self, json_list):
  23. self.train_data = pd.DataFrame(json_list)
  24. self.train_data.loc[:, "hour"] = self.hour
  25. def feature_extract(self):
  26. self.train_data.loc[:, "time"] = pd.to_datetime(
  27. self.train_data.loc[:, "timestamp"], unit="s"
  28. )
  29. # store user information
  30. self.user_list = self.train_data.loc[:, "user_id"].unique()
  31. df = self.train_data.reset_index(drop=True)
  32. df.loc[:, "event"] = 1
  33. start_time = df.loc[0, "hour"] + ":00:00"
  34. end_time = df.loc[0, "hour"] + ":59:59"
  35. feature_list = []
  36. for i, user in enumerate(self.user_list):
  37. time_df = pd.DataFrame(
  38. {"time": pd.date_range(start=start_time, end=end_time, freq="s")},
  39. index=range(0, 3600),
  40. )
  41. time_df = pd.merge(
  42. time_df,
  43. df.loc[df.loc[:, "user_id"] == user, ["time", "event"]],
  44. on="time",
  45. how="left",
  46. )
  47. time_df = time_df.fillna(0)
  48. time_array = np.array(time_df.loc[:, "event"])
  49. feature_list.append(time_array)
  50. self.train_X = np.array(feature_list).reshape([-1, 3600, 1])
  def train_predict(self, thresholds=0.05, num_cluster=Constants.CLUSTER_NUM):
      """
      Cluster the users and split them into normal and abnormal groups.

      Runs k-shape on ``self.train_X``, records each user's cluster label in
      ``self.res`` and prints the size of every cluster. A cluster is
      treated as abnormal when its share of all users is at most
      ``thresholds``.

      :param thresholds: largest fraction of users a cluster may hold and
          still be flagged as abnormal
      :param num_cluster: number of clusters passed to k_shape
      :return: tuple ``(normal_user, abnormal_user)`` of user-id lists; both
          are also stored on the instance under those names
      """
      _, labels = self.k_shape(self.train_X, num_cluster)

      result = pd.DataFrame({"user_id": self.user_list})
      result.loc[:, "predict_label"] = labels
      self.res = result

      cluster_count = result.loc[:, "predict_label"].value_counts()
      print(cluster_count)

      # Fraction of all users that falls into each cluster.
      cluster_share = cluster_count / len(self.user_list)
      abnormal_label_list = cluster_share[
          cluster_share <= thresholds
      ].index.tolist()

      in_abnormal_cluster = result.loc[:, "predict_label"].isin(
          abnormal_label_list
      )
      self.abnormal_user = result.loc[in_abnormal_cluster, "user_id"].tolist()
      self.normal_user = result.loc[~in_abnormal_cluster, "user_id"].tolist()
      return self.normal_user, self.abnormal_user
  @staticmethod
  def k_shape(data, num_cluster):
      """
      k-shape clustering

      :param data: time series dataset passed to ``KShape.fit_predict``
      :param num_cluster: number of clusters to fit
      :return: tuple ``(ks, y_pred)`` -- the fitted ``KShape`` model and the
          cluster labels it predicted for ``data``
      """
      # Pass the seed itself: np.random.seed(0) returns None, so using it as
      # random_state would hand the estimator None and reseed NumPy's
      # global RNG as a side effect.
      ks = KShape(n_clusters=num_cluster, verbose=True, random_state=0)
      y_pred = ks.fit_predict(data)
      return ks, y_pred