# final.py
  1. import numpy as np
  2. import glob
  3. from tslearn.clustering import KShape
  4. from sklearn.cluster import KMeans
  5. import tslearn.metrics as metrics
  6. from sklearn.metrics import calinski_harabasz_score
  7. from tslearn.preprocessing import TimeSeriesScalerMeanVariance
  8. from tslearn.clustering import TimeSeriesKMeans
  9. import matplotlib.pyplot as plt
  10. #from tslearn.generators import random_walks
  11. import tslearn.metrics as metrics
  12. import pandas as pd
  13. from sklearn.model_selection import train_test_split
  14. import warnings
  15. from tslearn.clustering import silhouette_score
  16. import os
  17. from sklearn import metrics
  18. import random
  19. seed = 0
  20. def filter():
  21. GoodNameList = np.loadtxt('/home/it/middle_data/good_name_list.txt',dtype=int)
  22. X=np.array(pd.read_csv('/home/it/middle_data/value.csv',header=None))
  23. print("X=",X,type(X),np.shape(X))
  24. threshold_1=19
  25. threshold_2=20
  26. t1 = np.empty([len(X),len(X)], dtype = int)
  27. t2 = np.empty([len(X),len(X)], dtype = int)
  28. i=0
  29. while i<len(X):
  30. j=i
  31. while j<len(X):
  32. a=X[i]
  33. b=X[j]
  34. m1_num=np.sum(a==b)
  35. if m1_num>=threshold_1:#close
  36. t2[i][j]=t2[j][i]=1
  37. else:
  38. t2[i][j]=t2[j][i]=0#far
  39. t1[i][j]=t1[j][i]=m1_num
  40. j=j+1
  41. i=i+1
  42. outlier = np.empty([len(X)], dtype = int)
  43. k=0
  44. while k<len(X):
  45. if np.sum(t2[k])>=threshold_2:
  46. outlier[k]=0#NOT outlier
  47. else:
  48. outlier[k]=1#outlier
  49. k=k+1
  50. need_be_filtered=np.argwhere(outlier>=1).copy()
  51. filtered_index=np.concatenate(need_be_filtered)
  52. np.savetxt('/home/it/middle_data/filtered_good_name_list.txt',np.delete(GoodNameList,filtered_index),fmt="%i")
  53. pattern_middle=np.delete(GoodNameList,filtered_index).copy()
  54. df_result = pd.DataFrame(np.delete(X,filtered_index,axis=0).copy())
  55. df_result.to_csv('/home/it/middle_data/filtered_valus.csv',header=None,index=False)
  56. print("group")
  57. X=pd.read_csv('/home/it/middle_data/filtered_valus.csv',header=None)
  58. x_original=X.copy()
  59. x_pattern=np.copy(X)
  60. seed = 2
  61. num_cluster = 13
  62. X = TimeSeriesScalerMeanVariance(mu= 0.0 ,std= 1.0 ).fit_transform(X)
  63. ks = KShape(n_clusters= num_cluster ,n_init= 10 ,max_iter=200,tol=1e-8,verbose= True ,random_state=seed)
  64. y_pred = ks.fit_predict(X)
  65. y_pattern=y_pred.copy()
  66. final_y_pred=y_pred.copy()
  67. f4 = open("/home/it/middle_data/filtered_good_name_list.txt","a+") # open in `w` mode to write
  68. i=0
  69. while i<len(filtered_index):
  70. f4.write(str(GoodNameList[filtered_index[i]])) # concatenate the contents
  71. final_y_pred=np.append(final_y_pred,num_cluster)
  72. f4.write("\n")
  73. i=i+1
  74. f4.close()
  75. np.savetxt('/home/it/middle_data/groupingResult.txt',final_y_pred,fmt="%i")
  76. #score function
  77. score_result=metrics.calinski_harabasz_score(x_original, y_pred)
  78. score_result_path="/home/it/middle_data/score_result_kshape.txt"
  79. f = open(score_result_path,"a+") # open in `w` mode to write
  80. f.write("\n")
  81. f.write("k="+str(num_cluster))
  82. f.write("\n")
  83. f.write("CA="+str(score_result)) # concatenate the contents
  84. f.close()
  85. pattern=np.zeros([num_cluster,len(X[0])], dtype = int)
  86. count= np.zeros([1,num_cluster],dtype = int)#用于计相加数,每个group的数量,看跟实际分组数量是否对得上
  87. i=0
  88. while i<len(y_pattern):
  89. #print("final_test",pattern[y_pattern[i]],x_pattern[i])
  90. pattern[y_pattern[i]]=pattern[y_pattern[i]]+x_pattern[i]
  91. count[0][y_pattern[i]]=count[0][y_pattern[i]]+1
  92. i=i+1
  93. final_result=np.zeros([num_cluster,len(X[0])], dtype = float)
  94. i=0
  95. while i<num_cluster:
  96. final_result[i]=np.true_divide(pattern[i],float(count[0][i]))
  97. i=i+1
  98. print("final_result=",final_result)
  99. i=0
  100. while i<len(final_result):
  101. j=0
  102. while j<len(final_result[0]):
  103. if final_result[i][j]>=0.3:
  104. final_result[i][j]=1
  105. else:
  106. final_result[i][j]=0
  107. j=j+1
  108. i=i+1
  109. #repeat the fixed scheduler
  110. middle_final_result=np.copy(final_result)
  111. k=0
  112. while k<13:
  113. middle_final_result=np.concatenate((middle_final_result,final_result),axis=1)
  114. k=k+1
  115. np.savetxt('/home/it/middle_data/pattern.txt',middle_final_result,fmt="%i")
  116. warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=931)
  117. filter()