final.py

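# Pipeline: load the "good name" list and its value matrix, drop outlier series
# that match too few of the others, z-normalise the remaining series and cluster
# them with KShape (k = 13), then write out the cluster assignments, a
# Calinski-Harabasz score, and a binarised per-cluster pattern repeated along
# the time axis.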
import numpy as np
import pandas as pd
import warnings
from sklearn import metrics
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

seed = 0

def filter():
    GoodNameList = np.loadtxt('/home/it/middle_data/good_name_list.txt', dtype=int)
    X = np.array(pd.read_csv('/home/it/middle_data/value.csv', header=None))
    threshold_1 = 19  # minimum number of matching slots for two series to count as "close"
    threshold_2 = 20  # minimum number of close neighbours a series needs to be kept
    # pairwise comparison: t1 holds the match counts, t2 marks close (1) / far (0) pairs
    t1 = np.empty([len(X), len(X)], dtype=int)
    t2 = np.empty([len(X), len(X)], dtype=int)
    i = 0
    while i < len(X):
        j = i
        while j < len(X):
            a = X[i]
            b = X[j]
            m1_num = np.sum(a == b)
            if m1_num >= threshold_1:  # close
                t2[i][j] = t2[j][i] = 1
            else:
                t2[i][j] = t2[j][i] = 0  # far
            t1[i][j] = t1[j][i] = m1_num
            j = j + 1
        i = i + 1
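    # The two loops above can also be written in vectorized form; a commented-out
    # sketch (assumes the boolean (n, n, L) comparison tensor fits in memory):
    # t1 = (X[:, None, :] == X[None, :, :]).sum(axis=2)
    # t2 = (t1 >= threshold_1).astype(int)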
    # a series is an outlier if it has fewer than threshold_2 close neighbours
    outlier = np.empty([len(X)], dtype=int)
    k = 0
    while k < len(X):
        if np.sum(t2[k]) >= threshold_2:
            outlier[k] = 0  # not an outlier
        else:
            outlier[k] = 1  # outlier
        k = k + 1
    need_be_filtered = np.argwhere(outlier >= 1).copy()
    filtered_index = np.concatenate(need_be_filtered)
    # drop the outliers from both the name list and the value matrix
    np.savetxt('/home/it/middle_data/filtered_good_name_list.txt', np.delete(GoodNameList, filtered_index), fmt="%i")
    pattern_middle = np.delete(GoodNameList, filtered_index).copy()
    df_result = pd.DataFrame(np.delete(X, filtered_index, axis=0).copy())
    df_result.to_csv('/home/it/middle_data/filtered_valus.csv', header=False, index=False)
    # reload the filtered values; keep raw copies for scoring and pattern building
    X = pd.read_csv('/home/it/middle_data/filtered_valus.csv', header=None)
    x_original = X.copy()
    x_pattern = np.copy(X)
    seed = 2
    num_cluster = 13
    # z-normalise each series, then cluster with KShape
    X = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(X)
    ks = KShape(n_clusters=num_cluster, n_init=10, max_iter=200, tol=1e-8, verbose=True, random_state=seed)
    y_pred = ks.fit_predict(X)
    y_pattern = y_pred.copy()
    final_y_pred = y_pred.copy()
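    # Optional cluster-quality check, left commented out; the DTW metric here is
    # an assumption (KShape itself clusters by normalised cross-correlation):
    # from tslearn.clustering import silhouette_score
    # print(silhouette_score(X, y_pred, metric="dtw"))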
    # append the filtered-out names back to the list file and assign each of them
    # to an extra group labelled num_cluster
    f4 = open("/home/it/middle_data/filtered_good_name_list.txt", "a+")
    i = 0
    while i < len(filtered_index):
        f4.write(str(GoodNameList[filtered_index[i]]))
        final_y_pred = np.append(final_y_pred, num_cluster)
        f4.write("\n")
        i = i + 1
    f4.close()
    np.savetxt('/home/it/middle_data/groupingResult.txt', final_y_pred, fmt="%i")
    # score the clustering on the unscaled values
    score_result = metrics.calinski_harabasz_score(x_original, y_pred)
    score_result_path = "/home/it/middle_data/score_result_kshape.txt"
    f = open(score_result_path, "a+")  # 'a+' appends, so scores from different runs accumulate
    f.write("\n")
    f.write("k=" + str(num_cluster))
    f.write("\n")
    f.write("CA=" + str(score_result))
    f.close()
    # per-cluster average of the (unscaled) series
    pattern = np.zeros([num_cluster, len(X[0])], dtype=int)
    count = np.zeros([1, num_cluster], dtype=int)  # size of each group, to cross-check against the actual group sizes
    i = 0
    while i < len(y_pattern):
        pattern[y_pattern[i]] = pattern[y_pattern[i]] + x_pattern[i]
        count[0][y_pattern[i]] = count[0][y_pattern[i]] + 1
        i = i + 1
    final_result = np.zeros([num_cluster, len(X[0])], dtype=float)
    i = 0
    while i < num_cluster:
        final_result[i] = np.true_divide(pattern[i], float(count[0][i]))
        i = i + 1
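    # Equivalent per-cluster mean written with boolean masks (sketch; matches the
    # loops above exactly when the series values are integers, as assumed here):
    # for c in range(num_cluster):
    #     final_result[c] = x_pattern[y_pattern == c].mean(axis=0)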
    # binarise the per-cluster average: a slot becomes 1 if its mean is >= 0.3
    i = 0
    while i < len(final_result):
        j = 0
        while j < len(final_result[0]):
            if final_result[i][j] >= 0.3:
                final_result[i][j] = 1
            else:
                final_result[i][j] = 0
            j = j + 1
        i = i + 1
    # repeat the fixed schedule: tile the binary pattern seven times along the
    # time axis before saving it
    middle_final_result = np.copy(final_result)
    c = middle_final_result
    for _ in range(6):
        c = np.concatenate((c, final_result), axis=1)
    np.savetxt('/home/it/middle_data/pattern.txt', c, fmt="%i")

warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=931)
filter()
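# Downstream use (sketch): both text outputs are integer arrays written with
# np.savetxt, so they can be read back with, e.g.:
#   groups  = np.loadtxt('/home/it/middle_data/groupingResult.txt', dtype=int)
#   pattern = np.loadtxt('/home/it/middle_data/pattern.txt', dtype=int)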