123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- import numpy as np
- import glob
- from tslearn.clustering import KShape
- from sklearn.cluster import KMeans
- import tslearn.metrics as metrics
- from sklearn.metrics import calinski_harabasz_score
- from tslearn.preprocessing import TimeSeriesScalerMeanVariance
- from tslearn.clustering import TimeSeriesKMeans
- import matplotlib.pyplot as plt
- #from tslearn.generators import random_walks
- import tslearn.metrics as metrics
- import pandas as pd
- from sklearn.model_selection import train_test_split
- import warnings
- from tslearn.clustering import silhouette_score
- import os
- from sklearn import metrics
- import random
def filter():
    """Drop outlier series from value.csv, cluster the survivors with KShape,
    and write the grouping result, quality score, and binarized cluster
    pattern files under /home/it/middle_data/.

    Reads:   good_name_list.txt, value.csv
    Writes:  filtered_good_name_list.txt, filtered_valus.csv,
             groupingResult.txt, score_result_kshape.txt, pattern.txt
    NOTE: the name shadows the builtin `filter`; kept for existing callers.
    """
    GoodNameList = np.loadtxt('/home/it/middle_data/good_name_list.txt', dtype=int)
    X = np.array(pd.read_csv('/home/it/middle_data/value.csv', header=None))
    threshold_1 = 19  # min equal positions for two rows to count as "close"
    threshold_2 = 20  # min close neighbours for a row to be kept

    # m1[i, j] = number of positions where rows i and j agree.
    # Vectorized replacement of the original O(n^2) Python double loop
    # (which also filled an identical but never-used matrix t1).
    m1 = (X[:, None, :] == X[None, :, :]).sum(axis=2)
    t2 = (m1 >= threshold_1).astype(int)  # 1 = close, 0 = far (diagonal is 1)

    # A row is an outlier when it is close to fewer than threshold_2 rows;
    # the count includes the row itself via the diagonal, as in the original.
    outlier = (t2.sum(axis=1) < threshold_2).astype(int)

    # np.flatnonzero handles the "no outliers" case, where the original
    # np.concatenate(np.argwhere(...)) raised ValueError on an empty result.
    filtered_index = np.flatnonzero(outlier)

    np.savetxt('/home/it/middle_data/filtered_good_name_list.txt',
               np.delete(GoodNameList, filtered_index), fmt="%i")
    df_result = pd.DataFrame(np.delete(X, filtered_index, axis=0).copy())
    df_result.to_csv('/home/it/middle_data/filtered_valus.csv', header=None, index=False)

    # Re-read the filtered values, keep unscaled copies for scoring/patterns.
    X = pd.read_csv('/home/it/middle_data/filtered_valus.csv', header=None)
    x_original = X.copy()
    x_pattern = np.copy(X)
    seed = 2
    num_cluster = 13
    X = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(X)
    ks = KShape(n_clusters=num_cluster, n_init=10, max_iter=200, tol=1e-8,
                verbose=True, random_state=seed)
    y_pred = ks.fit_predict(X)
    y_pattern = y_pred.copy()
    final_y_pred = y_pred.copy()

    # Append the filtered-out series to the name list and give each of them
    # the extra label `num_cluster` in the final grouping result.
    with open("/home/it/middle_data/filtered_good_name_list.txt", "a+") as f4:
        for idx in filtered_index:
            f4.write(str(GoodNameList[idx]))
            f4.write("\n")
            final_y_pred = np.append(final_y_pred, num_cluster)
    np.savetxt('/home/it/middle_data/groupingResult.txt', final_y_pred, fmt="%i")

    # Clustering quality score on the unscaled data (metrics is sklearn.metrics
    # at call time — the last import wins at module level).
    score_result = metrics.calinski_harabasz_score(x_original, y_pred)
    with open("/home/it/middle_data/score_result_kshape.txt", "a+") as f:
        f.write("\n")
        f.write("k=" + str(num_cluster))
        f.write("\n")
        f.write("CA=" + str(score_result))

    # Per-cluster element-wise sums and member counts.
    pattern = np.zeros([num_cluster, len(X[0])], dtype=int)
    count = np.zeros([1, num_cluster], dtype=int)
    for label, row in zip(y_pattern, x_pattern):
        pattern[label] = pattern[label] + row
        count[0][label] = count[0][label] + 1
    print("count=", count, "pattern=", pattern)

    # Per-cluster mean profile, then binarize at 0.3.
    final_result = np.zeros([num_cluster, len(X[0])], dtype=float)
    for i in range(num_cluster):
        final_result[i] = np.true_divide(pattern[i], float(count[0][i]))
    print("final_result=", final_result)
    final_result = np.where(final_result >= 0.3, 1.0, 0.0)

    # "repeat fix scheduler": the original repeated this concatenation six
    # times into the same variable; one concatenation is equivalent.
    middle_final_result = np.copy(final_result)
    c = np.concatenate((middle_final_result, final_result), axis=1)
    np.savetxt('/home/it/middle_data/pattern.txt', c, fmt="%i")
def group_dtw():
    """Cluster the raw series in value.csv with DTW k-means (k=14), save the
    grouping to groupingResult.txt, and append the Calinski-Harabasz score
    of the result to score_result_dtw.txt.
    """
    print("group_dtw")
    X = pd.read_csv('/home/it/middle_data/value.csv', header=None)
    x_original = X.copy()  # unscaled copy kept for scoring
    seed = 9
    num_cluster = 14
    X = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(X)
    km = TimeSeriesKMeans(n_clusters=num_cluster, n_init=1, metric="dtw",
                          verbose=True, max_iter_barycenter=10, random_state=seed)
    y_pred = km.fit_predict(X)
    np.savetxt('/home/it/middle_data/groupingResult.txt', y_pred, fmt="%i")
    score_result = metrics.calinski_harabasz_score(x_original, y_pred)
    # Context manager guarantees the log file is closed even if a write fails
    # (the original left the handle open on error).
    with open("/home/it/middle_data/score_result_dtw.txt", "a+") as f4:
        f4.write("\n")
        f4.write(str(score_result))
# Silence the FutureWarning sklearn raises from this exact location so the
# verbose clustering progress output stays readable.
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn", lineno=931)
filter()
#group_dtw()
|