"""Cluster the texts in ``product.csv`` with TF-IDF features and KMeans.

The script reads every cell of the CSV as one text, vectorizes the texts
with TF-IDF, fits a KMeans model, prints the highest-weighted words of each
cluster, and finally assigns one sample text to a cluster.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import numpy
import pandas as pd
import os
import numpy as np


def _print_cluster_summary(cluster_index, text_counts, ranked_terms, vocabulary, top_n=10):
    """Print a cluster header followed by its ``top_n`` highest-weighted words.

    Args:
        cluster_index: Index of the cluster to describe.
        text_counts: Per-cluster number of texts, indexed by cluster.
        ranked_terms: 2-D array of vocabulary indices per cluster, sorted by
            descending weight in the cluster center.
        vocabulary: Sequence mapping a vocabulary index to its word.
        top_n: How many words to print for the cluster.
    """
    print("Cluster:", cluster_index, "texts:", int(text_counts[cluster_index]))
    for term in ranked_terms[cluster_index, :top_n]:
        print("\t" + vocabulary[term])


texts = pd.read_csv("product.csv")
print(os.path.getsize('product.csv'))

# Flatten the table row by row so that every cell becomes one text.
texts = np.asarray(texts).ravel()

# vectorization of the texts
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)

# used words (axes of our multi-dimensional space)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, converted to a list to keep the printed form unchanged.
words = vectorizer.get_feature_names_out().tolist()
print("words", words)

n_clusters = 3
number_of_seeds_to_try = 10
max_iter = 300
# Kept for backward compatibility only: KMeans no longer accepts ``n_jobs``
# (removed in scikit-learn 1.0), so this value is not passed to the model.
number_of_process = 2

model = KMeans(
    n_clusters=n_clusters,
    max_iter=max_iter,
    n_init=number_of_seeds_to_try,
).fit(X)

labels = model.labels_
# indices of the preferable words in each cluster (highest center weight first)
ordered_words = model.cluster_centers_.argsort()[:, ::-1]

print("centers:", model.cluster_centers_)
print("labels", labels)
print("intertia:", model.inertia_)

# Number of texts assigned to each cluster; minlength keeps a slot for
# clusters that received no text.
texts_per_cluster = np.bincount(labels, minlength=n_clusters)

print("Top words per cluster:")
for i_cluster in range(n_clusters):
    _print_cluster_summary(i_cluster, texts_per_cluster, ordered_words, words)

print("\n")
print("Prediction")

text_to_predict = "light"
Y = vectorizer.transform([text_to_predict])
predicted_cluster = model.predict(Y)[0]
# Count the newly classified text as a member of its cluster.
texts_per_cluster[predicted_cluster] += 1

print(text_to_predict)
_print_cluster_summary(predicted_cluster, texts_per_cluster, ordered_words, words)