from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import os
import numpy as np
import pandas as pd
# Load the texts and flatten the DataFrame into a 1-D array of documents
texts = pd.read_csv("product.csv")
print(os.path.getsize("product.csv"))  # sanity check: file size in bytes
texts = np.asarray(texts).ravel()
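# Assumption: product.csv holds one short text per cell (for example a single
# "description" column); ravel() flattens every cell into one flat sequence of
# documents. Adjust this step if the real file is shaped differently.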
# Vectorize the texts as TF-IDF vectors, dropping English stop words
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(texts)
# Vocabulary learned by the vectorizer (the axes of our multi-dimensional space)
words = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
print("words:", words)
n_clusters = 3
number_of_seeds_to_try = 10  # n_init: independent random centroid initializations
max_iter = 300
model = KMeans(n_clusters=n_clusters, max_iter=max_iter,
               n_init=number_of_seeds_to_try).fit(X)
# (n_jobs was removed from KMeans in scikit-learn 1.0 and is no longer passed)
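# Results vary between runs because centroid seeding is random; passing a
# fixed seed, e.g. KMeans(..., random_state=42), makes the clusters
# reproducible (42 is an arbitrary choice, not from the original).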
labels = model.labels_
# Indices of the highest-weighted words in each cluster centroid
ordered_words = model.cluster_centers_.argsort()[:, ::-1]
print("centers:", model.cluster_centers_)
print("labels:", labels)
print("inertia:", model.inertia_)
# Count how many texts fall into each cluster
texts_per_cluster = np.bincount(labels, minlength=n_clusters)
- print("Top words per cluster:")
- for i_cluster in range(n_clusters):
- print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
- for term in ordered_words[i_cluster, :10]:
- print("\t"+words[term])
- print("\n")
- print("Prediction")
- text_to_predict = "light"
- Y = vectorizer.transform([text_to_predict])
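# Caveat: transform() silently ignores words that are not in the fitted
# vocabulary, so a query made only of unseen terms becomes an all-zero vector
# and predict() just returns whichever centroid lies closest to the origin.
# "light" is only a meaningful query if it appears in the corpus.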
predicted_cluster = model.predict(Y)[0]
texts_per_cluster[predicted_cluster] += 1
print(text_to_predict)
print("Cluster:", predicted_cluster, "texts:", int(texts_per_cluster[predicted_cluster]))
for term in ordered_words[predicted_cluster, :10]:
    print("\t" + words[term])