# Test4_Doc2vec.py — cluster product descriptions with TF-IDF + KMeans,
# then predict the cluster of a new text.
  1. from sklearn.feature_extraction.text import TfidfVectorizer
  2. from sklearn.cluster import KMeans
  3. from sklearn.metrics import adjusted_rand_score
  4. import numpy
  5. import pandas as pd
  6. import os
  7. import numpy as np
  8. texts = pd.read_csv("product.csv")
  9. print(os.path.getsize('product.csv'))
  10. texts = np.asarray(texts)
  11. texts = np.concatenate(texts, axis=0)
  12. # vectorization of the texts
  13. vectorizer = TfidfVectorizer(stop_words="english")
  14. X = vectorizer.fit_transform(texts)
  15. # used words (axis in our multi-dimensional space)
  16. words = vectorizer.get_feature_names()
  17. print("words", words)
  18. n_clusters=3
  19. number_of_seeds_to_try=10
  20. max_iter = 300
  21. number_of_process=2 # seads are distributed
  22. model = KMeans(n_clusters=n_clusters, max_iter=max_iter, n_init=number_of_seeds_to_try, n_jobs=number_of_process).fit(X)
  23. labels = model.labels_
  24. # indices of preferible words in each cluster
  25. ordered_words = model.cluster_centers_.argsort()[:, ::-1]
  26. print("centers:", model.cluster_centers_)
  27. print("labels", labels)
  28. print("intertia:", model.inertia_)
  29. texts_per_cluster = numpy.zeros(n_clusters)
  30. for i_cluster in range(n_clusters):
  31. for label in labels:
  32. if label==i_cluster:
  33. texts_per_cluster[i_cluster] +=1
  34. print("Top words per cluster:")
  35. for i_cluster in range(n_clusters):
  36. print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
  37. for term in ordered_words[i_cluster, :10]:
  38. print("\t"+words[term])
  39. print("\n")
  40. print("Prediction")
  41. text_to_predict = "light"
  42. Y = vectorizer.transform([text_to_predict])
  43. predicted_cluster = model.predict(Y)[0]
  44. texts_per_cluster[predicted_cluster]+=1
  45. print(text_to_predict)
  46. print("Cluster:", predicted_cluster, "texts:", int(texts_per_cluster[predicted_cluster])),
  47. for term in ordered_words[predicted_cluster, :10]:
  48. print("\t"+words[term])