test2_kmeans.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
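
# (Assumption) The layout of product.csv is not shown anywhere in this script.
# As a minimal sketch, the fallback below writes a stand-in file only when
# product.csv is missing, assuming one short text per row in a single column;
# the column name and example rows are hypothetical, echoing the terms used in
# the prediction calls further down.
import os

if not os.path.exists("product.csv"):
    pd.DataFrame({"text": [
        "hard rock and heavy metal music",
        "cheddar cheese and greek yogurt",
        "football and other sports highlights",
        "classic movies and film soundtracks",
    ]}).to_csv("product.csv", index=False)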
# Load the product texts and flatten them into a flat list of documents.
items = pd.read_csv("product.csv")
items = np.asarray(items)
documents = np.concatenate(items, axis=0)

# Turn each document into a TF-IDF vector, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Cluster the documents into two interest groups using k-means++ seeding.
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)
# For each cluster, print the 50 terms with the largest centroid weights.
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print("Interest group %d:" % i)
    for ind in order_centroids[i, :50]:
        print(' %s' % terms[ind], end='')
    print("\n")
# Assign unseen terms to one of the learned interest groups.
print("Prediction")
Y = vectorizer.transform(["hardrock", "movies", "music"])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["cheese", "sports", "football", "Yogurt"])
prediction = model.predict(Y)
print(prediction)
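
# --- Optional evaluation sketch (not part of the original script) ---
# KMeans itself does not report how good the grouping is, and this pipeline
# has no ground-truth labels. As a hedged, label-free check, the silhouette
# score below gives a rough sense of how well separated the two clusters are
# (values closer to 1 indicate tighter, better-separated clusters).
from sklearn.metrics import silhouette_score

score = silhouette_score(X, model.labels_)
print("Silhouette score: %.3f" % score)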