test3_kmeans.py

import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import psutil
import os
import multiprocessing as mp

num_cores = mp.cpu_count()
print("The machine has", num_cores, "cores; memory usage information:",
      psutil.virtual_memory())
items = pd.read_csv("product.csv")
print(items.head())
print(items.info())
# Inspect a few duplicated item names, then keep a single row per item.
print(items[items['Items'].duplicated(keep=False)].sort_values('Items').head(8))
items = items.drop_duplicates('Items')
start_time = time.time()

punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', "%"]
stop_words = text.ENGLISH_STOP_WORDS  # commonly used words to ignore (and, or, is, etc.)
desc = items['Items'].values

vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(desc)
word_features = vectorizer.get_feature_names_out()  # get_feature_names() was removed in newer scikit-learn
print(len(word_features))
print(word_features[10000:10002])
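# Optional check, not in the original script: inspect the highest-weighted
# TF-IDF terms of the first item to sanity-check the vectorizer output.
first_row = X[0].toarray().ravel()
top_idx = first_row.argsort()[::-1][:5]
print([(word_features[i], round(float(first_row[i]), 3)) for i in top_idx])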
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r"[a-zA-Z']+")

def tokenize(text):
    # Lowercase the text, keep alphabetic tokens (and apostrophes), and stem each word.
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]
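# Optional sanity check, not in the original script; the sample string below is made up.
# Shows the lowercased, stemmed tokens the custom tokenizer feeds to the vectorizer.
print(tokenize("Organic Strawberries, 1 lb package"))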
vectorizer2 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize)
X2 = vectorizer2.fit_transform(desc)
word_features2 = vectorizer2.get_feature_names_out()
print(len(word_features2))
print(word_features2[:10])
vectorizer3 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, max_features=1000)
X3 = vectorizer3.fit_transform(desc)
words = vectorizer3.get_feature_names_out()
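# max_features=1000 keeps only the terms with the highest frequency across the
# corpus, which bounds the dimensionality that k-means has to deal with.
# Optional check, not in the original script: each item is now a vector of at most 1000 dimensions.
print(X3.shape)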
# Elbow method: fit k-means for k = 1..10 and record the inertia (WCSS) of each fit.
size = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X3)
    size.append(kmeans.inertia_)
plt.plot(range(1, 11), size)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('elbow.png')
plt.show()
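# The "elbow" is the value of k where WCSS stops dropping sharply; the choice of
# n_clusters=3 below is presumably read off elbow.png.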
print(words[250:300])
# n_init: number of times k-means is run with different centroid seeds (the best run
# by inertia is kept). The original call also passed n_jobs=2 (CPU cores to use), but
# that parameter was removed from KMeans in newer scikit-learn.
kmeans = KMeans(n_clusters=3, n_init=20)
kmeans.fit(X3)
# Look at the 3 clusters generated by k-means: for each cluster centre, take the
# indices of its 25 highest-weighted terms.
common_words = kmeans.cluster_centers_.argsort()[:, -1:-26:-1]
print(common_words)
for num, centroid in enumerate(common_words):
    print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))
print("--- %s seconds ---" % (time.time() - start_time))