import os
import time
import multiprocessing as mp

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import psutil
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

num_cores = mp.cpu_count()
print("The kernel has", num_cores, "cores; memory usage details:", psutil.virtual_memory())

items = pd.read_csv("product.csv")
print(items.head())
print(items.info())

# Inspect duplicated item descriptions, then drop the duplicates.
print(items[items['Items'].duplicated(keep=False)].sort_values('Items').head(8))
items = items.drop_duplicates('Items')

start_time = time.time()

punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', "%"]
stop_words = text.ENGLISH_STOP_WORDS  # commonly used words to ignore (and, or, is, etc.)

desc = items['Items'].values

# First pass: TF-IDF on the raw descriptions.
# stop_words is converted to a list so newer scikit-learn versions accept it.
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
X = vectorizer.fit_transform(desc)
word_features = vectorizer.get_feature_names_out()
print(len(word_features))
print(word_features[10000:10002])

# Second pass: lowercase, tokenize and stem the descriptions before vectorizing.
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

vectorizer2 = TfidfVectorizer(stop_words=list(stop_words), tokenizer=tokenize)
X2 = vectorizer2.fit_transform(desc)
word_features2 = vectorizer2.get_feature_names_out()
print(len(word_features2))
print(word_features2[:10])

# Third pass: keep only the 1000 highest-scoring features.
vectorizer3 = TfidfVectorizer(stop_words=list(stop_words), tokenizer=tokenize, max_features=1000)
X3 = vectorizer3.fit_transform(desc)
words = vectorizer3.get_feature_names_out()

# Elbow method: record the within-cluster sum of squares (WCSS) for k = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X3)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('elbow.png')
plt.show()

print(words[250:300])

# n_init is the number of times k-means is run with different centroid seeds;
# the run with the lowest inertia is kept. (The n_jobs argument was removed from
# KMeans in newer scikit-learn releases, so it is not passed here.)
kmeans = KMeans(n_clusters=3, n_init=20)
kmeans.fit(X3)

# Look at the 3 clusters generated by k-means: the 25 highest-weighted terms per centroid.
common_words = kmeans.cluster_centers_.argsort()[:, -1:-26:-1]
print(common_words)
for num, centroid in enumerate(common_words):
    print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))

print("--- %s seconds ---" % (time.time() - start_time))
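# --- Optional follow-up (not part of the original script): a minimal sketch showing
# how the fitted vectorizer3/kmeans pair could be reused. The 'Items' column comes
# from the script above; the example strings below are hypothetical, for illustration only.

# Attach a cluster label to every product description for later inspection.
# X3 rows are in the same order as the deduplicated items DataFrame.
items['cluster'] = kmeans.predict(X3)
print(items.groupby('cluster')['Items'].count())

# Assign clusters to unseen descriptions by transforming them with the same
# 1000-feature TF-IDF vocabulary and calling predict() on the trained model.
new_desc = ["stainless steel water bottle", "cotton t-shirt large"]  # hypothetical examples
new_X = vectorizer3.transform(new_desc)
print(kmeans.predict(new_X))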