import time
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
import psutil
import multiprocessing as mp

# Report the available hardware before starting.
num_cores = mp.cpu_count()
print("The kernel has", num_cores, "cores and you can find information regarding memory usage in",
      psutil.virtual_memory())
# Load the product list and take a first look at it.
items = pd.read_csv("product.csv")
print(items.head())
print(items.info())

# Some item names appear more than once; inspect a few, then drop the repeats.
print(items[items['Items'].duplicated(keep=False)].sort_values('Items').head(8))
items = items.drop_duplicates('Items')
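
# Optional sanity check (not in the original script): confirm the dedupe
# left no repeated item names.
print(items['Items'].duplicated().sum(), "duplicate item names remain")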
start_time = time.time()

# punc is declared for reference but never used below: TfidfVectorizer's
# default token pattern already ignores punctuation.
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', "%"]
stop_words = text.ENGLISH_STOP_WORDS  # commonly used words to ignore (and, or, is, etc.)

# Baseline: TF-IDF over the raw item names, no stemming.
desc = items['Items'].values
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
X = vectorizer.fit_transform(desc)
word_features = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
print(len(word_features))
print(word_features[10000:10002])
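
# Optional inspection (not in the original script): the shape of the sparse
# TF-IDF matrix and the five highest-weighted terms of the first item.
print(X.shape)  # (number of items, vocabulary size)
row = X[0].toarray().ravel()
top = row.argsort()[::-1][:5]
print([(word_features[i], round(row[i], 3)) for i in top])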
# Second pass: stem every token so that e.g. "apples"/"apple" collapse into
# a single feature.
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r"[a-zA-Z']+")

def tokenize(doc):
    # lower-case, split on runs of letters/apostrophes, then Snowball-stem
    return [stemmer.stem(word) for word in tokenizer.tokenize(doc.lower())]

# Note: scikit-learn warns here because the stop words are not stemmed and so
# may be inconsistent with the custom tokenizer; harmless for this data.
vectorizer2 = TfidfVectorizer(stop_words=list(stop_words), tokenizer=tokenize)
X2 = vectorizer2.fit_transform(desc)
word_features2 = vectorizer2.get_feature_names_out()
print(len(word_features2))
print(word_features2[:10])
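
# Quick check of the tokenizer (the sample string below is made up for
# illustration): letters are lower-cased and stemmed, digits are dropped.
print(tokenize("Organic Fresh Strawberries 250g"))
# expected output along the lines of: ['organ', 'fresh', 'strawberri', 'g']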
# Third pass: cap the vocabulary at the 1000 highest-scoring terms to keep
# k-means fast and the clusters interpretable.
vectorizer3 = TfidfVectorizer(stop_words=list(stop_words), tokenizer=tokenize, max_features=1000)
X3 = vectorizer3.fit_transform(desc)
words = vectorizer3.get_feature_names_out()

# Elbow method: run k-means for k = 1..10 and record the within-cluster sum
# of squares (WCSS, exposed as kmeans.inertia_); the "elbow" in the curve
# suggests a good number of clusters.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X3)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.savefig('elbow.png')
plt.show()

# A sample of the capped vocabulary.
print(words[250:300])
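
# Alternative to eyeballing the elbow (not in the original script): the mean
# silhouette score, which is larger when clusters are dense and well separated.
from sklearn.metrics import silhouette_score
for k in (2, 3, 4, 5):
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X3)
    print(k, round(silhouette_score(X3, labels), 4))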
# n_init: number of random centroid initialisations; the best run is kept.
# (The n_jobs parameter was removed from KMeans in scikit-learn 1.0; it now
# parallelises internally via OpenMP.)
kmeans = KMeans(n_clusters=3, n_init=20)
kmeans.fit(X3)
# Inspect the 3 clusters: the 25 highest-weighted terms of each centroid.
common_words = kmeans.cluster_centers_.argsort()[:, -1:-26:-1]
print(common_words)
for num, centroid in enumerate(common_words):
    print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))
print("--- %s seconds ---" % (time.time() - start_time))