123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.cluster import KMeans
- import time
- import pandas as pd
- import gensim
- from gensim.models import Doc2Vec
- import multiprocessing as mp
- import numpy as np
- import psutil
- import os
- import distance
- import sklearn.cluster
- from sklearn.metrics import adjusted_rand_score
# Cluster the text found in product.csv with TF-IDF features and k-means,
# print the highest-weighted terms of every cluster, then show which cluster
# a few sample queries are assigned to.

# Flatten every cell of the table into one flat sequence of documents
# (row-major order, same order np.concatenate over the rows produced).
items = pd.read_csv("product.csv")
items = np.asarray(items)
# TfidfVectorizer only accepts strings: skip missing cells (NaN) and coerce
# anything else (e.g. numeric columns) to str instead of crashing in fit.
documents = [str(cell) for cell in items.ravel() if not pd.isna(cell)]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Number of clusters ("interest groups") to fit.
true_k = 2
# NOTE: n_init=1 with no random_state means cluster ids/terms can differ
# from run to run; kept as in the original.
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
# For each centroid, feature indices sorted by descending weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back to the old name on releases that predate it.
_get_terms = getattr(vectorizer, "get_feature_names_out", None)
if _get_terms is None:
    _get_terms = vectorizer.get_feature_names
terms = _get_terms()

for i in range(true_k):
    # The original relied on Python 2's `print x,` to keep the terms on one
    # line; build the line explicitly so it works on Python 3 as well.
    top_terms = [str(terms[ind]) for ind in order_centroids[i, :50]]
    print("Interest group %d: %s" % (i, " ".join(top_terms)))

print("\n")
print("Prediction")
# Each query list is vectorized with the fitted vocabulary and assigned to
# the nearest cluster; one array of cluster ids is printed per list.
for queries in (
    ["hardrock", "movies", "music"],
    ["cheese", "sports", "football", "Yogurt"],
):
    Y = vectorizer.transform(queries)
    prediction = model.predict(Y)
    print(prediction)
|