import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
# from sklearn.metrics.pairwise import pairwise_distances  # used by the commented-out Jaccard experiment below
interest_group_centroids = []  # cluster centroids on which the interest groups are formed
interest_groups = []           # most similar items for each centroid in an interest group

data = pd.read_csv('lastfm.csv')  # read the user/artist listening matrix

# Create a new dataframe without the user ids.
data_items = data.drop(columns='user')  # drop the user column for item-item similarity calculation
print('Dimension of loaded data is:', np.ndim(data_items))

items_len = len(data_items.columns)  # number of items (artists) in the dataset
length = []  # stores the column indices of the centroids
print('\n\n######################################### CENTROIDS #####################################################\n\n')

# Pick five evenly spaced column indices to act as interest-group centroids.
# p is both the spacing between centroids and, later, the size of each group.
p = (items_len - 1) // 5
r = p
length.append(p)
for _ in range(4):
    r += p
    length.append(r)
print(length)
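# Worked example (hypothetical numbers, not from the actual file): with 286
# item columns, p = (286 - 1) // 5 = 57, so length = [57, 114, 171, 228, 285],
# i.e. five evenly spaced columns across the dataset.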
for index in length:
    centroid = data_items.columns.values[index]
    interest_group_centroids.append(centroid)
print('The centroids are =', interest_group_centroids, '\n\n')
# ############ SIMILARITY #################
# As a first step we normalise the user vectors to unit vectors.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))
data_items = data_items.divide(magnitude, axis='index')
# print(data_items.head(5))
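# Defensive guard (an addition, not in the original script): a user row that is
# all zeros has magnitude 0, and dividing by it yields NaN; reset such rows to 0.
data_items = data_items.fillna(0)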
def calculate_similarity(data_items):
    """Calculate the column-wise (item-item) cosine similarity of a sparse
    matrix and return the similarities as a labelled DataFrame."""
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
    return sim

# Build the item-item similarity matrix.
data_matrix = calculate_similarity(data_items)

# Let's get the top 11 similar artists for Beyonce.
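# (Assumption: lastfm.csv uses lower-case artist names as column labels, so the
# relevant column is 'beyonce'; adjust the key to match your file.)
print(data_matrix.loc['beyonce'].nlargest(11))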
print('############## INTEREST GROUPS ##################\n\n')
for num, centroid in enumerate(interest_group_centroids):
    interest_group = data_matrix.loc[centroid].nlargest(p).index.values
    print('Interest group', num, '=', interest_group, '\n')
    interest_groups.append(interest_group)
print('############### USERS ###################\n\n')
user = 19695  # the id of the user for whom we want to generate recommendations
user_index = data[data.user == user].index.tolist()[0]  # get the frame index

known_user_likes = data_items.loc[user_index]  # .loc replaces the removed DataFrame.ix
known_user_likes = known_user_likes[known_user_likes > 0].index.values
print('user', user_index, 'likes', known_user_likes, '\n')
print('############### USER ASSOCIATION ###################\n\n')
for num, group in enumerate(interest_groups):
    a_vals = Counter(group)
    b_vals = Counter(known_user_likes)

    # Convert both item lists to aligned word-count vectors.
    words = list(a_vals.keys() | b_vals.keys())
    a_vect = [a_vals.get(word, 0) for word in words]
    b_vect = [b_vals.get(word, 0) for word in words]

    # Cosine similarity between the two count vectors.
    len_a = sum(av * av for av in a_vect) ** 0.5
    len_b = sum(bv * bv for bv in b_vect) ** 0.5
    dot = sum(av * bv for av, bv in zip(a_vect, b_vect))
    cosine = dot / (len_a * len_b)

    if cosine > 0:
        print('User:', user_index, 'is associated to interest group', num, 'with similarity:', cosine)
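# A minimal sketch of turning the association into recommendations (an
# addition, not in the original script): take the interest group with the
# largest overlap with the user's likes, a stand-in for the cosine scores
# printed above, and suggest its artists the user has not listened to yet.
best_group = max(interest_groups, key=lambda g: len(set(g) & set(known_user_likes)))
recommendations = [artist for artist in best_group if artist not in known_user_likes]
print('Recommendations for user', user, ':', recommendations[:10])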
# --- Commented-out experiments kept from earlier iterations ---
#
# Jaccard-style similarity via Hamming distance (requires pairwise_distances,
# imported above, and expects the raw 0/1 matrix, i.e. before the unit-vector
# normalisation). The original draft shadowed sklearn's
# jaccard_similarity_score and called itself recursively; the
# pairwise_distances line below is the one that actually works.
#
# def jaccard_similarity(df):
#     """Calculate the column-wise Jaccard-style similarity for a matrix.
#     Return a new dataframe matrix with similarities."""
#     similarities = 1 - pairwise_distances(df.T, metric='hamming')
#     sim = pd.DataFrame(data=similarities, index=df.columns, columns=df.columns)
#     return sim
#
# data_matrix2 = jaccard_similarity(data.drop(columns='user'))
# print(data_matrix2)
# print(data_matrix.loc['aerosmith'].nlargest(6))
#
# k-means clustering of the similarity matrix (requires
# from sklearn.cluster import KMeans). We look at the clusters generated
# by k-means.
# kmeans = KMeans(n_clusters=2, n_init=20, n_jobs=2)
# kmeans.fit(data_matrix)
# common_words = kmeans.cluster_centers_[:1]
# print(common_words)
# for num, centroid in enumerate(common_words):
#     print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))