import json
import csv
import random
import time
from collections import Counter
# from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import sparse
from scipy.interpolate import make_interp_spline, BSpline
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

"""
######################DATASET INFORMATION##########################################
The data was collected from the music streaming service Deezer (November 2017).
These datasets represent friendship networks of users from 3 European countries.
Nodes represent the users and edges are the mutual friendships. We reindexed the
nodes in order to achieve a certain level of anonymity. The csv files contain the
edges -- nodes are indexed from 0. The json files contain the genre preferences of
users -- each key is a user id, and the genres loved are given as lists. Genre
notations are consistent across users. In each dataset users could like 84 distinct
genres. Liked genre lists were compiled based on the liked song lists. The countries
included are Romania, Croatia and Hungary. For each dataset we list the number of
nodes and edges.
"""

start = time.time()

with open('RO_genres.json') as data_file:
    data = json.load(data_file)
# print(data.keys())

users = []            # Users in the network who use the service
items = []            # Items liked by users in the network
recommendations = []  # Recommendations generated for the users after mining frequent itemsets

for key in data.keys():    # Retrieving the ID of each user
    users.append(key)

for val in data.values():  # Retrieving the items liked by each user in the network
    items.append(val)

# print(users)
# Users in the network, for example:
# ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', ..., '41772']

# print(items)
# Items liked by all the users in the network, for example:
# ['Dance', 'Soul & Funk', 'Pop', 'Musicals', 'Contemporary R&B', 'Indie Pop', 'Alternative'], ...

res = items
my_df = pd.DataFrame(res)
my_df.to_csv('out.csv', index=False, header=False)
# print(my_df.head())

# Transposing the items and users into a binary matrix
te = TransactionEncoder()
te_ary = te.fit(items).transform(items)
df = pd.DataFrame(te_ary, columns=te.columns_)
# print(df.head())
# prints the binary matrix elements, for example:
#    Acoustic Blues  African Music  ...  Vocal jazz  West Coast
# 0           False          False  ...       False       False
# 1           False          False  ...       False       False
# 2           False          False  ...       False       False
# 3           False          False  ...       False       False
# 4           False          False  ...       False       False
# print(te.columns_)

# Resulting binary matrix to csv file
res = df
my_df = pd.DataFrame(res)
my_df.to_csv('result.csv', index=True, header=True)

data = pd.read_csv('result.csv')
data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
# print(data.head())
# prints the binary matrix elements in result.csv, for example:
#    user  Acoustic Blues  ...  Vocal jazz  West Coast
# 0     0           False  ...       False       False
# 1     1           False  ...       False       False
# 2     2           False  ...       False       False
# 3     3           False  ...       False       False
# 4     4           False  ...       False       False
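
# A minimal sketch (not called in the pipeline) of what TransactionEncoder does
# above: it one-hot encodes per-user genre lists into the boolean user-item
# matrix. The toy genre lists below are illustrative, not from the dataset.
def _sketch_transaction_encoding():
    sample = [['Pop', 'Dance'], ['Pop', 'Jazz']]
    enc = TransactionEncoder()
    onehot = enc.fit(sample).transform(sample)  # boolean numpy array, users x genres
    # Columns are the sorted distinct genres seen across all lists.
    return pd.DataFrame(onehot, columns=enc.columns_)
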
data_items = data.drop(columns='user')  # drop the id column; keep only the item columns
print('Dimension of loaded data is:', np.ndim(data_items))

interest_group_centroids = []  # cluster centroids on which the interest groups are formed
interest_groups = []           # most similar items for each centroid in the interest group
items_len = len(data_items.columns)  # number of items in the dataset
length = []                    # stores the indices of the centroids
print(items_len)

print('\n\n#########################################CENTROIDS#####################################################\n\n')
p = (items_len - 1) // 5
r = p
length.append(p)
for index in range(0, 4):
    items_len = int(round(r + p))
    r = items_len
    length.append(items_len)
# print(length)
# Indices of the centroid elements, for example:
# [16, 32, 48, 64, 80]

# Calculating the centroids based on the length of the items in the DATASET: result.csv
for index in length:  # for each centroid index
    centroids = data_items.columns.values[index]
    interest_group_centroids.append(centroids)
# print('The Centroids are = ', interest_group_centroids, '\n\n')
# For example: The Centroids are = ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']

print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
start_time = time.time()

# As a first step we normalize the user vectors to unit vectors.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))
data_items = data_items.divide(magnitude, axis='index')
# print(data_items.head(5))


def calculate_similarity(data_items):
    """Compute the item-item cosine similarity matrix from the user-item matrix."""
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    # print(similarities)
    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
    return sim


# Build the similarity matrix
data_matrix = calculate_similarity(data_items)
# print(data_matrix.head())

end_time = time.time()
print("the similarity computation time is--- %s seconds ---" % (end_time - start_time))
# prints the item-item similarity matrix for all items in the dataset, for example:
#                      Acoustic Blues  ...  West Coast
# Acoustic Blues             1.000000  ...    0.000000
# African Music              0.044191  ...    0.005636
# Alternative                0.008042  ...    0.028171
# Alternative Country        0.037340  ...    0.011230
# Asian Music                0.000000  ...    0.004623
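
# A minimal sketch (not called in the pipeline) of the item-item similarity step:
# cosine_similarity compares item (column) vectors after transposing the
# user-item matrix. The 3-user/2-item toy matrix is illustrative only.
def _sketch_item_similarity():
    toy = np.array([[1.0, 0.0],
                    [1.0, 1.0],
                    [0.0, 1.0]])  # rows = users, columns = items
    return cosine_similarity(toy.T)  # 2x2 item-item matrix; diagonal is 1.0
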
print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
for idx, centroid in enumerate(interest_group_centroids):
    Interest_group = data_matrix.loc[centroid].nlargest(p).index.values
    print('Interest group', idx, ' = ', Interest_group, '\n')
    interest_groups.append(Interest_group)

# Jaccard similarity (overlap / union) between two of the interest groups
overlap = set(interest_groups[1]).intersection(interest_groups[2])
union = set(interest_groups[1]).union(interest_groups[2])
sim_clusters = len(overlap) / len(union)
print(sim_clusters)

print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
start_time = time.time()
d = apriori(df, min_support=0.8, use_colnames=True, max_len=2)
print(d['itemsets'])
print("--- %s seconds ---" % (time.time() - start_time))

print('#############################################USERS & THEIR LIKES###########################################\n\n')
user = 2222  # The id of the user for whom we want to generate recommendations
user_index = data[data.user == user].index.tolist()[0]  # Get the frame index
# print('user index is: ', user_index)
known_user_likes = data_items.loc[user_index]
known_user_likes = known_user_likes[known_user_likes > 0].index.values
print('user', user_index, 'likes', known_user_likes, '\n')

groups = random.sample(data.user.tolist(), 20)
print(groups)
user2 = groups  # The ids of the users for whom we want to generate recommendations

left = []
right = []
R = []
precision_y = []
recall_x = []

for u in user2:
    user_index = data[data.user == u].index.tolist()[0]  # Get the frame index
    # print('user index is: ', user_index)
    known_user_likes = data_items.loc[user_index]
    known_user_likes = known_user_likes[known_user_likes > 0].index.values
    print('user', user_index, 'likes', known_user_likes, '\n')

    for j in range(0, len(d['itemsets'])):
        f_s = d['itemsets'][j]
        # print('Recommendation', j, 'is: ', f_s)
        LHS = f_s
        RHS = f_s
        # First element of the itemset as antecedent, last as consequent
        # (note that frozenset iteration order is arbitrary).
        l, *_ = LHS
        *_, r = RHS
        # print(l)
        left.append(l)
        right.append(r)
        # for index in range(1, len(Selected_users_association_IG)):
        #     if l in set(Selected_users_association_IG[index]):
        #         print(l, 'exist')  # LHS in user and if LHS present recommend
        if l in set(known_user_likes):  # if the antecedent is among the user's likes, recommend the consequent
            print('user', user_index, 'gets recommendation:', r)
            R.append(r)

    # Per-user precision/recall; NOTE: RHS here is the itemset left over from the
    # last loop iteration above.
    precision = len(set(known_user_likes).intersection(set(RHS))) / len(set(RHS))
    Recall = len(set(known_user_likes).intersection(set(RHS))) / len(known_user_likes)
    print('precision of user:', user_index, 'is', precision)
    print('Recall of user:', user_index, 'is', Recall)
    precision_y.append(precision)
    recall_x.append(Recall)

print(precision_y)
print(recall_x)

"""
Yana = []
plt.plot(x, Y40)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.yscale('linear')
plt.grid(False)
plt.show()
"""

print('#############################################Accuracy plot###########################################\n\n')
x_new = np.asarray([0.2, 0.4, 0.6, 0.8, 1.0])
"""
fig = plt.figure()
ax = plt.subplot(111)
xnew = np.linspace(x_new.min(), x_new.max(), 300)  # 300 points between x_new.min() and x_new.max()
spl = make_interp_spline(x_new, precision, k=2)  # BSpline object
# spl1 = make_interp_spline(x_new, YANA_doctor, k=3)  # BSpline object
power_smooth = spl(xnew)
# power_smooth1 = spl1(xnew)
plt.xlabel('Recall')
plt.ylabel('Precision')
# blue_patch = mpatches.Patch(color='blue', label='Proposed')
# plt.legend(handles=[blue_patch])
# red_patch = mpatches.Patch(color='red', label='YANA')
# plt.legend(handles=[red_patch])
ax.plot(xnew, power_smooth, 'c--', label='K = 40')
# ax.plot(xnew, power_smooth1, label='YANA')
ax.legend()
plt.title('Deezer')
plt.show()

# Similarity = 1 - (len(set(y).intersection(set(Y40))) / len(set(y).union(set(Y40))))  # measures similarity between sets
# print(Similarity)
"""
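
# A minimal sketch (not called in the pipeline) of the apriori call used above:
# on this toy one-hot frame, min_support=0.5 keeps both single genres and the
# {'Pop', 'Dance'} pair. The toy values are illustrative, not from result.csv.
def _sketch_apriori():
    toy = pd.DataFrame({'Pop': [True, True, True, False],
                        'Dance': [True, True, False, False]})
    # Returns a frame with 'support' and 'itemsets' (frozensets) columns.
    return apriori(toy, min_support=0.5, use_colnames=True, max_len=2)
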
print('#############################################deezer group plot###########################################\n\n')
fig = plt.figure()
ax = plt.subplot(111)
xnew = np.linspace(x_new.min(), x_new.max(), 300)  # 300 points between x_new.min() and x_new.max()
# spl = make_interp_spline(x_new, precision, k=2)  # BSpline object
# spl1 = make_interp_spline(x_new, precision, k=2)  # BSpline object
# spl2 = make_interp_spline(x_new, precision, k=2)  # BSpline object
# NOTE: make_interp_spline needs one y value per point in x_new, and `precision`
# from the loop above is a scalar. As a labeled assumption, the per-user
# precision values are averaged into len(x_new) bins so this plot can run.
precision_avg = [float(np.mean(chunk)) for chunk in np.array_split(np.asarray(precision_y), len(x_new))]
spl3 = make_interp_spline(x_new, precision_avg, k=2)  # BSpline object
# power_smooth = spl(xnew)
# power_smooth1 = spl1(xnew)
# power_smooth2 = spl2(xnew)
power_smooth3 = spl3(xnew)
plt.xlabel('Recall')
plt.ylabel('Precision')
# blue_patch = mpatches.Patch(color='blue', label='Proposed')
# plt.legend(handles=[blue_patch])
# red_patch = mpatches.Patch(color='red', label='YANA')
# plt.legend(handles=[red_patch])
# ax.plot(xnew, power_smooth, 'b--', label='K=10')
# ax.plot(xnew, power_smooth1, 'm--', label='K=20')
# ax.plot(xnew, power_smooth2, 'g--', label='K=30')
ax.plot(xnew, power_smooth3, 'c--', label='K=40')
ax.legend()
plt.title('Deezer')
plt.show()

"""
print('#############################################Similarity plot###########################################\n\n')
x_new1 = np.asarray([50, 80, 150, 200])
xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300)  # 300 points between x_new1.min() and x_new1.max()
Sim_time = [0.2, 0.4, 0.7, 0.95]
spl = make_interp_spline(x_new1, Sim_time, k=3)  # BSpline object
power_smooth2 = spl(xnew1)
plt.title('Computation cost of similarity calculation')
plt.xlabel('Items')
plt.ylabel('Time (in seconds)')
plt.plot(xnew1, power_smooth2)
plt.show()
"""

print('#############################################Recommendation plot###########################################\n\n')
"""
x_new1 = np.asarray([50, 80, 150, 200])
xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300)  # 300 points between x_new1.min() and x_new1.max()
Sim_time = [0.17, 0.30, 0.53, 0.71]
spl = make_interp_spline(x_new1, Sim_time, k=3)  # BSpline object
power_smooth2 = spl(xnew1)
plt.title('Computation cost of recommendation generation')
plt.xlabel('Number of items')
plt.ylabel('Time (in seconds)')
plt.plot(xnew1, power_smooth2)
plt.show()
"""

print('#############################################comparison rec_sim###########################################\n\n')
"""
x_new1 = np.asarray([50, 80, 150, 200])
xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300)  # 300 points between x_new1.min() and x_new1.max()
Sim_time = [0.17, 0.30, 0.53, 0.71]
spl = make_interp_spline(x_new1, Sim_time, k=3)  # BSpline object
# spl1 = make_interp_spline(x_new, , k=3)  # BSpline object
power_smooth2 = spl(xnew1)
plt.title('Computation cost of recommendation generation')
plt.xlabel('Number of items')
plt.ylabel('Time (in seconds)')
plt.plot(xnew1, power_smooth2)
plt.show()

total_time = time.time() - start
print(total_time)
"""

"""
x_new1 = np.asarray([2, 3, 4, 5])
xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300)  # 300 points between x_new1.min() and x_new1.max()
Sim_cluster = [0.6, 0.3, 0.29, 0.32]
spl = make_interp_spline(x_new1, Sim_cluster, k=3)  # BSpline object
power_smooth2 = spl(xnew1)
plt.title('Interest group cluster analysis')
plt.xlabel('Interest groups k')
plt.ylabel('Similarity')
plt.plot(xnew1, power_smooth2)
plt.show()
"""
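
# A minimal sketch (not called in the pipeline) of the smoothing used by the
# plots: make_interp_spline fits a B-spline through a few measured points and
# evaluates it on a dense grid. The four (x, y) points are illustrative only.
def _sketch_spline_smoothing():
    x = np.asarray([50.0, 80.0, 150.0, 200.0])
    y = np.asarray([0.2, 0.4, 0.7, 0.95])
    dense = np.linspace(x.min(), x.max(), 300)
    spline = make_interp_spline(x, y, k=3)  # cubic B-spline interpolant
    return dense, spline(dense)
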
print('#############################################comparison with yana###########################################\n\n')
# NOTE: `YANA_TvEnt` (the YANA baseline precision series) is not defined anywhere
# in this script, so this final comparison plot cannot run as-is; it is disabled
# here in the same way as the other experimental plot blocks above.
"""
fig = plt.figure()
ax = plt.subplot(111)
xnew = np.linspace(x_new.min(), x_new.max(), 300)  # 300 points between x_new.min() and x_new.max()
spl = make_interp_spline(x_new, precision, k=2)  # BSpline object
spl1 = make_interp_spline(x_new, YANA_TvEnt, k=2)  # BSpline object
power_smooth = spl(xnew)
power_smooth1 = spl1(xnew)
plt.xlabel('Recall')
plt.ylabel('Precision')
# blue_patch = mpatches.Patch(color='blue', label='Proposed')
# plt.legend(handles=[blue_patch])
# red_patch = mpatches.Patch(color='red', label='YANA')
# plt.legend(handles=[red_patch])
ax.plot(xnew, power_smooth, 'b--', label='Deezer')
ax.plot(xnew, power_smooth1, 'r--', label='Yana')
ax.legend()
plt.title('TvEnt')
plt.show()
"""
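
# A minimal sketch (not called in the pipeline) of the per-user evaluation above:
# precision is the fraction of recommended items the user already likes, and
# recall is the fraction of the user's likes that were recommended. Toy sets only.
def _sketch_precision_recall():
    likes = {'Pop', 'Dance', 'Jazz'}  # items the user is known to like
    recommended = {'Pop', 'Rock'}     # items the rules recommended
    hits = likes & recommended
    precision = len(hits) / len(recommended)  # 1/2
    recall = len(hits) / len(likes)           # 1/3
    return precision, recall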