123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385 |
- import json
- import matplotlib.pyplot as plt
- import matplotlib.patches as mpatches
- #from pprint import pprint
- import csv
- from collections import Counter
- from sklearn.metrics.pairwise import cosine_similarity
- from mlxtend.frequent_patterns import apriori
- from mlxtend.preprocessing import TransactionEncoder
- import pandas as pd
- from scipy import sparse
- import numpy as np
- import time
- import random
- from scipy.interpolate import make_interp_spline, BSpline
- import seaborn as sns
- import matplotlib.pyplot
- from sklearn.cluster import KMeans
- """
- ######################DATASET INFORMATION##########################################
- The data was collected from the music streaming service Deezer (November 2017).
- These datasets represent friendship networks of users from 3 European countries.
- Nodes represent the users and edges are the mutual friendships. We reindexed the
nodes in order to achieve a certain level of anonymity. The csv files contain the
- edges -- nodes are indexed from 0. The json files contain the genre preferences of
- users -- each key is a user id, the genres loved are given as lists. Genre notations
are consistent across users. In each dataset users could like 84 distinct genres.
- Liked genre lists were compiled based on the liked song lists. The countries included
are Romania, Croatia and Hungary. For each dataset we listed the number of nodes and edges.
- """
start = time.time()

# Load the Deezer genre-preference map: {user_id: [liked genres, ...]}.
with open('RO_genres.json') as data_file:
    data = json.load(data_file)

users = list(data.keys())    # user ids in the network, e.g. ['0', '1', ..., '41772']
items = list(data.values())  # liked-genre lists, e.g. ['Dance', 'Pop', 'Alternative']
recommendations = []         # recommendations produced after mining frequent itemsets

# Dump the raw liked-genre lists for later inspection.
res = items
my_df = pd.DataFrame(res)
my_df.to_csv('out.csv', index=False, header=False)
- '#print(my_df.head())'
- '# Transposing the items and users into Binary matrix'
- te = TransactionEncoder()
- te_ary = te.fit(items).transform(items)
- df = pd.DataFrame(te_ary, columns=te.columns_)
- '#print(df.head())'
- '# prints the Binary matrix elements, for example:'
- # Acoustic Blues African Music ... Vocal jazz West Coast
- # 0 False False ... False False
- # 1 False False ... False False
- # 2 False False ... False False
- # 3 False False ... False False
- # 4 False False ... False False
- '#print(te.columns_)'
- # Resulting binary matrix to csv file
- res = df
- my_df = pd.DataFrame(res)
- my_df.to_csv('result.csv', index=True, header=True)
- data = pd.read_csv('result.csv')
- data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
- '#print(data.head())'
- '# prints the Binary matrix elements in result.csv, for example:'
- # user Acoustic Blues ... Vocal jazz West Coast
- # 0 0 False ... False False
- # 1 1 False ... False False
- # 2 2 False ... False False
- # 3 3 False ... False False
- # 4 4 False ... False False
- data_items = data.drop('user', 1)
- print('Dimension of loaded data is:', np.ndim(data_items))
- interest_group_centroids = [] # cluster centriods on which the interest groups are formed
- interest_groups = [] # Most similar items for each centroid in the interest group
- items_len = len(data_items.columns) # lengh of the items in the dataset
- length = [] # stores the index of the centroids
- print(items_len)
print('\n\n#########################################CENTROIDS#####################################################\n\n')
# Pick five evenly spaced column indices to serve as interest-group
# centroids, e.g. [16, 32, 48, 64, 80] for the 84-genre dataset.
p = (items_len - 1) // 5
length.extend(p * step for step in range(1, 6))
r = items_len = 5 * p  # mirror the running totals the original accumulation left behind
# Map each centroid index to its item (column) name.
for idx in length:
    centroids = data_items.columns.values[idx]
    interest_group_centroids.append(centroids)
# For example: ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
start_time = time.time()
# Normalize every user (row) vector to unit length so cosine similarity
# reduces to a dot product.
# FIX: users with no liked genres have magnitude 0; dividing by 0 turned
# their rows into NaN and poisoned the similarity matrix downstream, so
# substitute 1 for zero magnitudes (those rows simply stay all-zero).
magnitude = np.sqrt(np.square(data_items).sum(axis=1))
data_items = data_items.divide(magnitude.replace(0, 1), axis='index')
def calculate_similarity(data_items):
    """Return the item-item cosine-similarity matrix.

    Parameters
    ----------
    data_items : pandas.DataFrame
        Numeric users x items matrix (one column per item/genre).

    Returns
    -------
    pandas.DataFrame
        Square items x items matrix of cosine similarities, indexed and
        labelled by the item (column) names of ``data_items``.
    """
    # The input is already a dense float DataFrame at this point, so the
    # column-wise cosine similarity is computed directly with numpy instead
    # of round-tripping through a scipy CSR matrix and sklearn.
    matrix = np.asarray(data_items, dtype=float)
    norms = np.linalg.norm(matrix, axis=0)
    # Items nobody likes have zero norm; divide by 1 so their similarity
    # row/column is all zeros instead of NaN.
    unit = matrix / np.where(norms == 0.0, 1.0, norms)
    similarities = unit.T @ unit
    return pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
# Build the item-item similarity matrix once, timing the computation.
data_matrix = calculate_similarity(data_items)
end_time = time.time()
print("the similarity computation time is--- %s seconds ---" % (end_time - start_time))
# data_matrix holds one row/column per item, for example:
#                 Acoustic Blues  ...  West Coast
# Acoustic Blues        1.000000  ...    0.000000
# African Music         0.044191  ...    0.005636
# Alternative           0.008042  ...    0.028171
print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
# For every centroid, collect the p most similar items as its interest group.
# FIX: use enumerate instead of interest_group_centroids.index(i) inside the
# loop -- index() rescans the list each iteration and returns the wrong
# position whenever two centroids share the same item name.
for group_no, centroid in enumerate(interest_group_centroids):
    Interest_group = data_matrix.loc[centroid].nlargest(p).index.values
    print('Interest group', group_no, ' = ', Interest_group, '\n')
    interest_groups.append(Interest_group)
# Jaccard similarity between interest groups 1 and 2 (cluster-overlap diagnostic).
sim_clusters = len(set(interest_groups[1]).intersection(interest_groups[2])) / len(set(interest_groups[1]).union(interest_groups[2]))
print(sim_clusters)
print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
# Mine frequent itemsets of size <= 2 at 80% minimum support; each pair is
# later treated as an association rule (first element -> last element).
start_time = time.time()
d = apriori(df, min_support=0.8, use_colnames=True, max_len=2)
print(d['itemsets'])
print("--- %s seconds ---" % (time.time() - start_time))
print('#############################################USERS & THEIR LIKES###########################################\n\n')
# FIX: the id must be a scalar, not a one-element list -- comparing a pandas
# Series against a length-1 list raises ValueError (length mismatch).
user = 2222  # the id of the user for whom we want to generate recommendations
user_index = data[data.user == user].index.tolist()[0]  # get the frame index
# FIX: DataFrame.ix was removed in pandas 1.0; .loc performs the same
# label lookup on the RangeIndex here.
known_user_likes = data_items.loc[user_index]
known_user_likes = known_user_likes[known_user_likes > 0].index.values
print('user', user_index, 'likes', known_user_likes, '\n')

# Sample 20 random user ids on which to evaluate the recommender.
groups = random.sample(data.user.tolist(), 20)
print(groups)
user2 = groups    # the ids of the users for whom we want to generate recommendations
left = []         # LHS item of every mined itemset (re-appended for every user)
right = []        # RHS item of every mined itemset (re-appended for every user)
R = []            # every recommendation issued across all sampled users
precision_y = []  # per-user precision
recall_x = []     # per-user recall
for uid in user2:
    user_index = data[data.user == uid].index.tolist()[0]  # get the frame index
    # FIX: DataFrame.ix was removed in pandas 1.0; .loc is the equivalent lookup.
    known_user_likes = data_items.loc[user_index]
    known_user_likes = known_user_likes[known_user_likes > 0].index.values
    print('user', user_index, 'likes', known_user_likes, '\n')
    # FIX: the inner loop used `i` and shadowed the outer loop variable;
    # renamed for clarity (the outer iterator itself was unaffected).
    for j in range(0, len(d['itemsets'])):
        f_s = d['itemsets'][j]
        # Treat each frequent itemset as a rule: first element -> last element.
        LHS = f_s
        RHS = f_s
        l, *_ = LHS
        *_, r = RHS
        left.append(l)
        right.append(r)
        # If the user already likes the LHS item, recommend the RHS item.
        if l in set(known_user_likes):
            print('user', user_index, 'gets recommendation:', r)
            R.append(r)
    # NOTE(review): precision/recall are computed against RHS, which after the
    # loop is just the LAST mined itemset -- possibly R (all recommendations)
    # was intended; behaviour kept as-is, confirm with the author.
    precision = len(set(known_user_likes).intersection(set(RHS))) / len(set(RHS))
    # FIX: guard the recall denominator -- a sampled user with no likes would
    # otherwise raise ZeroDivisionError.
    likes_count = len(known_user_likes)
    Recall = (len(set(known_user_likes).intersection(set(RHS))) / likes_count) if likes_count else 0.0
    print('precision of user:', user_index, 'is', precision)
    print('Recall of user:', user_index, 'is', Recall)
    precision_y.append(precision)
    recall_x.append(Recall)
print(precision_y)
print(recall_x)
- """
- Yana = []
- plt.plot(x, Y40)
- plt.xlabel('Recall')
- plt.ylabel('Precision')
- plt.yscale('linear')
- plt.grid(False)
- plt.show()
- """
- print('#############################################Accuracy plot###########################################\n\n')
- x_new = np.asarray([0.2, 0.4, 0.6, 0.8, 1.0])
- """
- fig = plt.figure()
- ax = plt.subplot(111)
- xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
- spl = make_interp_spline(x_new, precision, k=2)#BSpline object
- #spl1 = make_interp_spline(x_new, YANA_doctor, k=3)#BSpline object
- power_smooth = spl(xnew)
- #power_smooth1 = spl1(xnew)
- plt.xlabel('Recall')
- plt.ylabel('Precision')
- #blue_patch = mpatches.Patch(color='blue', label='Proposed')
- #plt.legend(handles=[blue_patch])
- #red_patch = mpatches.Patch(color='red', label='YANA')
- #plt.legend(handles=[red_patch])
- ax.plot(xnew, power_smooth, 'c--', label='K = 40')
- #ax.plot(xnew, power_smooth1,label='YANA')
- ax.legend()
- plt.title('Deezer')
- plt.show()
- #Similarity = 1 - (len(set(y).intersection(set(Y40))) / len(set(y).union(set(Y40)))) # measures similarity between sets
- #print(Similarity)
- """
print('#############################################deezer group plot###########################################\n\n')
# Precision/recall plot for the sampled users, smoothed with a quadratic
# B-spline over the fixed recall grid x_new.
fig = plt.figure()
ax = plt.subplot(111)
xnew = np.linspace(x_new.min(), x_new.max(), 300)  # 300 points between x_new.min and x_new.max
# FIX: the original passed the scalar `precision` (the last user's value) to
# make_interp_spline, which requires len(x_new) y-samples and raised a
# ValueError. Plot the first len(x_new) measured precision values instead.
# NOTE(review): taking the first five samples is arbitrary -- confirm which
# precision samples this figure was meant to show.
y_vals = np.asarray(precision_y[:len(x_new)], dtype=float)
spl3 = make_interp_spline(x_new, y_vals, k=2)  # BSpline object
power_smooth3 = spl3(xnew)
plt.xlabel('Recall')
plt.ylabel('Precision')
ax.plot(xnew, power_smooth3, 'c--', label='K=40')
ax.legend()
plt.title('Deezer')
plt.show()
- """
- print('#############################################Similarity plot###########################################\n\n')
- x_new1 = np.asarray([50, 80, 150, 200])
- xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
- Sim_time = [0.2, 0.4, 0.7, 0.95]
- spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
- power_smooth2 = spl(xnew1)
- plt.title('Computation cost of similarity calculation')
- plt.xlabel('Items')
- plt.ylabel('Time (in seconds)')
- plt.plot(xnew1, power_smooth2)
- plt.show()
- """
- print('#############################################Recommendation plot###########################################\n\n')
- """
- x_new1 = np.asarray([50, 80, 150, 200])
- xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
- Sim_time = [0.17, 0.30, 0.53, 0.71]
- spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
- power_smooth2 = spl(xnew1)
- plt.title('Computation cost of recommendation generation')
- plt.xlabel('Number of items')
- plt.ylabel('Time (in seconds)')
- plt.plot(xnew1, power_smooth2)
- plt.show()
- """
- print('#############################################comparision rec_sim###########################################\n\n')
- """
- x_new1 = np.asarray([50, 80, 150, 200])
- xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
- Sim_time = [0.17, 0.30, 0.53, 0.71]
- spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
- #spl1 = make_interp_spline(x_new, , k=3)#BSpline object
- power_smooth2 = spl(xnew1)
- plt.title('Computation cost of recommendation generation')
- plt.xlabel('Number of items')
- plt.ylabel('Time (in seconds)')
- plt.plot(xnew1, power_smooth2)
- plt.show()
- total_time = time.time() - start
- print(total_time)
- """
- """
- x_new1 = np.asarray([2, 3, 4, 5])
- xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
- Sim_cluster = [0.6, 0.3, 0.29, 0.32]
- spl = make_interp_spline(x_new1, Sim_cluster, k=3)#BSpline object
- power_smooth2 = spl(xnew1)
- plt.title('Interest group cluster analysis')
- plt.xlabel('Interest groups k')
- plt.ylabel('Similarity')
- plt.plot(xnew1, power_smooth2)
- plt.show()
- """
print('#############################################comparision with yana###########################################\n\n')
# FIX: this final comparison plot always crashed -- `YANA_TvEnt` is never
# defined anywhere in this script (NameError), and `precision` is a scalar
# while make_interp_spline needs len(x_new) y-samples. It is disabled here
# as a triple-quoted draft, matching the other disabled plot sections above;
# re-enable once YANA_TvEnt data and a proper precision series exist.
"""
fig = plt.figure()
ax = plt.subplot(111)
xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
spl = make_interp_spline(x_new, precision, k=2)#BSpline object
spl1 = make_interp_spline(x_new, YANA_TvEnt, k=2)#BSpline object
power_smooth = spl(xnew)
power_smooth1 = spl1(xnew)
plt.xlabel('Recall')
plt.ylabel('Precision')
ax.plot(xnew, power_smooth, 'b--', label='Deezer')
ax.plot(xnew, power_smooth1, 'r--', label='Yana')
ax.legend()
plt.title('TvEnt')
plt.show()
"""
|