# PPRM.py — frequent-pattern based music-genre recommendation on the Deezer friendship dataset.
  1. import json
  2. #from pprint import pprint
  3. import csv
  4. from collections import Counter
  5. from sklearn.metrics.pairwise import cosine_similarity
  6. from mlxtend.frequent_patterns import apriori
  7. from mlxtend.preprocessing import TransactionEncoder
  8. import pandas as pd
  9. from scipy import sparse
  10. import numpy as np
###################### DATASET INFORMATION ##########################################
# The data was collected from the music streaming service Deezer (November 2017).
# These datasets represent friendship networks of users from 3 European countries.
# Nodes represent the users and edges are the mutual friendships. We reindexed the
# nodes in order to achieve a certain level of anonymity. The csv files contain the
# edges -- nodes are indexed from 0. The json files contain the genre preferences of
# users -- each key is a user id, the genres loved are given as lists. Genre notations
# are consistent across users. In each dataset users could like 84 distinct genres.
# Liked genre lists were compiled based on the liked song lists. The countries included
# are Romania, Croatia and Hungary. For each dataset we listed the number of nodes and edges.
  21. with open('RO_genres.json') as data_file:
  22. data = json.load(data_file)
  23. '#print(data.keys())'
  24. users = [] # Users in the network who uses the service
  25. items = [] # Items liked by users in the network
  26. recommendations = [] # Recommendations generated to the users after mining frequent itemsets
  27. for key in data.keys(): # Retreiving the ID of each user
  28. users.append(key)
  29. for val in data.values(): # Retrieving the ITEMS liked by each user in the network
  30. items.append(val)
  31. '#print(users)'
  32. '#Users in the network, for example:'
  33. #['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',...,'41772']
  34. '#print(items)'
  35. '#Items liked by all te users in the network, for example:'
  36. #['Dance', 'Soul & Funk', 'Pop', 'Musicals', 'Contemporary R&B', 'Indie Pop', 'Alternative'],
  37. res = items
  38. my_df = pd.DataFrame(res)
  39. my_df.to_csv('out.csv', index=False, header=False)
  40. '#print(my_df.head())'
  41. '# Transposing the items and users into Binary matrix'
  42. te = TransactionEncoder()
  43. te_ary = te.fit(items).transform(items)
  44. df = pd.DataFrame(te_ary, columns=te.columns_)
  45. '#print(df.head())'
  46. '# prints the Binary matrix elements, for example:'
  47. # Acoustic Blues African Music ... Vocal jazz West Coast
  48. # 0 False False ... False False
  49. # 1 False False ... False False
  50. # 2 False False ... False False
  51. # 3 False False ... False False
  52. # 4 False False ... False False
  53. '#print(te.columns_)'
  54. # Resulting binary matrix to csv file
  55. res = df
  56. my_df = pd.DataFrame(res)
  57. my_df.to_csv('result.csv', index=True, header=True)
  58. data = pd.read_csv('result.csv')
  59. data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
  60. '#print(data.head())'
  61. '# prints the Binary matrix elements in result.csv, for example:'
  62. # user Acoustic Blues ... Vocal jazz West Coast
  63. # 0 0 False ... False False
  64. # 1 1 False ... False False
  65. # 2 2 False ... False False
  66. # 3 3 False ... False False
  67. # 4 4 False ... False False
  68. data_items = data.drop('user', 1)
  69. print('Dimension of loaded data is:', np.ndim(data_items))
  70. interest_group_centroids = [] # cluster centriods on which the interest groups are formed
  71. interest_groups = [] # Most similar items for each centroid in the interest group
  72. items_len = len(data_items.columns) # lengh of the items in the dataset
  73. length = [] # stores the index of the centroids
  74. print('\n\n#########################################CENTROIDS#####################################################\n\n')
  75. p = (items_len-1) // 5
  76. r = p
  77. length.append(p)
  78. for index in range(0, 4):
  79. items_len = int(round(r + p))
  80. r = items_len
  81. length.append(items_len)
  82. '#print(length)'
  83. '#Index of the centroid elements, for example:'
  84. #[16, 32, 48, 64, 80]
  85. '# Calculating the centroids based on the length of the items in the DATASET: result.csv'
  86. for index in length: # for each centroid in the length
  87. centroids = data_items.columns.values[index]
  88. interest_group_centroids.append(centroids)
  89. #print('The Centroids are = ', interest_group_centroids, '\n\n')
  90. #For example: The Centroids are = ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
  91. print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
  92. '# As a first step we normalize the user vectors to unit vectors.'
  93. magnitude = np.sqrt(np.square(data_items).sum(axis=1))
  94. data_items = data_items.divide(magnitude, axis='index')
  95. '#print(data_items.head(5))'
  96. def calculate_similarity(data_items):
  97. data_sparse = sparse.csr_matrix(data_items)
  98. similarities = cosine_similarity(data_sparse.transpose())
  99. '#print(similarities)'
  100. sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
  101. return sim
  102. '# Build the similarity matrix'
  103. data_matrix = calculate_similarity(data_items)
  104. '#print(data_matrix.head())'
  105. #''prints the item-item similarity matrix for all items in DATASET, for example:'
  106. # Acoustic Blues ... West Coast
  107. # Acoustic Blues 1.000000 ... 0.000000
  108. # African Music 0.044191 ... 0.005636
  109. # Alternative 0.008042 ... 0.028171
  110. # Alternative Country 0.037340 ... 0.011230
  111. # Asian Music 0.000000 ... 0.004623
  112. print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
  113. for i in interest_group_centroids:
  114. Interest_group = data_matrix.loc[i].nlargest(p).index.values
  115. print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
  116. interest_groups.append(Interest_group)
  117. '#print(interest_groups)'
  118. print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
  119. d = apriori(df, min_support=0.2, use_colnames=True, max_len=2)
  120. print((d['itemsets']))
  121. print('#############################################USERS & THEIR LIKES###########################################\n\n')
  122. user = [41000] # The id of the user for whom we want to generate recommendations
  123. user_index = data[data.user == user].index.tolist()[0] # Get the frame index
  124. #print('user index is: ', user_index)'
  125. known_user_likes = data_items.ix[user_index]
  126. known_user_likes = known_user_likes[known_user_likes > 0].index.values
  127. print('user', user_index, 'likes', known_user_likes, '\n')
  128. print('#############################################USERS ASSOCIATION TO INTEREST GROUPS##########################\n\n')
  129. # for i in interest_groups:
  130. # a_vals = Counter(i)
  131. # b_vals = Counter(known_user_likes)
  132. #
  133. # # convert to word-vectors, for Example:
  134. # # [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  135. # # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  136. # # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  137. # # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  138. #
  139. # words = list(a_vals.keys() | b_vals.keys())
  140. # a_vect = [a_vals.get(word, 0) for word in words]
  141. # b_vect = [b_vals.get(word, 0) for word in words]
  142. # # find cosine
  143. # len_a = sum(av * av for av in a_vect) ** 0.5
  144. # len_b = sum(bv * bv for bv in b_vect) ** 0.5
  145. # dot = sum(av * bv for av, bv in zip(a_vect, b_vect))
  146. # cosine = dot / (len_a * len_b)
  147. #
  148. # if cosine == 0:
  149. # pass
  150. # else:
  151. # #print('User:', user_index, 'is associated to the Interest group:', i, 'with similarity:', cosine)
  152. # print('')
  153. Selected_users_association_IG = [user]
  154. for i in interest_groups:
  155. interest_groups_set = set(i)
  156. user_likes_set = set(known_user_likes)
  157. sim_num = user_likes_set.intersection(interest_groups_set)
  158. sim_den = user_likes_set.union(interest_groups_set)
  159. sim = len(sim_num)/len(sim_den)
  160. if sim > 0:
  161. g = 'User:', user_index, 'is associated to the Interest group:', i, 'with similarity:', sim
  162. ass_interest_groups = i
  163. Selected_users_association_IG.append(ass_interest_groups.tolist())
  164. print(Selected_users_association_IG[1])
  165. #user_likes_set.intersection(interest_groups_set)
  166. print('\n\n#########################################CLIENT_SIDE_RECOMMENDATIONS###################################\n\n')
  167. left = []
  168. right = []
  169. for i in range(0, len(d['itemsets'])):
  170. f_s = d['itemsets'][i]
  171. #print('Recommendation', i, 'is: ', f_s)
  172. LHS = f_s
  173. RHS = f_s
  174. l, *_ = LHS
  175. *_, r = RHS
  176. print(l)
  177. left.append(l)
  178. right.append(r)
  179. for index in range(1, len(Selected_users_association_IG)):
  180. if l in set(Selected_users_association_IG[index]):
  181. #print(l,'exist')# LHS in user and if LHS present recommend
  182. if l in set(known_user_likes):
  183. print('user', user_index, 'gets recommendation:', r)
  184. #print('Items to be checked in users list', l, '\n')
  185. #print('If item', l, 'is present', 'recommend: ', r, '\n')