# transpose.py — Deezer genre-preference recommender experiment.
  1. import json
  2. import matplotlib.pyplot as plt
  3. #from pprint import pprint
  4. import csv
  5. from collections import Counter
  6. from sklearn.metrics.pairwise import cosine_similarity
  7. from mlxtend.frequent_patterns import apriori
  8. from mlxtend.preprocessing import TransactionEncoder
  9. import pandas as pd
  10. from scipy import sparse
  11. import numpy as np
  12. import time
  13. """
  14. ######################DATASET INFORMATION##########################################
  15. The data was collected from the music streaming service Deezer (November 2017).
  16. These datasets represent friendship networks of users from 3 European countries.
  17. Nodes represent the users and edges are the mutual friendships. We reindexed the
  18. nodes in order to achieve a certain level of anonimity. The csv files contain the
  19. edges -- nodes are indexed from 0. The json files contain the genre preferences of
  20. users -- each key is a user id, the genres loved are given as lists. Genre notations
  21. are consistent across users.In each dataset users could like 84 distinct genres.
  22. Liked genre lists were compiled based on the liked song lists. The countries included
  23. are Romania, Croatia and Hungary. For each dataset we listed the number of nodes an edges.
  24. """
  25. with open('RO_genres.json') as data_file:
  26. data = json.load(data_file)
  27. '#print(data.keys())'
  28. users = [] # Users in the network who uses the service
  29. items = [] # Items liked by users in the network
  30. recommendations = [] # Recommendations generated to the users after mining frequent itemsets
  31. for key in data.keys(): # Retreiving the ID of each user
  32. users.append(key)
  33. for val in data.values(): # Retrieving the ITEMS liked by each user in the network
  34. items.append(val)
  35. '#print(users)'
  36. '#Users in the network, for example:'
  37. #['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',...,'41772']
  38. '#print(items)'
  39. '#Items liked by all te users in the network, for example:'
  40. #['Dance', 'Soul & Funk', 'Pop', 'Musicals', 'Contemporary R&B', 'Indie Pop', 'Alternative'],
  41. res = items
  42. my_df = pd.DataFrame(res)
  43. my_df.to_csv('out.csv', index=False, header=False)
  44. '#print(my_df.head())'
  45. '# Transposing the items and users into Binary matrix'
  46. te = TransactionEncoder()
  47. te_ary = te.fit(items).transform(items)
  48. df = pd.DataFrame(te_ary, columns=te.columns_)
  49. '#print(df.head())'
  50. '# prints the Binary matrix elements, for example:'
  51. # Acoustic Blues African Music ... Vocal jazz West Coast
  52. # 0 False False ... False False
  53. # 1 False False ... False False
  54. # 2 False False ... False False
  55. # 3 False False ... False False
  56. # 4 False False ... False False
  57. '#print(te.columns_)'
  58. # Resulting binary matrix to csv file
  59. res = df
  60. my_df = pd.DataFrame(res)
  61. my_df.to_csv('result.csv', index=True, header=True)
  62. data = pd.read_csv('result.csv')
  63. data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
  64. '#print(data.head())'
  65. '# prints the Binary matrix elements in result.csv, for example:'
  66. # user Acoustic Blues ... Vocal jazz West Coast
  67. # 0 0 False ... False False
  68. # 1 1 False ... False False
  69. # 2 2 False ... False False
  70. # 3 3 False ... False False
  71. # 4 4 False ... False False
  72. data_items = data.drop('user', 1)
  73. print('Dimension of loaded data is:', np.ndim(data_items))
  74. interest_group_centroids = [] # cluster centriods on which the interest groups are formed
  75. interest_groups = [] # Most similar items for each centroid in the interest group
  76. items_len = len(data_items.columns) # lengh of the items in the dataset
  77. length = [] # stores the index of the centroids
  78. print(items_len)
  79. print('\n\n#########################################CENTROIDS#####################################################\n\n')
  80. p = (items_len-1) // 6
  81. r = p
  82. length.append(p)
  83. for index in range(0, 3):
  84. items_len = int(round(r + p))
  85. r = items_len
  86. length.append(items_len)
  87. '#print(length)'
  88. '#Index of the centroid elements, for example:'
  89. #[16, 32, 48, 64, 80]
  90. '# Calculating the centroids based on the length of the items in the DATASET: result.csv'
  91. for index in length: # for each centroid in the length
  92. centroids = data_items.columns.values[index]
  93. interest_group_centroids.append(centroids)
  94. #print('The Centroids are = ', interest_group_centroids, '\n\n')
  95. #For example: The Centroids are = ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
  96. print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
  97. start_time = time.time()
  98. '# As a first step we normalize the user vectors to unit vectors.'
  99. magnitude = np.sqrt(np.square(data_items).sum(axis=1))
  100. data_items = data_items.divide(magnitude, axis='index')
  101. '#print(data_items.head(5))'
  102. def calculate_similarity(data_items):
  103. data_sparse = sparse.csr_matrix(data_items)
  104. similarities = cosine_similarity(data_sparse.transpose())
  105. '#print(similarities)'
  106. sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
  107. return sim
  108. '# Build the similarity matrix'
  109. data_matrix = calculate_similarity(data_items)
  110. '#print(data_matrix.head())'
  111. #''prints the item-item similarity matrix for all items in DATASET, for example:'
  112. # Acoustic Blues ... West Coast
  113. # Acoustic Blues 1.000000 ... 0.000000
  114. # African Music 0.044191 ... 0.005636
  115. # Alternative 0.008042 ... 0.028171
  116. # Alternative Country 0.037340 ... 0.011230
  117. # Asian Music 0.000000 ... 0.004623
  118. print("--- %s seconds ---" % (time.time() - start_time))
  119. print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
  120. for i in interest_group_centroids:
  121. Interest_group = data_matrix.loc[i].nlargest(p).index.values
  122. print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
  123. interest_groups.append(Interest_group)
  124. '#print(interest_groups)'
  125. print(set(interest_groups[1]).intersection(interest_groups[3]))
  126. print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
  127. start_time = time.time()
  128. d = apriori(df, min_support=0.2, use_colnames=True, max_len=5)
  129. print((d['itemsets']))
  130. print("--- %s seconds ---" % (time.time() - start_time))
  131. print('#############################################USERS & THEIR LIKES###########################################\n\n')
  132. user = [2222] # The id of the user for whom we want to generate recommendations
  133. user_index = data[data.user == user].index.tolist()[0] # Get the frame index
  134. #print('user index is: ', user_index)'
  135. known_user_likes = data_items.ix[user_index]
  136. known_user_likes = known_user_likes[known_user_likes > 0].index.values
  137. print('user', user_index, 'likes', known_user_likes, '\n')
  138. print('#############################################USERS ASSOCIATION TO INTEREST GROUPS##########################\n\n')
  139. # for i in interest_groups:
  140. # a_vals = Counter(i)
  141. # b_vals = Counter(known_user_likes)
  142. #
  143. # # convert to word-vectors, for Example:
  144. # # [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  145. # # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  146. # # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
  147. # # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  148. #
  149. # words = list(a_vals.keys() | b_vals.keys())
  150. # a_vect = [a_vals.get(word, 0) for word in words]
  151. # b_vect = [b_vals.get(word, 0) for word in words]
  152. # # find cosine
  153. # len_a = sum(av * av for av in a_vect) ** 0.5
  154. # len_b = sum(bv * bv for bv in b_vect) ** 0.5
  155. # dot = sum(av * bv for av, bv in zip(a_vect, b_vect))
  156. # cosine = dot / (len_a * len_b)
  157. #
  158. # if cosine == 0:
  159. # pass
  160. # else:
  161. # #print('User:', user_index, 'is associated to the Interest group:', i, 'with similarity:', cosine)
  162. # print('')
  163. Selected_users_association_IG = [user]
  164. for i in interest_groups:
  165. interest_groups_set = set(i)
  166. user_likes_set = set(known_user_likes)
  167. sim_num = user_likes_set.intersection(interest_groups_set)
  168. sim_den = user_likes_set.union(interest_groups_set)
  169. sim = len(sim_num)/len(sim_den)
  170. if sim > 0:
  171. g = 'User:', user_index, 'is associated to the Interest group:', i, 'with similarity:', sim
  172. ass_interest_groups = i
  173. Selected_users_association_IG.append(ass_interest_groups.tolist())
  174. print(Selected_users_association_IG[1])
  175. #user_likes_set.intersection(interest_groups_set)
  176. print('\n\n#########################################CLIENT_SIDE_RECOMMENDATIONS###################################\n\n')
  177. left = []
  178. right = []
  179. R = []
  180. for i in range(0, len(d['itemsets'])):
  181. f_s = d['itemsets'][i]
  182. #print('Recommendation', i, 'is: ', f_s
  183. LHS = f_s
  184. RHS = f_s
  185. l, *_ = LHS
  186. *_, r = RHS
  187. #print(l)
  188. left.append(l)
  189. right.append(r)
  190. #for index in range(1, len(Selected_users_association_IG)):
  191. #if l in set(Selected_users_association_IG[index]):
  192. #print(l,'exist')# LHS in user and if LHS present recommend
  193. if l in set(known_user_likes):
  194. print('user', user_index, 'gets recommendation:', r)
  195. R.append(r)
  196. precision = len(set(known_user_likes).intersection(set(R))) / len(set(R))
  197. Recall = len(set(known_user_likes).intersection(set(R))) / len(known_user_likes)
  198. #print('Items to be checked in users list', l, '\n')
  199. #print('If item', l, 'is present', 'recommend: ', r, '\n')