evaluation.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385
  1. import json
  2. import matplotlib.pyplot as plt
  3. import matplotlib.patches as mpatches
  4. #from pprint import pprint
  5. import csv
  6. from collections import Counter
  7. from sklearn.metrics.pairwise import cosine_similarity
  8. from mlxtend.frequent_patterns import apriori
  9. from mlxtend.preprocessing import TransactionEncoder
  10. import pandas as pd
  11. from scipy import sparse
  12. import numpy as np
  13. import time
  14. import random
  15. from scipy.interpolate import make_interp_spline, BSpline
  16. import seaborn as sns
  17. import matplotlib.pyplot
  18. from sklearn.cluster import KMeans
  19. """
  20. ######################DATASET INFORMATION##########################################
  21. The data was collected from the music streaming service Deezer (November 2017).
  22. These datasets represent friendship networks of users from 3 European countries.
  23. Nodes represent the users and edges are the mutual friendships. We reindexed the
  24. nodes in order to achieve a certain level of anonymity. The csv files contain the
  25. edges -- nodes are indexed from 0. The json files contain the genre preferences of
  26. users -- each key is a user id, the genres loved are given as lists. Genre notations
  27. are consistent across users. In each dataset users could like 84 distinct genres.
  28. Liked genre lists were compiled based on the liked song lists. The countries included
  29. are Romania, Croatia and Hungary. For each dataset we listed the number of nodes and edges.
  30. """
  31. start = time.time()
  32. with open('RO_genres.json') as data_file:
  33. data = json.load(data_file)
  34. '#print(data.keys())'
  35. users = [] # Users in the network who uses the service
  36. items = [] # Items liked by users in the network
  37. recommendations = [] # Recommendations generated to the users after mining frequent itemsets
  38. for key in data.keys(): # Retreiving the ID of each user
  39. users.append(key)
  40. for val in data.values(): # Retrieving the ITEMS liked by each user in the network
  41. items.append(val)
  42. '#print(users)'
  43. '#Users in the network, for example:'
  44. #['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',...,'41772']
  45. '#print(items)'
  46. '#Items liked by all te users in the network, for example:'
  47. #['Dance', 'Soul & Funk', 'Pop', 'Musicals', 'Contemporary R&B', 'Indie Pop', 'Alternative'],
  48. res = items
  49. my_df = pd.DataFrame(res)
  50. my_df.to_csv('out.csv', index=False, header=False)
  51. '#print(my_df.head())'
  52. '# Transposing the items and users into Binary matrix'
  53. te = TransactionEncoder()
  54. te_ary = te.fit(items).transform(items)
  55. df = pd.DataFrame(te_ary, columns=te.columns_)
  56. '#print(df.head())'
  57. '# prints the Binary matrix elements, for example:'
  58. # Acoustic Blues African Music ... Vocal jazz West Coast
  59. # 0 False False ... False False
  60. # 1 False False ... False False
  61. # 2 False False ... False False
  62. # 3 False False ... False False
  63. # 4 False False ... False False
  64. '#print(te.columns_)'
  65. # Resulting binary matrix to csv file
  66. res = df
  67. my_df = pd.DataFrame(res)
  68. my_df.to_csv('result.csv', index=True, header=True)
  69. data = pd.read_csv('result.csv')
  70. data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
  71. '#print(data.head())'
  72. '# prints the Binary matrix elements in result.csv, for example:'
  73. # user Acoustic Blues ... Vocal jazz West Coast
  74. # 0 0 False ... False False
  75. # 1 1 False ... False False
  76. # 2 2 False ... False False
  77. # 3 3 False ... False False
  78. # 4 4 False ... False False
  79. data_items = data.drop('user', 1)
  80. print('Dimension of loaded data is:', np.ndim(data_items))
  81. interest_group_centroids = [] # cluster centriods on which the interest groups are formed
  82. interest_groups = [] # Most similar items for each centroid in the interest group
  83. items_len = len(data_items.columns) # lengh of the items in the dataset
  84. length = [] # stores the index of the centroids
  85. print(items_len)
  86. print('\n\n#########################################CENTROIDS#####################################################\n\n')
  87. p = (items_len-1) // 5
  88. r = p
  89. length.append(p)
  90. for index in range(0, 4):
  91. items_len = int(round(r + p))
  92. r = items_len
  93. length.append(items_len)
  94. '#print(length)'
  95. '#Index of the centroid elements, for example:'
  96. #[16, 32, 48, 64, 80]
  97. '# Calculating the centroids based on the length of the items in the DATASET: result.csv'
  98. for index in length: # for each centroid in the length
  99. centroids = data_items.columns.values[index]
  100. interest_group_centroids.append(centroids)
  101. #print('The Centroids are = ', interest_group_centroids, '\n\n')
  102. #For example: The Centroids are = ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
  103. print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
  104. start_time = time.time()
  105. '# As a first step we normalize the user vectors to unit vectors.'
  106. magnitude = np.sqrt(np.square(data_items).sum(axis=1))
  107. data_items = data_items.divide(magnitude, axis='index')
  108. '#print(data_items.head(5))'
  109. def calculate_similarity(data_items):
  110. data_sparse = sparse.csr_matrix(data_items)
  111. similarities = cosine_similarity(data_sparse.transpose())
  112. '#print(similarities)'
  113. sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
  114. return sim
  115. '# Build the similarity matrix'
  116. data_matrix = calculate_similarity(data_items)
  117. '#print(data_matrix.head())'
  118. end_time = time.time()
  119. print("the similarity computation time is--- %s seconds ---" % (end_time - start_time))
  120. #''prints the item-item similarity matrix for all items in DATASET, for example:'
  121. # Acoustic Blues ... West Coast
  122. # Acoustic Blues 1.000000 ... 0.000000
  123. # African Music 0.044191 ... 0.005636
  124. # Alternative 0.008042 ... 0.028171
  125. # Alternative Country 0.037340 ... 0.011230
  126. # Asian Music 0.000000 ... 0.004623
  127. print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
  128. for i in interest_group_centroids:
  129. Interest_group = data_matrix.loc[i].nlargest(p).index.values
  130. print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
  131. interest_groups.append(Interest_group)
  132. sim_clusuters = len(set(interest_groups[1]).intersection(interest_groups[2])) / len(set(interest_groups[1]).union(interest_groups[2]))
  133. print(sim_clusuters)
  134. print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
  135. start_time = time.time()
  136. d = apriori(df, min_support=0.8, use_colnames=True, max_len=2)
  137. print((d['itemsets']))
  138. print("--- %s seconds ---" % (time.time() - start_time))
  139. print('#############################################USERS & THEIR LIKES###########################################\n\n')
  140. user = [2222] # The id of the user for whom we want to generate recommendations
  141. user_index = data[data.user == user].index.tolist()[0] # Get the frame index
  142. #print('user index is: ', user_index)'
  143. known_user_likes = data_items.ix[user_index]
  144. known_user_likes = known_user_likes[known_user_likes > 0].index.values
  145. print('user', user_index, 'likes', known_user_likes, '\n')
  146. groups = random.sample(data.user.tolist(), 20)
  147. print(groups)
  148. user2 = groups # The id of the user for whom we want to generate recommendations
  149. left = []
  150. right = []
  151. R = []
  152. precision_y = []
  153. recall_x = []
  154. for i in user2:
  155. user_index = data[data.user == i].index.tolist()[0] # Get the frame index
  156. # print('user index is: ', user_index)'
  157. known_user_likes = data_items.ix[user_index]
  158. known_user_likes = known_user_likes[known_user_likes > 0].index.values
  159. print('user', user_index, 'likes', known_user_likes, '\n')
  160. for i in range(0, len(d['itemsets'])):
  161. f_s = d['itemsets'][i]
  162. # print('Recommendation', i, 'is: ', f_s
  163. LHS = f_s
  164. RHS = f_s
  165. l, *_ = LHS
  166. *_, r = RHS
  167. # print(l)
  168. left.append(l)
  169. right.append(r)
  170. # for index in range(1, len(Selected_users_association_IG)):
  171. # if l in set(Selected_users_association_IG[index]):
  172. # print(l,'exist')# LHS in user and if LHS present recommend
  173. if l in set(known_user_likes):
  174. print('user', user_index, 'gets recommendation:', r)
  175. R.append(r)
  176. precision = len(set(known_user_likes).intersection(set(RHS))) / len(set(RHS))
  177. Recall = len(set(known_user_likes).intersection(set(RHS))) / len(known_user_likes)
  178. print('precision of user:', user_index, 'is', precision)
  179. print('Recall of user:', user_index, 'is', Recall)
  180. precision_y.append(precision)
  181. recall_x.append(Recall)
  182. print(precision_y)
  183. print(recall_x)
  184. """
  185. Yana = []
  186. plt.plot(x, Y40)
  187. plt.xlabel('Recall')
  188. plt.ylabel('Precision')
  189. plt.yscale('linear')
  190. plt.grid(False)
  191. plt.show()
  192. """
  193. print('#############################################Accuracy plot###########################################\n\n')
  194. x_new = np.asarray([0.2, 0.4, 0.6, 0.8, 1.0])
  195. """
  196. fig = plt.figure()
  197. ax = plt.subplot(111)
  198. xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
  199. spl = make_interp_spline(x_new, precision, k=2)#BSpline object
  200. #spl1 = make_interp_spline(x_new, YANA_doctor, k=3)#BSpline object
  201. power_smooth = spl(xnew)
  202. #power_smooth1 = spl1(xnew)
  203. plt.xlabel('Recall')
  204. plt.ylabel('Precision')
  205. #blue_patch = mpatches.Patch(color='blue', label='Proposed')
  206. #plt.legend(handles=[blue_patch])
  207. #red_patch = mpatches.Patch(color='red', label='YANA')
  208. #plt.legend(handles=[red_patch])
  209. ax.plot(xnew, power_smooth, 'c--', label='K = 40')
  210. #ax.plot(xnew, power_smooth1,label='YANA')
  211. ax.legend()
  212. plt.title('Deezer')
  213. plt.show()
  214. #Similarity = 1 - (len(set(y).intersection(set(Y40))) / len(set(y).union(set(Y40)))) # measures similarity between sets
  215. #print(Similarity)
  216. """
  217. print('#############################################deezer group plot###########################################\n\n')
  218. fig = plt.figure()
  219. ax = plt.subplot(111)
  220. xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
  221. #spl = make_interp_spline(x_new, precision, k=2)#BSpline object
  222. #spl1 = make_interp_spline(x_new, precision, k=2)#BSpline object
  223. #spl2 = make_interp_spline(x_new, precision, k=2)#BSpline object
  224. spl3 = make_interp_spline(x_new, precision, k=2)#BSpline object
  225. #power_smooth = spl(xnew)
  226. #power_smooth1 = spl1(xnew)#
  227. #power_smooth2 = spl2(xnew)
  228. power_smooth3 = spl3(xnew)
  229. plt.xlabel('Recall')
  230. plt.ylabel('Precision')
  231. #blue_patch = mpatches.Patch(color='blue', label='Proposed')
  232. #plt.legend(handles=[blue_patch])
  233. #red_patch = mpatches.Patch(color='red', label='YANA')
  234. #plt.legend(handles=[red_patch])
  235. #ax.plot(xnew, power_smooth, 'b--', label='K=10')
  236. #ax.plot(xnew, power_smooth1, 'm--', label='K=20')
  237. #ax.plot(xnew, power_smooth2, 'g--', label='K=30')
  238. ax.plot(xnew, power_smooth3, 'c--', label='K=40')
  239. ax.legend()
  240. plt.title('Deezer')
  241. plt.show()
  242. """
  243. print('#############################################Similarity plot###########################################\n\n')
  244. x_new1 = np.asarray([50, 80, 150, 200])
  245. xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
  246. Sim_time = [0.2, 0.4, 0.7, 0.95]
  247. spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
  248. power_smooth2 = spl(xnew1)
  249. plt.title('Computation cost of similarity calculation')
  250. plt.xlabel('Items')
  251. plt.ylabel('Time (in seconds)')
  252. plt.plot(xnew1, power_smooth2)
  253. plt.show()
  254. """
  255. print('#############################################Recommendation plot###########################################\n\n')
  256. """
  257. x_new1 = np.asarray([50, 80, 150, 200])
  258. xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
  259. Sim_time = [0.17, 0.30, 0.53, 0.71]
  260. spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
  261. power_smooth2 = spl(xnew1)
  262. plt.title('Computation cost of recommendation generation')
  263. plt.xlabel('Number of items')
  264. plt.ylabel('Time (in seconds)')
  265. plt.plot(xnew1, power_smooth2)
  266. plt.show()
  267. """
  268. print('#############################################comparision rec_sim###########################################\n\n')
  269. """
  270. x_new1 = np.asarray([50, 80, 150, 200])
  271. xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
  272. Sim_time = [0.17, 0.30, 0.53, 0.71]
  273. spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
  274. #spl1 = make_interp_spline(x_new, , k=3)#BSpline object
  275. power_smooth2 = spl(xnew1)
  276. plt.title('Computation cost of recommendation generation')
  277. plt.xlabel('Number of items')
  278. plt.ylabel('Time (in seconds)')
  279. plt.plot(xnew1, power_smooth2)
  280. plt.show()
  281. total_time = time.time() - start
  282. print(total_time)
  283. """
  284. """
  285. x_new1 = np.asarray([2, 3, 4, 5])
  286. xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
  287. Sim_cluster = [0.6, 0.3, 0.29, 0.32]
  288. spl = make_interp_spline(x_new1, Sim_cluster, k=3)#BSpline object
  289. power_smooth2 = spl(xnew1)
  290. plt.title('Interest group cluster analysis')
  291. plt.xlabel('Interest groups k')
  292. plt.ylabel('Similarity')
  293. plt.plot(xnew1, power_smooth2)
  294. plt.show()
  295. """
  296. print('#############################################comparision with yana###########################################\n\n')
  297. fig = plt.figure()
  298. ax = plt.subplot(111)
  299. xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
  300. spl = make_interp_spline(x_new, precision, k=2)#BSpline object
  301. spl1 = make_interp_spline(x_new, YANA_TvEnt, k=2)#BSpline object
  302. power_smooth = spl(xnew)
  303. power_smooth1 = spl1(xnew)
  304. plt.xlabel('Recall')
  305. plt.ylabel('Precision')
  306. #blue_patch = mpatches.Patch(color='blue', label='Proposed')
  307. #plt.legend(handles=[blue_patch])
  308. #red_patch = mpatches.Patch(color='red', label='YANA')
  309. #plt.legend(handles=[red_patch])
  310. ax.plot(xnew, power_smooth, 'b--', label='Deezer')
  311. ax.plot(xnew, power_smooth1, 'r--', label='Yana')
  312. ax.legend()
  313. plt.title('TvEnt')
  314. plt.show()