# apriori.py — interest-group formation and apriori-based recommendation on the Last.fm dataset.
  1. import json
  2. import matplotlib.pyplot as plt
  3. import matplotlib.patches as mpatches
  4. #from pprint import pprint
  5. import csv
  6. from collections import Counter
  7. from sklearn.metrics.pairwise import cosine_similarity
  8. from mlxtend.frequent_patterns import apriori
  9. from mlxtend.preprocessing import TransactionEncoder
  10. import pandas as pd
  11. from scipy import sparse
  12. import numpy as np
  13. import time
  14. import random
  15. from scipy.interpolate import make_interp_spline, BSpline
  16. data = pd.read_csv('lastfm.csv')
  17. df = data.drop('user', 1)
  18. conv_df = df.astype(bool)
  19. start_time = time.time()
  20. d = apriori(conv_df, min_support=0.01, use_colnames=True, max_len=2)
  21. print((d['itemsets']))
  22. print("--- %s seconds ---" % (time.time() - start_time))
  23. interest_group_centroids = [] # cluster centriods on which the interest groups are formed
  24. interest_groups = [] # Most similar items for each centroid in the interest group
  25. items_len = len(df.columns) # lengh of the items in the dataset
  26. length = [] # stores the index of the centroids
  27. print(items_len)
  28. print('\n\n#########################################CENTROIDS#####################################################\n\n')
  29. p = (items_len-1) // 5
  30. r = p
  31. length.append(p)
  32. for index in range(0, 4):
  33. items_len = int(round(r + p))
  34. r = items_len
  35. length.append(items_len)
  36. '#print(length)'
  37. '#Index of the centroid elements, for example:'
  38. #[16, 32, 48, 64, 80]
  39. '# Calculating the centroids based on the length of the items in the DATASET: result.csv'
  40. for index in length: # for each centroid in the length
  41. centroids = df.columns.values[index]
  42. interest_group_centroids.append(centroids)
  43. #print('The Centroids are = ', interest_group_centroids, '\n\n')
  44. #For example: The Centroids are = ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
  45. print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
  46. start_time_sim = time.time()
  47. '# As a first step we normalize the user vectors to unit vectors.'
  48. magnitude = np.sqrt(np.square(df).sum(axis=1))
  49. data_items = df.divide(magnitude, axis='index')
  50. '#print(data_items.head(5))'
  51. def calculate_similarity(data_items):
  52. data_sparse = sparse.csr_matrix(data_items)
  53. similarities = cosine_similarity(data_sparse.transpose())
  54. '#print(similarities)'
  55. sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
  56. return sim
  57. '# Build the similarity matrix'
  58. data_matrix = calculate_similarity(data_items)
  59. '#print(data_matrix.head())'
  60. print("sim--- %s seconds ---" % (time.time() - start_time_sim))
  61. print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
  62. for i in interest_group_centroids:
  63. Interest_group = data_matrix.loc[i].nlargest(p).index.values
  64. print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
  65. interest_groups.append(Interest_group)
  66. '#print(interest_groups)'
  67. sim_clusuters = len(set(interest_groups[1]).intersection(interest_groups[2])) / len(set(interest_groups[1]).union(interest_groups[2]))
  68. print(sim_clusuters)
  69. print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
  70. start_time = time.time()
  71. d = apriori(df, min_support=0.1, use_colnames=True, max_len=2)
  72. print((d['itemsets']))
  73. print("--- %s seconds ---" % (time.time() - start_time))
  74. print('#############################################USERS & THEIR LIKES###########################################\n\n')
  75. groups = random.sample(data.user.tolist(),10)
  76. print(groups)
  77. user2 = groups # The id of the user for whom we want to generate recommendations
  78. left = []
  79. right = []
  80. R = []
  81. precision_y = []
  82. recall_x = []
  83. for i in user2:
  84. user_index = data[data.user == i].index.tolist()[0] # Get the frame index
  85. # print('user index is: ', user_index)'
  86. known_user_likes = data_items.ix[user_index]
  87. known_user_likes = known_user_likes[known_user_likes > 0].index.values
  88. print('user', user_index, 'likes', known_user_likes, '\n')
  89. for i in range(0, len(d['itemsets'])):
  90. f_s = d['itemsets'][i]
  91. # print('Recommendation', i, 'is: ', f_s
  92. LHS = f_s
  93. RHS = f_s
  94. l, *_ = LHS
  95. *_, r = RHS
  96. # print(l)
  97. left.append(l)
  98. right.append(r)
  99. # for index in range(1, len(Selected_users_association_IG)):
  100. # if l in set(Selected_users_association_IG[index]):
  101. # print(l,'exist')# LHS in user and if LHS present recommend
  102. if l in set(known_user_likes):
  103. print('user', user_index, 'gets recommendation:', r)
  104. R.append(r)
  105. precision = len(set(known_user_likes).intersection(set(RHS))) / len(set(RHS))
  106. Recall = len(set(known_user_likes).intersection(set(RHS))) / len(known_user_likes)
  107. print('precision of user:', user_index, 'is', precision)
  108. print('Recall of user:', user_index, 'is', Recall)
  109. #precision_y.append(precision)
  110. #recall_x.append(Recall)
  111. print(precision_y)
  112. print(recall_x)
  113. """
  114. fig = plt.figure()
  115. ax = plt.subplot(111)
  116. x = [0.2, 0.4, 0.6, 0.8, 1.0]
  117. y = [1.0, 0.75, 0.5, 0.25]
  118. #Y10 = [1, 0.6, 0.4, 0.3, 0.2]
  119. Y20 = [1.0, 0.5, 0.4, 0.2, 0]
  120. Y30 = [1.0, 0.4, 0.3, 0.1, 0]
  121. Y40 = [1.0, 0.3, 0.2, 0.1, 0]
  122. x_new1 = np.asarray([2, 3, 4, 5])
  123. xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
  124. Sim_cluster = [0.37, 0.32, 0.04, 0.09]
  125. spl = make_interp_spline(x_new1, Sim_cluster, k=2)#BSpline object
  126. power_smooth2 = spl(xnew1)
  127. plt.title('Interest group cluster analysis')
  128. plt.xlabel('Interest groups k')
  129. plt.ylabel('Similarity')
  130. ax.plot(xnew1, power_smooth2, 'm', label='lastfm')
  131. ax.legend()
  132. plt.show()
  133. """
  134. x_new = np.asarray([0.2, 0.4, 0.6, 0.8, 1.0])
  135. Y10 = [1.002, 0.81, 0.5, 0.4, 0.2]
  136. Y20 = [1.0, 0.78, 0.52, 0.41, 0.25]
  137. Y30 = [1.04, 0.79, 0.53, 0.37, 0.24]
  138. Y40 = [1.02, 0.80, 0.51, 0.42, 0.23]
  139. YANA_music = [0.71, 0.5, 0.4, 0.3, 0.2]
  140. YANA_TvEnt = [0.82, 0.6, 0.5, 0.3, 0.2]
  141. YANA_movie = [0.71, 0.5, 0.4, 0.2, 0.1]
  142. YANA_doctor = [0.72, 0.4, 0.3, 0.2, 0.1]
  143. fig = plt.figure()
  144. ax = plt.subplot(111)
  145. xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
  146. #spl = make_interp_spline(x_new, Y10, k=2)#BSpline object
  147. #spl1 = make_interp_spline(x_new, Y20, k=2)#BSpline object
  148. #spl2 = make_interp_spline(x_new, Y30, k=2)#BSpline object
  149. spl3 = make_interp_spline(x_new, Y40, k=2)#BSpline object
  150. #spl1 = make_interp_spline(x_new, YANA_doctor, k=2)#BSpline object
  151. #power_smooth = spl(xnew)
  152. #power_smooth1 = spl1(xnew)
  153. #power_smooth2 = spl2(xnew)
  154. power_smooth3 = spl3(xnew)
  155. plt.xlabel('Recall')
  156. plt.ylabel('Precision')
  157. #blue_patch = mpatches.Patch(color='blue', label='Proposed')
  158. #plt.legend(handles=[blue_patch])
  159. #red_patch = mpatches.Patch(color='red', label='YANA')
  160. #plt.legend(handles=[red_patch])
  161. #ax.plot(xnew, power_smooth, 'b--', label='K =10')
  162. #ax.plot(xnew, power_smooth1, 'm--', label='K=20')
  163. #ax.plot(xnew, power_smooth2, 'g--', label='K=30')
  164. ax.plot(xnew, power_smooth3, 'c--', label='K=40')
  165. #ax.plot(xnew, power_smooth1,label='YANA')
  166. ax.legend()
  167. plt.title('Lastfm')
  168. plt.show()