# test5_word2vec.py

import pandas as pd
import numpy as np
from pprint import pprint
from collections import Counter
#import re
#from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
#from sklearn.preprocessing import OneHotEncoder
interest_group_centroids = []  # cluster centroids on which the interest groups are formed
interest_groups = []  # most similar items for each centroid in the interest group
data = pd.read_csv('lastfm.csv')  # Reading the CSV file
#print(data)
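# The frame is expected to hold one row per user: a 'user' id column plus one
# column per artist, with a positive value wherever that user listens to the
# artist (the `> 0` check further down relies on this).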
# Create a new dataframe without the user ids.
data_items = data.drop(columns='user')  # drop the user column for item-item similarity calculation
print('Dimension of loaded data is:', np.ndim(data_items))  # dimension of the loaded data
items_len = len(data_items.columns)  # number of items in the dataset
length = []  # stores the column indices of the centroids
print('\n\n#########################################CENTROIDS#####################################################\n\n')
p = (items_len - 1) // 5
r = p
length.append(p)
for index in range(0, 4):
    items_len = int(round(r + p))
    r = items_len
    length.append(items_len)
print(length)
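# For example (hypothetical sizes, for illustration only): with 101 item
# columns, p = 20 and length = [20, 40, 60, 80, 100], i.e. five evenly
# spaced column positions that will serve as the interest-group centroids.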
for index in length:
    centroids = data_items.columns.values[index]
    interest_group_centroids.append(centroids)
print('The Centroids are = ', interest_group_centroids, '\n\n')
# ############SIMILARITY#################
# As a first step we normalize the user vectors to unit vectors.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))
data_items = data_items.divide(magnitude, axis='index')
#print(data_items.head(5))
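# Each user row now has unit length, so heavy listeners do not dominate the
# item-item similarities computed next (cosine_similarity itself additionally
# normalizes the item columns when comparing them).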
def calculate_similarity(data_items):
    """Calculate the column-wise cosine similarity for a sparse
    matrix and return a new dataframe matrix with similarities."""
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    #print(similarities)
    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
    return sim
# Build the similarity matrix.
data_matrix = calculate_similarity(data_items)
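# A minimal sketch of what calculate_similarity returns, on a hypothetical
# 2-user x 2-artist frame (illustrative only, not part of the pipeline):
#   toy = pd.DataFrame({'a': [1, 1], 'b': [0, 1]})
#   print(calculate_similarity(toy))
#   #           a         b
#   # a  1.000000  0.707107
#   # b  0.707107  1.000000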
# For each centroid, take its top-p most similar artists as an interest group.
print('##############INTEREST GROUPS##################\n\n')
for i in interest_group_centroids:
    interest_group = data_matrix.loc[i].nlargest(p).index.values
    print('Interest group', interest_group_centroids.index(i), ' = ', interest_group, '\n')
    interest_groups.append(interest_group)
#print(interest_groups)
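# Note: data_matrix.loc[i] includes the centroid itself with self-similarity
# 1.0, so each interest group consists of the centroid plus its p - 1 nearest
# neighbours by cosine similarity.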
print('###############USERS###################\n\n')
user = 19695  # the id of the user for whom we want to generate recommendations
user_index = data[data.user == user].index.tolist()[0]  # get the frame index
#print('user index is: ', user_index)
known_user_likes = data_items.iloc[user_index]  # .ix was removed from pandas; .iloc does the positional lookup
known_user_likes = known_user_likes[known_user_likes > 0].index.values
print('user', user_index, 'likes', known_user_likes, '\n')
print('###############USERS ASSOCIATION###################\n\n')
for i in interest_groups:
    a_vals = Counter(i)
    b_vals = Counter(known_user_likes)
    # convert to word vectors over the union of both vocabularies
    words = list(a_vals.keys() | b_vals.keys())
    a_vect = [a_vals.get(word, 0) for word in words]
    b_vect = [b_vals.get(word, 0) for word in words]
    # find the cosine similarity between the two count vectors
    len_a = sum(av * av for av in a_vect) ** 0.5
    len_b = sum(bv * bv for bv in b_vect) ** 0.5
    dot = sum(av * bv for av, bv in zip(a_vect, b_vect))
    cosine = dot / (len_a * len_b)
    if cosine != 0:
        print('User:', user_index, 'is associated to the Interest group with similarity:', cosine)
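# A minimal sketch of the same cosine computation using NumPy instead of the
# hand-rolled sums above (illustrative only; assumes the a_vect/b_vect lists
# built inside the loop):
#   a = np.array(a_vect, dtype=float)
#   b = np.array(b_vect, dtype=float)
#   cosine = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))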
# ---------------------------------------------------------------------------
# Exploratory alternatives, left commented out.
#
# Jaccard/Hamming-based item similarity (requires the pairwise_distances
# import at the top of the file):
# def jaccard_similarity(df):
#     """Calculate the column-wise Jaccard similarity for a binary
#     matrix. Return a new dataframe matrix with similarities."""
#     similarities = 1 - pairwise_distances(df.T, metric='hamming')
#     sim = pd.DataFrame(data=similarities, index=df.columns, columns=df.columns)
#     return sim
#
# data_matrix2 = jaccard_similarity(data_items)
# print(data_matrix2)
# #print(data_matrix.loc['aerosmith'].nlargest(6))
#
# k-means clustering of the similarity matrix (requires sklearn.cluster.KMeans):
# kmeans = KMeans(n_clusters=2, n_init=20)
# kmeans.fit(data_matrix)
# # We look at the clusters generated by k-means.
# common_words = kmeans.cluster_centers_[:1]
# print(common_words)
# #for num, centroid in enumerate(common_words):
# #    print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))