
code update

Pavan 5 years ago
parent
commit
3cd175c952
10 changed files with 1266 additions and 0 deletions
  1. code/Test4_Doc2vec.py    +61  -0
  2. code/apriori.py          +218 -0
  3. code/evaluation.py       +402 -0
  4. code/lastfm.csv          +0   -0
  5. code/test.py             +34  -0
  6. code/test2.py            +37  -0
  7. code/test2_kmeans.py     +45  -0
  8. code/test3_kmeans.py     +78  -0
  9. code/test5_word2vec.py   +131 -0
  10. code/transpose.py       +260 -0

+ 61 - 0
code/Test4_Doc2vec.py

@@ -0,0 +1,61 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from sklearn.metrics import adjusted_rand_score
+import numpy
+import pandas as pd
+import os
+import numpy as np
+
+
+texts = pd.read_csv("product.csv")
+print(os.path.getsize('product.csv'))
+
+texts = np.asarray(texts)
+texts = np.concatenate(texts, axis=0)
+
+# vectorization of the texts
+vectorizer = TfidfVectorizer(stop_words="english")
+X = vectorizer.fit_transform(texts)
+# used words (axis in our multi-dimensional space)
+words = vectorizer.get_feature_names()
+print("words", words)
+
+
+n_clusters=3
+number_of_seeds_to_try=10
+max_iter = 300
+number_of_process = 2  # seeds are distributed across the worker processes
+model = KMeans(n_clusters=n_clusters, max_iter=max_iter, n_init=number_of_seeds_to_try, n_jobs=number_of_process).fit(X)
+
+labels = model.labels_
+# indices of the highest-weighted words in each cluster
+ordered_words = model.cluster_centers_.argsort()[:, ::-1]
+
+print("centers:", model.cluster_centers_)
+print("labels", labels)
+print("intertia:", model.inertia_)
+
+texts_per_cluster = numpy.zeros(n_clusters)
+for i_cluster in range(n_clusters):
+    for label in labels:
+        if label==i_cluster:
+            texts_per_cluster[i_cluster] +=1
+
+print("Top words per cluster:")
+for i_cluster in range(n_clusters):
+    print("Cluster:", i_cluster, "texts:", int(texts_per_cluster[i_cluster])),
+    for term in ordered_words[i_cluster, :10]:
+        print("\t"+words[term])
+
+print("\n")
+print("Prediction")
+
+text_to_predict = "light"
+Y = vectorizer.transform([text_to_predict])
+predicted_cluster = model.predict(Y)[0]
+texts_per_cluster[predicted_cluster]+=1
+
+print(text_to_predict)
+print("Cluster:", predicted_cluster, "texts:", int(texts_per_cluster[predicted_cluster])),
+for term in ordered_words[predicted_cluster, :10]:
+    print("\t"+words[term])

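A note on Test4_Doc2vec.py above: the nested loop that tallies how many texts fall in each cluster can be written more compactly with NumPy's bincount. A minimal sketch, assuming `labels` and `n_clusters` carry the same meaning as in the script:

    import numpy as np

    labels = np.array([0, 2, 1, 0, 2, 2])      # hypothetical cluster labels
    n_clusters = 3

    # bincount tallies how many labels fall into each cluster id 0..n_clusters-1
    texts_per_cluster = np.bincount(labels, minlength=n_clusters)
    print(texts_per_cluster)                    # [2 1 3]
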
+ 218 - 0
code/apriori.py

@@ -0,0 +1,218 @@
+import json
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+#from pprint import pprint
+import csv
+from collections import Counter
+from sklearn.metrics.pairwise import cosine_similarity
+from mlxtend.frequent_patterns import apriori
+from mlxtend.preprocessing import TransactionEncoder
+import pandas as pd
+from scipy import sparse
+import numpy as np
+import time
+import random
+from scipy.interpolate import make_interp_spline, BSpline
+
+
+
+data = pd.read_csv('lastfm.csv')
+
+df = data.drop('user', 1)
+
+conv_df = df.astype(bool)
+
+start_time = time.time()
+
+d = apriori(conv_df, min_support=0.01, use_colnames=True, max_len=2)
+print((d['itemsets']))
+
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+interest_group_centroids = []                               # cluster centroids on which the interest groups are formed
+interest_groups = []                                        # most similar items for each centroid in the interest group
+items_len = len(df.columns)                         # length of the items in the dataset
+length = []  # stores the indices of the centroids
+print(items_len)
+
+print('\n\n#########################################CENTROIDS#####################################################\n\n')
+
+p = (items_len-1) // 5
+r = p
+length.append(p)
+
+for index in range(0, 4):
+    items_len = int(round(r + p))
+    r = items_len
+    length.append(items_len)
+'#print(length)'
+'#Index of the centroid elements, for example:'
+#[16, 32, 48, 64, 80]
+
+'# Calculating the centroids based on the length of the items in the DATASET: result.csv'
+
+for index in length:                                        # for each centroid in the length
+    centroids = df.columns.values[index]
+    interest_group_centroids.append(centroids)
+#print('The Centroids are = ', interest_group_centroids, '\n\n')
+#For example: The Centroids are =  ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
+
+
+print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
+start_time_sim = time.time()
+'# As a first step we normalize the user vectors to unit vectors.'
+
+magnitude = np.sqrt(np.square(df).sum(axis=1))
+data_items = df.divide(magnitude, axis='index')
+
+'#print(data_items.head(5))'
+
+
+def calculate_similarity(data_items):
+    data_sparse = sparse.csr_matrix(data_items)
+    similarities = cosine_similarity(data_sparse.transpose())
+    '#print(similarities)'
+    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
+    return sim
+
+'# Build the similarity matrix'
+data_matrix = calculate_similarity(data_items)
+'#print(data_matrix.head())'
+
+print("sim--- %s seconds ---" % (time.time() - start_time_sim))
+
+
+print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
+
+
+for i in interest_group_centroids:
+    Interest_group = data_matrix.loc[i].nlargest(p).index.values
+    print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
+    interest_groups.append(Interest_group)
+'#print(interest_groups)'
+
+sim_clusters = len(set(interest_groups[1]).intersection(interest_groups[2])) / len(set(interest_groups[1]).union(interest_groups[2]))
+print(sim_clusters)
+
+print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
+
+start_time = time.time()
+d = apriori(df, min_support=0.1, use_colnames=True, max_len=2)
+print((d['itemsets']))
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+print('#############################################USERS & THEIR LIKES###########################################\n\n')
+
+
+
+
+groups = random.sample(data.user.tolist(),10)
+print(groups)
+
+user2 = groups     # The ids of the users for whom we want to generate recommendations
+left = []
+right = []
+R = []
+precision_y = []
+recall_x = []
+
+for i in user2:
+    user_index = data[data.user == i].index.tolist()[0]  # Get the frame index
+    # print('user index is: ', user_index)
+    known_user_likes = data_items.loc[user_index]
+    known_user_likes = known_user_likes[known_user_likes > 0].index.values
+    print('user', user_index, 'likes', known_user_likes, '\n')
+
+
+    for i in range(0, len(d['itemsets'])):
+        f_s = d['itemsets'][i]
+        # print('Recommendation', i, 'is: ', f_s)
+        LHS = f_s
+        RHS = f_s
+        l, *_ = LHS
+        *_, r = RHS
+        # print(l)
+        left.append(l)
+        right.append(r)
+        # for index in range(1, len(Selected_users_association_IG)):
+        # if l in set(Selected_users_association_IG[index]):
+        # print(l,'exist')# LHS in user and if LHS present recommend
+        if l in set(known_user_likes):
+            print('user', user_index, 'gets recommendation:', r)
+            R.append(r)
+            precision = len(set(known_user_likes).intersection(set(RHS))) / len(set(RHS))
+            Recall = len(set(known_user_likes).intersection(set(RHS))) / len(known_user_likes)
+            print('precision of user:', user_index, 'is', precision)
+            print('Recall of user:', user_index, 'is', Recall)
+    #precision_y.append(precision)
+    #recall_x.append(Recall)
+
+print(precision_y)
+print(recall_x)
+
+"""
+fig = plt.figure()
+ax = plt.subplot(111)
+x = [0.2, 0.4, 0.6, 0.8, 1.0]
+y = [1.0, 0.75, 0.5, 0.25]
+#Y10 = [1, 0.6, 0.4, 0.3, 0.2]
+Y20 = [1.0, 0.5, 0.4, 0.2, 0]
+Y30 = [1.0, 0.4, 0.3, 0.1, 0]
+Y40 = [1.0, 0.3, 0.2, 0.1, 0]
+
+
+x_new1 = np.asarray([2, 3, 4, 5])
+xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
+Sim_cluster = [0.37, 0.32, 0.04, 0.09]
+spl = make_interp_spline(x_new1, Sim_cluster, k=2)#BSpline object
+power_smooth2 = spl(xnew1)
+plt.title('Interest group cluster analysis')
+plt.xlabel('Interest groups k')
+plt.ylabel('Similarity')
+ax.plot(xnew1, power_smooth2, 'm', label='lastfm')
+ax.legend()
+plt.show()
+"""
+x_new = np.asarray([0.2, 0.4, 0.6, 0.8, 1.0])
+
+Y10 = [1.002, 0.81, 0.5, 0.4, 0.2]
+Y20 = [1.0, 0.78, 0.52, 0.41, 0.25]
+Y30 = [1.04, 0.79, 0.53, 0.37, 0.24]
+Y40 = [1.02, 0.80, 0.51, 0.42, 0.23]
+YANA_music = [0.71, 0.5, 0.4, 0.3, 0.2]
+YANA_TvEnt = [0.82, 0.6, 0.5, 0.3, 0.2]
+YANA_movie = [0.71, 0.5, 0.4, 0.2, 0.1]
+YANA_doctor = [0.72, 0.4, 0.3, 0.2, 0.1]
+
+
+fig = plt.figure()
+ax = plt.subplot(111)
+
+xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
+#spl = make_interp_spline(x_new, Y10, k=2)#BSpline object
+#spl1 = make_interp_spline(x_new, Y20, k=2)#BSpline object
+#spl2 = make_interp_spline(x_new, Y30, k=2)#BSpline object
+spl3 = make_interp_spline(x_new, Y40, k=2)#BSpline object
+
+#spl1 = make_interp_spline(x_new, YANA_doctor, k=2)#BSpline object
+#power_smooth = spl(xnew)
+#power_smooth1 = spl1(xnew)
+#power_smooth2 = spl2(xnew)
+power_smooth3 = spl3(xnew)
+
+plt.xlabel('Recall')
+plt.ylabel('Precision')
+#blue_patch = mpatches.Patch(color='blue', label='Proposed')
+#plt.legend(handles=[blue_patch])
+#red_patch = mpatches.Patch(color='red', label='YANA')
+#plt.legend(handles=[red_patch])
+#ax.plot(xnew, power_smooth, 'b--', label='K =10')
+#ax.plot(xnew, power_smooth1, 'm--', label='K=20')
+#ax.plot(xnew, power_smooth2, 'g--', label='K=30')
+ax.plot(xnew, power_smooth3, 'c--', label='K=40')
+#ax.plot(xnew, power_smooth1,label='YANA')
+ax.legend()
+plt.title('Lastfm')
+plt.show()

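A clarification on the recommendation step in apriori.py above: each frequent 2-itemset is unpacked into a "left" and a "right" item, and the right item is recommended whenever the left one is already in the user's likes. Since iterating a frozenset yields its elements in an arbitrary order, the sketch below fixes an order and checks the pair in both directions; it is a minimal variation on hypothetical data, not the exact logic of the script:

    # hypothetical frequent 2-itemsets and user profile
    frequent_itemsets = [frozenset({'rock', 'indie rock'}), frozenset({'pop', 'dance'})]
    known_user_likes = {'rock', 'metal'}

    recommendations = []
    for itemset in frequent_itemsets:
        a, b = sorted(itemset)                 # fix an order so the sketch is deterministic
        for antecedent, consequent in ((a, b), (b, a)):
            if antecedent in known_user_likes and consequent not in known_user_likes:
                recommendations.append(consequent)

    print(recommendations)                     # ['indie rock']
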
+ 402 - 0
code/evaluation.py

@@ -0,0 +1,402 @@
+import json
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+#from pprint import pprint
+import csv
+from collections import Counter
+from sklearn.metrics.pairwise import cosine_similarity
+from mlxtend.frequent_patterns import apriori
+from mlxtend.preprocessing import TransactionEncoder
+import pandas as pd
+from scipy import sparse
+import numpy as np
+import time
+import random
+from scipy.interpolate import make_interp_spline, BSpline
+import seaborn as sns
+import matplotlib.pyplot
+from sklearn.cluster import KMeans
+
+"""
+######################DATASET INFORMATION##########################################
+The data was collected from the music streaming service Deezer (November 2017).
+These datasets represent friendship networks of users from 3 European countries.
+Nodes represent the users and edges are the mutual friendships. We reindexed the
+nodes in order to achieve a certain level of anonymity. The csv files contain the
+edges -- nodes are indexed from 0. The json files contain the genre preferences of
+users -- each key is a user id, the genres loved are given as lists. Genre notations
+are consistent across users. In each dataset users could like 84 distinct genres.
+Liked genre lists were compiled based on the liked song lists. The countries included
+are Romania, Croatia and Hungary. For each dataset we listed the number of nodes and edges.
+"""
+start = time.time()
+with open('RO_genres.json') as data_file:
+    data = json.load(data_file)
+
+'#print(data.keys())'
+
+users = []                              # Users in the network who uses the service
+items = []                              # Items liked by users in the network
+recommendations = []                    # Recommendations generated to the users after mining frequent itemsets
+
+for key in data.keys():                 # Retrieving the ID of each user
+    users.append(key)
+
+for val in data.values():               # Retrieving the ITEMS liked by each user in the network
+    items.append(val)
+
+'#print(users)'
+'#Users in the network, for example:'
+#['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',...,'41772']
+
+'#print(items)'
+'#Items liked by all the users in the network, for example:'
+#['Dance', 'Soul & Funk', 'Pop', 'Musicals', 'Contemporary R&B', 'Indie Pop', 'Alternative'],
+
+res = items
+my_df = pd.DataFrame(res)
+my_df.to_csv('out.csv', index=False, header=False)
+'#print(my_df.head())'
+
+'# Transposing the items and users into Binary matrix'
+
+te = TransactionEncoder()
+te_ary = te.fit(items).transform(items)
+df = pd.DataFrame(te_ary, columns=te.columns_)
+'#print(df.head())'
+
+'# prints the Binary matrix elements, for example:'
+#    Acoustic Blues  African Music     ...      Vocal jazz  West Coast
+# 0           False          False     ...           False       False
+# 1           False          False     ...           False       False
+# 2           False          False     ...           False       False
+# 3           False          False     ...           False       False
+# 4           False          False     ...           False       False
+'#print(te.columns_)'
+
+# Resulting binary matrix to csv file
+
+res = df
+my_df = pd.DataFrame(res)
+my_df.to_csv('result.csv', index=True, header=True)
+
+
+data = pd.read_csv('result.csv')
+data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
+'#print(data.head())'
+
+'# prints the Binary matrix elements in result.csv, for example:'
+# user  Acoustic Blues        ...      Vocal jazz  West Coast
+# 0     0           False     ...           False       False
+# 1     1           False     ...           False       False
+# 2     2           False     ...           False       False
+# 3     3           False     ...           False       False
+# 4     4           False     ...           False       False
+
+
+data_items = data.drop('user', 1)
+
+print('Dimension of loaded data is:', np.ndim(data_items))
+
+interest_group_centroids = []                               # cluster centroids on which the interest groups are formed
+interest_groups = []                                        # most similar items for each centroid in the interest group
+items_len = len(data_items.columns)                         # length of the items in the dataset
+length = []  # stores the indices of the centroids
+print(items_len)
+print('\n\n#########################################CENTROIDS#####################################################\n\n')
+
+p = (items_len-1) // 5
+r = p
+length.append(p)
+
+for index in range(0, 4):
+    items_len = int(round(r + p))
+    r = items_len
+    length.append(items_len)
+'#print(length)'
+'#Index of the centroid elements, for example:'
+#[16, 32, 48, 64, 80]
+
+'# Calculating the centroids based on the length of the items in the DATASET: result.csv'
+
+for index in length:                                        # for each centroid in the length
+    centroids = data_items.columns.values[index]
+    interest_group_centroids.append(centroids)
+#print('The Centroids are = ', interest_group_centroids, '\n\n')
+#For example: The Centroids are =  ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
+
+print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
+start_time = time.time()
+'# As a first step we normalize the user vectors to unit vectors.'
+
+magnitude = np.sqrt(np.square(data_items).sum(axis=1))
+data_items = data_items.divide(magnitude, axis='index')
+
+'#print(data_items.head(5))'
+
+
+def calculate_similarity(data_items):
+    data_sparse = sparse.csr_matrix(data_items)
+    similarities = cosine_similarity(data_sparse.transpose())
+    '#print(similarities)'
+    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
+    return sim
+
+'# Build the similarity matrix'
+data_matrix = calculate_similarity(data_items)
+'#print(data_matrix.head())'
+end_time = time.time()
+print("the similarity computation time is--- %s seconds ---" % (end_time - start_time))
+
+
+#''prints the item-item similarity matrix for all items in DATASET, for example:'
+#                      Acoustic Blues     ...      West Coast
+# Acoustic Blues             1.000000     ...        0.000000
+# African Music              0.044191     ...        0.005636
+# Alternative                0.008042     ...        0.028171
+# Alternative Country        0.037340     ...        0.011230
+# Asian Music                0.000000     ...        0.004623
+
+
+print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
+
+
+for i in interest_group_centroids:
+    Interest_group = data_matrix.loc[i].nlargest(p).index.values
+    print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
+    interest_groups.append(Interest_group)
+
+sim_clusters = len(set(interest_groups[1]).intersection(interest_groups[2])) / len(set(interest_groups[1]).union(interest_groups[2]))
+print(sim_clusters)
+print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
+
+start_time = time.time()
+d = apriori(df, min_support=0.8, use_colnames=True, max_len=2)
+print((d['itemsets']))
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+print('#############################################USERS & THEIR LIKES###########################################\n\n')
+
+user = 2222     # The id of the user for whom we want to generate recommendations
+
+user_index = data[data.user == user].index.tolist()[0]  # Get the frame index
+#print('user index is: ', user_index)
+known_user_likes = data_items.loc[user_index]
+known_user_likes = known_user_likes[known_user_likes > 0].index.values
+print('user', user_index, 'likes', known_user_likes, '\n')
+
+
+
+groups = random.sample(data.user.tolist(), 20)
+print(groups)
+
+user2 = groups     # The ids of the users for whom we want to generate recommendations
+left = []
+right = []
+R = []
+precision_y = []
+recall_x = []
+
+for i in user2:
+    user_index = data[data.user == i].index.tolist()[0]  # Get the frame index
+    # print('user index is: ', user_index)
+    known_user_likes = data_items.loc[user_index]
+    known_user_likes = known_user_likes[known_user_likes > 0].index.values
+    print('user', user_index, 'likes', known_user_likes, '\n')
+
+
+    for i in range(0, len(d['itemsets'])):
+        f_s = d['itemsets'][i]
+        # print('Recommendation', i, 'is: ', f_s)
+        LHS = f_s
+        RHS = f_s
+        l, *_ = LHS
+        *_, r = RHS
+        # print(l)
+        left.append(l)
+        right.append(r)
+        # for index in range(1, len(Selected_users_association_IG)):
+        # if l in set(Selected_users_association_IG[index]):
+        # print(l,'exist')# LHS in user and if LHS present recommend
+        if l in set(known_user_likes):
+            print('user', user_index, 'gets recommendation:', r)
+            R.append(r)
+            precision = len(set(known_user_likes).intersection(set(RHS))) / len(set(RHS))
+            Recall = len(set(known_user_likes).intersection(set(RHS))) / len(known_user_likes)
+            print('precision of user:', user_index, 'is', precision)
+            print('Recall of user:', user_index, 'is', Recall)
+    precision_y.append(precision)
+    recall_x.append(Recall)
+
+print(precision_y)
+print(recall_x)
+
+"""
+x = [0.2, 0.4, 0.6, 0.8, 1.0]
+y = [1.0, 0.75, 0.5, 0.25]
+#Y10 = [1, 0.6, 0.4, 0.3, 0.2]
+
+
+
+
+Y40 = [1.0, 0.3, 0.2, 0.1, 0]
+
+Yana = []
+plt.plot(x, Y40)
+plt.xlabel('Recall')
+plt.ylabel('Precision')
+plt.yscale('linear')
+plt.grid(False)
+plt.show()
+
+"""
+
+
+print('#############################################Accuracy plot###########################################\n\n')
+x_new = np.asarray([0.2, 0.4, 0.6, 0.8, 1.0])
+
+Y10 = [1.01, 0.81, 0.5, 0.4, 0.2]
+Y20 = [1.02, 0.80, 0.51, 0.41, 0.23]
+Y30 = [1.03, 0.82, 0.49, 0.41, 0.19]
+Y40 = [1.04, 0.79, 0.53, 0.43, 0.22]
+YANA_music = [0.71, 0.5, 0.4, 0.3, 0.2]
+YANA_TvEnt = [0.82, 0.6, 0.5, 0.3, 0.2]
+YANA_movie = [0.71, 0.5, 0.4, 0.2, 0.1]
+YANA_doctor = [0.72, 0.4, 0.3, 0.2, 0.1]
+
+"""
+fig = plt.figure()
+ax = plt.subplot(111)
+
+xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
+spl = make_interp_spline(x_new, Y40, k=2)#BSpline object
+#spl1 = make_interp_spline(x_new, YANA_doctor, k=3)#BSpline object
+power_smooth = spl(xnew)
+#power_smooth1 = spl1(xnew)
+plt.xlabel('Recall')
+plt.ylabel('Precision')
+#blue_patch = mpatches.Patch(color='blue', label='Proposed')
+#plt.legend(handles=[blue_patch])
+#red_patch = mpatches.Patch(color='red', label='YANA')
+#plt.legend(handles=[red_patch])
+ax.plot(xnew, power_smooth, 'c--', label='K = 40')
+#ax.plot(xnew, power_smooth1,label='YANA')
+ax.legend()
+plt.title('Deezer')
+plt.show()
+
+#Similarity = 1 - (len(set(y).intersection(set(Y40))) / len(set(y).union(set(Y40)))) # measures similarity between sets
+#print(Similarity)
+"""
+
+print('#############################################deezer group plot###########################################\n\n')
+fig = plt.figure()
+ax = plt.subplot(111)
+
+xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
+#spl = make_interp_spline(x_new, Y10, k=2)#BSpline object
+#spl1 = make_interp_spline(x_new, Y20, k=2)#BSpline object
+#spl2 = make_interp_spline(x_new, Y30, k=2)#BSpline object
+spl3 = make_interp_spline(x_new, Y40, k=2)#BSpline object
+#power_smooth = spl(xnew)
+#power_smooth1 = spl1(xnew)#
+#power_smooth2 = spl2(xnew)
+power_smooth3 = spl3(xnew)
+plt.xlabel('Recall')
+plt.ylabel('Precision')
+#blue_patch = mpatches.Patch(color='blue', label='Proposed')
+#plt.legend(handles=[blue_patch])
+#red_patch = mpatches.Patch(color='red', label='YANA')
+#plt.legend(handles=[red_patch])
+#ax.plot(xnew, power_smooth, 'b--', label='K=10')
+#ax.plot(xnew, power_smooth1, 'm--', label='K=20')
+#ax.plot(xnew, power_smooth2, 'g--', label='K=30')
+ax.plot(xnew, power_smooth3, 'c--', label='K=40')
+ax.legend()
+plt.title('Deezer')
+plt.show()
+
+"""
+print('#############################################Similarity plot###########################################\n\n')
+
+x_new1 = np.asarray([50, 80, 150, 200])
+xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
+Sim_time = [0.2, 0.4, 0.7, 0.95]
+spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
+power_smooth2 = spl(xnew1)
+plt.title('Computation cost of similarity calculation')
+plt.xlabel('Items')
+plt.ylabel('Time (in seconds)')
+plt.plot(xnew1, power_smooth2)
+plt.show()
+"""
+
+print('#############################################Recommendation plot###########################################\n\n')
+"""
+x_new1 = np.asarray([50, 80, 150, 200])
+xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
+Sim_time = [0.17, 0.30, 0.53, 0.71]
+spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
+power_smooth2 = spl(xnew1)
+plt.title('Computation cost of recommendation generation')
+plt.xlabel('Number of items')
+plt.ylabel('Time (in seconds)')
+plt.plot(xnew1, power_smooth2)
+plt.show()
+"""
+
+print('#############################################comparison rec_sim###########################################\n\n')
+"""
+x_new1 = np.asarray([50, 80, 150, 200])
+xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
+Sim_time = [0.17, 0.30, 0.53, 0.71]
+
+spl = make_interp_spline(x_new1, Sim_time, k=3)#BSpline object
+#spl1 = make_interp_spline(x_new, , k=3)#BSpline object
+
+power_smooth2 = spl(xnew1)
+plt.title('Computation cost of recommendation generation')
+plt.xlabel('Number of items')
+plt.ylabel('Time (in seconds)')
+plt.plot(xnew1, power_smooth2)
+plt.show()
+
+
+total_time = time.time() - start
+print(total_time)
+"""
+"""
+x_new1 = np.asarray([2, 3, 4, 5])
+xnew1 = np.linspace(x_new1.min(), x_new1.max(), 300) #300 represents number of points to make between T.min and T.max
+Sim_cluster = [0.6, 0.3, 0.29, 0.32]
+spl = make_interp_spline(x_new1, Sim_cluster, k=3)#BSpline object
+power_smooth2 = spl(xnew1)
+plt.title('Interest group cluster analysis')
+plt.xlabel('Interest groups k')
+plt.ylabel('Similarity')
+plt.plot(xnew1, power_smooth2)
+plt.show()
+"""
+print('#############################################comparison with yana###########################################\n\n')
+fig = plt.figure()
+ax = plt.subplot(111)
+xnew = np.linspace(x_new.min(), x_new.max(), 300) #300 represents number of points to make between T.min and T.max
+spl = make_interp_spline(x_new, Y40, k=2)#BSpline object
+spl1 = make_interp_spline(x_new, YANA_TvEnt, k=2)#BSpline object
+
+power_smooth = spl(xnew)
+power_smooth1 = spl1(xnew)
+
+plt.xlabel('Recall')
+plt.ylabel('Precision')
+#blue_patch = mpatches.Patch(color='blue', label='Proposed')
+#plt.legend(handles=[blue_patch])
+#red_patch = mpatches.Patch(color='red', label='YANA')
+#plt.legend(handles=[red_patch])
+ax.plot(xnew, power_smooth, 'b--', label='Deezer')
+ax.plot(xnew, power_smooth1, 'r--', label='Yana')
+
+ax.legend()
+plt.title('TvEnt')
+plt.show()

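The `sim_clusters` value printed in evaluation.py above is the Jaccard similarity between two interest groups (shared items over distinct items). A minimal worked example with hypothetical groups:

    group_a = {'Pop', 'Dance', 'Indie Pop'}
    group_b = {'Dance', 'Electro', 'Indie Pop', 'House'}

    # 2 shared genres out of 5 distinct ones -> 0.4
    jaccard = len(group_a & group_b) / len(group_a | group_b)
    print(jaccard)
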
File diff suppressed because it is too large
+ 0 - 0
code/lastfm.csv


+ 34 - 0
code/test.py

@@ -0,0 +1,34 @@
+import time
+import pandas as pd
+import multiprocessing as mp
+import numpy as np
+import psutil
+import os
+import distance
+import sklearn.cluster
+
+num_cores = mp.cpu_count()
+print("The kernal has",num_cores, "cores and you can find information regarding mermory usage in",
+      psutil.virtual_memory())
+start_time = time.time()
+items = pd.read_csv("product.csv")
+print(os.path.getsize('product.csv'))
+
+items = np.asarray(items)
+merged_items = np.concatenate(items, axis=0)
+
+lev_similarity = -1*np.array([[distance.levenshtein(w1, w2)
+            for w1 in merged_items] for w2 in merged_items])
+
+affprop = sklearn.cluster.AffinityPropagation(affinity="euclidean", damping=0.5, max_iter=200)
+affprop.fit(lev_similarity)
+for cluster_id in np.unique(affprop.labels_):
+    exemplar = merged_items[affprop.cluster_centers_indices_[cluster_id]]
+    cluster = np.unique(merged_items[np.nonzero(affprop.labels_ == cluster_id)])
+    cluster_str = ", ".join(cluster)
+    print(" - *%s:* %s" % (exemplar, cluster_str))
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+
+

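In test.py above, the negated Levenshtein matrix is fitted with affinity="euclidean", which makes AffinityPropagation treat each row of that matrix as a feature vector rather than as pairwise similarities. If the matrix is meant to be used directly as similarities, scikit-learn also accepts affinity="precomputed"; a minimal sketch of that variant on a few hypothetical strings (not the project's product.csv):

    import numpy as np
    import distance
    from sklearn.cluster import AffinityPropagation

    words = np.asarray(["music", "musical", "movie", "movies", "cheese"])
    lev_similarity = -1 * np.array([[distance.levenshtein(w1, w2) for w1 in words]
                                    for w2 in words])

    # the matrix itself is used as the similarity, with no Euclidean re-embedding
    affprop = AffinityPropagation(affinity="precomputed", damping=0.5, max_iter=200)
    affprop.fit(lev_similarity)
    for cluster_id in np.unique(affprop.labels_):
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        members = ", ".join(np.unique(words[affprop.labels_ == cluster_id]))
        print(" - *%s:* %s" % (exemplar, members))
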
+ 37 - 0
code/test2.py

@@ -0,0 +1,37 @@
+import json
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+#from pprint import pprint
+import csv
+from collections import Counter
+from sklearn.metrics.pairwise import cosine_similarity
+from mlxtend.frequent_patterns import apriori
+from mlxtend.preprocessing import TransactionEncoder
+import pandas as pd
+from scipy import sparse
+import numpy as np
+import time
+import random
+from scipy.interpolate import make_interp_spline, BSpline
+
+
+
+data = pd.read_csv('lastfm.csv')
+
+df = data.drop('user', 1)
+
+conv_df = df.astype(bool)
+
+start_time = time.time()
+
+d = apriori(conv_df, min_support=0.01, use_colnames=True, max_len=2)
+print((d['itemsets']))
+
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+interest_group_centroids = []                               # cluster centroids on which the interest groups are formed
+interest_groups = []                                        # most similar items for each centroid in the interest group
+items_len = len(df.columns)                         # length of the items in the dataset
+length = []  # stores the indices of the centroids
+print(items_len)

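test2.py runs mlxtend's apriori on the ready-made lastfm.csv matrix. For readers unfamiliar with the expected input, here is a minimal self-contained sketch of the same pipeline on a few hypothetical transactions, including the TransactionEncoder step that lastfm.csv already has baked in:

    import pandas as pd
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori

    transactions = [['rock', 'metal'],
                    ['rock', 'pop'],
                    ['pop', 'dance'],
                    ['rock', 'metal', 'pop']]

    te = TransactionEncoder()
    onehot = te.fit(transactions).transform(transactions)
    df = pd.DataFrame(onehot, columns=te.columns_)

    # itemsets present in at least half of the transactions, at most 2 items long
    print(apriori(df, min_support=0.5, use_colnames=True, max_len=2))
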
+ 45 - 0
code/test2_kmeans.py

@@ -0,0 +1,45 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+import time
+import pandas as pd
+import gensim
+from gensim.models import Doc2Vec
+import multiprocessing as mp
+import numpy as np
+import psutil
+import os
+import distance
+import sklearn.cluster
+from sklearn.metrics import adjusted_rand_score
+
+items = pd.read_csv("product.csv")
+items = np.asarray(items)
+documents = np.concatenate(items, axis=0)
+
+vectorizer = TfidfVectorizer(stop_words='english')
+X = vectorizer.fit_transform(documents)
+
+
+true_k = 2
+model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
+model.fit(X)
+
+print("Top terms per cluster:")
+order_centroids = model.cluster_centers_.argsort()[:, ::-1]
+terms = vectorizer.get_feature_names()
+for i in range(true_k):
+    print("Interest group %d:" % i)
+    for ind in order_centroids[i, :50]:
+        print(' %s' % terms[ind])
+    print()
+
+print("\n")
+print("Prediction")
+
+Y = vectorizer.transform(["hardrock", "movies", "music",])
+prediction = model.predict(Y)
+print(prediction)
+
+Y = vectorizer.transform(["cheese", "sports", "football","Yogurt"])
+prediction = model.predict(Y)
+print(prediction)

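test2_kmeans.py imports adjusted_rand_score but never calls it. If ground-truth group labels were available for the documents, clustering agreement could be scored as below; the label lists here are purely hypothetical:

    from sklearn.metrics import adjusted_rand_score

    true_labels = [0, 0, 1, 1, 1, 0]        # hypothetical ground truth
    kmeans_labels = [1, 1, 0, 0, 0, 1]      # cluster ids are arbitrary; ARI ignores relabelling
    print(adjusted_rand_score(true_labels, kmeans_labels))   # 1.0, a perfect match
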
+ 78 - 0
code/test3_kmeans.py

@@ -0,0 +1,78 @@
+import numpy as np
+import time
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.feature_extraction import text
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from nltk.tokenize import RegexpTokenizer
+from nltk.stem.snowball import SnowballStemmer
+import psutil
+import os
+import multiprocessing as mp
+
+
+num_cores = mp.cpu_count()
+print("The kernal has",num_cores, "cores and you can find information regarding memory usage in",
+      psutil.virtual_memory())
+
+items = pd.read_csv("product.csv")
+print(items.head())
+print(items.info())
+items[items['Items'].duplicated(keep=False)].sort_values('Items').head(8)
+items = items.drop_duplicates('Items')
+start_time = time.time()
+
+punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
+stop_words = text.ENGLISH_STOP_WORDS #commonly used words to ignore (such as and,or,is,etc)
+desc = items['Items'].values
+vectorizer = TfidfVectorizer(stop_words=stop_words)
+X = vectorizer.fit_transform(desc)
+
+word_features = vectorizer.get_feature_names()
+print(len(word_features))
+print(word_features[10000:10002])
+
+stemmer = SnowballStemmer('english')
+tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')
+
+def tokenize(text):
+    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]
+
+vectorizer2 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize)
+X2 = vectorizer2.fit_transform(desc)
+word_features2 = vectorizer2.get_feature_names()
+print(len(word_features2))
+print(word_features2[:10])
+
+vectorizer3 = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, max_features=1000)
+X3 = vectorizer3.fit_transform(desc)
+words = vectorizer3.get_feature_names()
+
+size = []
+for i in range(1,11):
+    kmeans = KMeans(n_clusters=i,init='k-means++', max_iter=300, n_init=10, random_state=0)
+    kmeans.fit(X3)
+    size.append(kmeans.inertia_)
+plt.plot(range(1,11), size)
+plt.title('The Elbow Method')
+plt.xlabel('Number of clusters')
+plt.ylabel('WCSS')
+plt.savefig('elbow.png')
+plt.show()
+
+print(words[250:300])
+
+# n_init: number of times k-means is run with different centroid seeds; n_jobs: number of CPU cores to use
+kmeans = KMeans(n_clusters=3, n_init=20, n_jobs=2)
+kmeans.fit(X3)
+# We look at 3 the clusters generated by k-means.
+common_words = kmeans.cluster_centers_.argsort()[:, -1:-26:-1]
+print(common_words)
+for num, centroid in enumerate(common_words):
+    print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+

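The custom tokenizer in test3_kmeans.py lowercases the text, keeps only letters and apostrophes, and applies Snowball stemming before TF-IDF. A quick standalone check of what it produces on a hypothetical product description:

    from nltk.tokenize import RegexpTokenizer
    from nltk.stem.snowball import SnowballStemmer

    stemmer = SnowballStemmer('english')
    tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

    def tokenize(text):
        return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

    print(tokenize("Running shoes and summer dresses"))
    # e.g. ['run', 'shoe', 'and', 'summer', 'dress']
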
+ 131 - 0
code/test5_word2vec.py

@@ -0,0 +1,131 @@
+import pandas as pd
+import numpy as np
+from pprint import pprint
+from collections import Counter
+#import re
+#from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.metrics.pairwise import cosine_similarity
+from scipy import sparse
+#from sklearn.preprocessing import OneHotEncoder
+
+interest_group_centroids = []                               # cluster centroids on which the interest groups are formed
+interest_groups = []                                        # most similar items for each centroid in the interest group
+data = pd.read_csv('lastfm.csv')                            # Reading the CSV file
+#print(data)
+
+# Create a new dataframe without the user ids.
+data_items = data.drop('user', 1)                           # Drop the user column for item-item similarity calculation
+print('Dimension of loaded data is:', np.ndim(data_items))  #Dimension of the loaded data
+items_len = len(data_items.columns)                         # length of the items in the dataset
+length = []                                                 # stores the indices of the centroids
+
+
+print('\n\n#########################################CENTROIDS#####################################################\n\n')
+
+p = (items_len-1) // 5
+r = p
+length.append(p)
+
+for index in range(0, 4):
+    items_len = int(round(r + p))
+    r = items_len
+    length.append(items_len)
+print(length)
+
+
+for index in length:
+    centroids = data_items.columns.values[index]
+    interest_group_centroids.append(centroids)
+print('The Centroids are = ', interest_group_centroids, '\n\n')
+
+
+'############SIMILARITY#################'
+
+
+'# As a first step we normalize the user vectors to unit vectors.'
+
+magnitude = np.sqrt(np.square(data_items).sum(axis=1))
+data_items = data_items.divide(magnitude, axis='index')
+
+'#print(data_items.head(5))'
+
+
+def calculate_similarity(data_items):
+    data_sparse = sparse.csr_matrix(data_items)
+    similarities = cosine_similarity(data_sparse.transpose())
+    #print(similarities)
+    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
+    return sim
+'# Build the similarity matrix'
+data_matrix = calculate_similarity(data_items)
+'# data_matrix now holds the item-item similarity for every pair of items'
+
+
+print('##############INTEREST GROUPS##################\n\n')
+
+
+for i in interest_group_centroids:
+    Interest_group = data_matrix.loc[i].nlargest(p).index.values
+    print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
+    interest_groups.append(Interest_group)
+#print(interest_groups)
+
+print('###############USERS###################\n\n')
+
+
+user = 19695                                           # The id of the user for whom we want to generate recommendations
+user_index = data[data.user == user].index.tolist()[0] # Get the frame index
+#print('user index is: ', user_index)
+known_user_likes = data_items.loc[user_index]
+known_user_likes = known_user_likes[known_user_likes > 0].index.values
+
+print('user', user_index, 'likes', known_user_likes, '\n')
+
+print('###############USERS ASSOCIATION###################\n\n')
+
+for i in interest_groups:
+    a_vals = Counter(i)
+    b_vals = Counter(known_user_likes)
+
+    # convert to word-vectors
+    words = list(a_vals.keys() | b_vals.keys())
+    a_vect = [a_vals.get(word, 0) for word in words]
+    b_vect = [b_vals.get(word, 0) for word in words]
+    # find cosine
+    len_a = sum(av * av for av in a_vect) ** 0.5
+    len_b = sum(bv * bv for bv in b_vect) ** 0.5
+    dot = sum(av * bv for av, bv in zip(a_vect, b_vect))
+    cosine = dot / (len_a * len_b)
+
+    if cosine == 0:
+        pass
+    else:
+        print('User:', user_index, 'is associated to the Interest group with similarity:', cosine)
+
+
+
+
+
+# def jaccard_similarity_score(df):
+# #     """Calculate the column-wise cosine similarity for a sparse
+# #     matrix. Return a new dataframe matrix with similarities.
+# #     """
+#       data_sparse = sparse.csr_matrix(df)
+#       similarities = jaccard_similarity_score(data_sparse.transpose())
+#       similarities = 1 - pairwise_distances(df.T, metric='hamming')
+#       sim = pd.DataFrame(data=similarities, index=df.columns, columns=df.columns)
+#       return sim
+# #
+# data_matrix2 = jaccard_similarity_score(df)
+# print(data_matrix2)
+# # #print(data_matrix.loc['aerosmith'].nlargest(6))
+
+
+
+# kmeans = KMeans(n_clusters=2, n_init=20, n_jobs=2)
+# kmeans.fit(data_matrix)
+# # We look at 3 the clusters generated by k-means.
+# common_words = kmeans.cluster_centers_[:1]
+# print(common_words)
+#     #for num, centroid in enumerate(common_words):
+#     #print("Interest group", str(num) + ' : ' + ', '.join(words[word] for word in centroid))

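The association step in test5_word2vec.py turns the user's genre list and each interest group into count vectors and compares them with a cosine. A minimal worked example of that calculation on hypothetical lists:

    from collections import Counter

    group_genres = ['Pop', 'Dance', 'Indie Pop']      # hypothetical interest group
    user_genres = ['Dance', 'Rock']                   # hypothetical user likes

    a_vals, b_vals = Counter(group_genres), Counter(user_genres)
    words = list(a_vals.keys() | b_vals.keys())
    a_vect = [a_vals.get(w, 0) for w in words]
    b_vect = [b_vals.get(w, 0) for w in words]

    dot = sum(av * bv for av, bv in zip(a_vect, b_vect))      # 1 shared genre
    len_a = sum(av * av for av in a_vect) ** 0.5              # sqrt(3)
    len_b = sum(bv * bv for bv in b_vect) ** 0.5              # sqrt(2)
    print(dot / (len_a * len_b))                              # ~0.408
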
+ 260 - 0
code/transpose.py

@@ -0,0 +1,260 @@
+import json
+import matplotlib.pyplot as plt
+#from pprint import pprint
+import csv
+from collections import Counter
+from sklearn.metrics.pairwise import cosine_similarity
+from mlxtend.frequent_patterns import apriori
+from mlxtend.preprocessing import TransactionEncoder
+import pandas as pd
+from scipy import sparse
+import numpy as np
+import time
+
+
+"""
+######################DATASET INFORMATION##########################################
+The data was collected from the music streaming service Deezer (November 2017).
+These datasets represent friendship networks of users from 3 European countries.
+Nodes represent the users and edges are the mutual friendships. We reindexed the
+nodes in order to achieve a certain level of anonymity. The csv files contain the
+edges -- nodes are indexed from 0. The json files contain the genre preferences of
+users -- each key is a user id, the genres loved are given as lists. Genre notations
+are consistent across users. In each dataset users could like 84 distinct genres.
+Liked genre lists were compiled based on the liked song lists. The countries included
+are Romania, Croatia and Hungary. For each dataset we listed the number of nodes and edges.
+"""
+
+with open('RO_genres.json') as data_file:
+    data = json.load(data_file)
+
+'#print(data.keys())'
+
+users = []                              # Users in the network who uses the service
+items = []                              # Items liked by users in the network
+recommendations = []                    # Recommendations generated to the users after mining frequent itemsets
+
+for key in data.keys():                 # Retrieving the ID of each user
+    users.append(key)
+
+for val in data.values():               # Retrieving the ITEMS liked by each user in the network
+    items.append(val)
+
+'#print(users)'
+'#Users in the network, for example:'
+#['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18',...,'41772']
+
+'#print(items)'
+'#Items liked by all the users in the network, for example:'
+#['Dance', 'Soul & Funk', 'Pop', 'Musicals', 'Contemporary R&B', 'Indie Pop', 'Alternative'],
+
+res = items
+my_df = pd.DataFrame(res)
+my_df.to_csv('out.csv', index=False, header=False)
+'#print(my_df.head())'
+
+'# Transposing the items and users into Binary matrix'
+
+te = TransactionEncoder()
+te_ary = te.fit(items).transform(items)
+df = pd.DataFrame(te_ary, columns=te.columns_)
+'#print(df.head())'
+
+'# prints the Binary matrix elements, for example:'
+#    Acoustic Blues  African Music     ...      Vocal jazz  West Coast
+# 0           False          False     ...           False       False
+# 1           False          False     ...           False       False
+# 2           False          False     ...           False       False
+# 3           False          False     ...           False       False
+# 4           False          False     ...           False       False
+'#print(te.columns_)'
+
+# Resulting binary matrix to csv file
+
+res = df
+my_df = pd.DataFrame(res)
+my_df.to_csv('result.csv', index=True, header=True)
+
+
+data = pd.read_csv('result.csv')
+data.rename(columns={'Unnamed: 0': 'user'}, inplace=True)
+'#print(data.head())'
+
+'# prints the Binary matrix elements in result.csv, for example:'
+# user  Acoustic Blues        ...      Vocal jazz  West Coast
+# 0     0           False     ...           False       False
+# 1     1           False     ...           False       False
+# 2     2           False     ...           False       False
+# 3     3           False     ...           False       False
+# 4     4           False     ...           False       False
+
+
+data_items = data.drop('user', 1)
+
+print('Dimension of loaded data is:', np.ndim(data_items))
+
+interest_group_centroids = []                               # cluster centroids on which the interest groups are formed
+interest_groups = []                                        # most similar items for each centroid in the interest group
+items_len = len(data_items.columns)                         # length of the items in the dataset
+length = []  # stores the indices of the centroids
+print(items_len)
+print('\n\n#########################################CENTROIDS#####################################################\n\n')
+
+p = (items_len-1) // 6
+r = p
+length.append(p)
+
+for index in range(0, 3):
+    items_len = int(round(r + p))
+    r = items_len
+    length.append(items_len)
+'#print(length)'
+'#Index of the centroid elements, for example:'
+#[16, 32, 48, 64, 80]
+
+'# Calculating the centroids based on the length of the items in the DATASET: result.csv'
+
+for index in length:                                        # for each centroid in the length
+    centroids = data_items.columns.values[index]
+    interest_group_centroids.append(centroids)
+#print('The Centroids are = ', interest_group_centroids, '\n\n')
+#For example: The Centroids are =  ['Comedy', 'Electro Hip Hop', 'Jazz Hip Hop', 'Rap/Hip Hop', 'Tropical']
+
+print('\n\n#########################################ITEM-ITEM_SIMILARITY##########################################\n\n')
+start_time = time.time()
+'# As a first step we normalize the user vectors to unit vectors.'
+
+magnitude = np.sqrt(np.square(data_items).sum(axis=1))
+data_items = data_items.divide(magnitude, axis='index')
+
+'#print(data_items.head(5))'
+
+
+def calculate_similarity(data_items):
+    data_sparse = sparse.csr_matrix(data_items)
+    similarities = cosine_similarity(data_sparse.transpose())
+    '#print(similarities)'
+    sim = pd.DataFrame(data=similarities, index=data_items.columns, columns=data_items.columns)
+    return sim
+
+'# Build the similarity matrix'
+data_matrix = calculate_similarity(data_items)
+'#print(data_matrix.head())'
+
+#''prints the item-item similarity matrix for all items in DATASET, for example:'
+#                      Acoustic Blues     ...      West Coast
+# Acoustic Blues             1.000000     ...        0.000000
+# African Music              0.044191     ...        0.005636
+# Alternative                0.008042     ...        0.028171
+# Alternative Country        0.037340     ...        0.011230
+# Asian Music                0.000000     ...        0.004623
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+print('\n\n#########################################INTEREST GROUPS###############################################\n\n')
+
+
+for i in interest_group_centroids:
+    Interest_group = data_matrix.loc[i].nlargest(p).index.values
+    print('Interest group', interest_group_centroids.index(i), ' = ', Interest_group, '\n')
+    interest_groups.append(Interest_group)
+'#print(interest_groups)'
+
+print(set(interest_groups[1]).intersection(interest_groups[3]))
+
+print('\n\n#######################FREQUENT-ITEMSETS_APRIORI#######################################################\n\n')
+
+start_time = time.time()
+d = apriori(df, min_support=0.2, use_colnames=True, max_len=5)
+print((d['itemsets']))
+
+print("--- %s seconds ---" % (time.time() - start_time))
+
+print('#############################################USERS & THEIR LIKES###########################################\n\n')
+
+user = 2222     # The id of the user for whom we want to generate recommendations
+user_index = data[data.user == user].index.tolist()[0]  # Get the frame index
+#print('user index is: ', user_index)
+known_user_likes = data_items.loc[user_index]
+known_user_likes = known_user_likes[known_user_likes > 0].index.values
+print('user', user_index, 'likes', known_user_likes, '\n')
+
+
+print('#############################################USERS ASSOCIATION TO INTEREST GROUPS##########################\n\n')
+
+
+# for i in interest_groups:
+#     a_vals = Counter(i)
+#     b_vals = Counter(known_user_likes)
+#
+#     # convert to word-vectors, for Example:
+#     # [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+#     # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+#     # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+#     # [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+#
+#     words = list(a_vals.keys() | b_vals.keys())
+#     a_vect = [a_vals.get(word, 0) for word in words]
+#     b_vect = [b_vals.get(word, 0) for word in words]
+#     # find cosine
+#     len_a = sum(av * av for av in a_vect) ** 0.5
+#     len_b = sum(bv * bv for bv in b_vect) ** 0.5
+#     dot = sum(av * bv for av, bv in zip(a_vect, b_vect))
+#     cosine = dot / (len_a * len_b)
+#
+#     if cosine == 0:
+#         pass
+#     else:
+#         #print('User:', user_index, 'is associated to the Interest group:', i, 'with similarity:', cosine)
+#         print('')
+
+Selected_users_association_IG = [user]
+
+for i in interest_groups:
+    interest_groups_set = set(i)
+    user_likes_set = set(known_user_likes)
+    sim_num = user_likes_set.intersection(interest_groups_set)
+    sim_den = user_likes_set.union(interest_groups_set)
+    sim = len(sim_num)/len(sim_den)
+
+    if sim > 0:
+        g = 'User:', user_index, 'is associated to the Interest group:', i, 'with similarity:', sim
+        ass_interest_groups = i
+        Selected_users_association_IG.append(ass_interest_groups.tolist())
+
+print(Selected_users_association_IG[1])
+
+
+#user_likes_set.intersection(interest_groups_set)
+
+print('\n\n#########################################CLIENT_SIDE_RECOMMENDATIONS###################################\n\n')
+
+left = []
+right = []
+R = []
+
+for i in range(0, len(d['itemsets'])):
+    f_s = d['itemsets'][i]
+    #print('Recommendation', i, 'is: ', f_s)
+    LHS = f_s
+    RHS = f_s
+    l, *_ = LHS
+    *_, r = RHS
+    #print(l)
+    left.append(l)
+    right.append(r)
+#for index in range(1, len(Selected_users_association_IG)):
+    #if l in set(Selected_users_association_IG[index]):
+         #print(l,'exist')# LHS in user and if LHS present recommend
+    if l in set(known_user_likes):
+       print('user', user_index, 'gets recommendation:', r)
+       R.append(r)
+
+
+
+precision = len(set(known_user_likes).intersection(set(R))) / len(set(R))
+Recall = len(set(known_user_likes).intersection(set(R))) / len(known_user_likes)
+
+
+    #print('Items to be checked in users list', l, '\n')
+    #print('If item', l, 'is present', 'recommend: ', r, '\n')

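transpose.py ends by scoring the generated recommendations with set-based precision (hits over recommended items) and recall (hits over the user's known likes). A tiny worked example of those two formulas on hypothetical sets:

    known_user_likes = {'Pop', 'Rock', 'Jazz', 'Blues'}
    R = {'Rock', 'Metal'}                            # hypothetical recommendations

    hits = known_user_likes & R
    precision = len(hits) / len(R)                   # 1 / 2 = 0.5
    recall = len(hits) / len(known_user_likes)       # 1 / 4 = 0.25
    print(precision, recall)
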