# test.py — cluster product names with Affinity Propagation over Levenshtein distances.
  1. import time
  2. import pandas as pd
  3. import multiprocessing as mp
  4. import numpy as np
  5. import psutil
  6. import os
  7. import distance
  8. import sklearn.cluster
  9. num_cores = mp.cpu_count()
  10. print("The kernal has",num_cores, "cores and you can find information regarding mermory usage in",
  11. psutil.virtual_memory())
  12. start_time = time.time()
  13. items = pd.read_csv("product.csv")
  14. print(os.path.getsize('product.csv'))
  15. items = np.asarray(items)
  16. merged_items = np.concatenate(items, axis=0)
  17. lev_similarity = -1*np.array([[distance.levenshtein(w1, w2)
  18. for w1 in merged_items] for w2 in merged_items])
  19. affprop = sklearn.cluster.AffinityPropagation(affinity="euclidean", damping=0.5, max_iter=200)
  20. affprop.fit(lev_similarity)
  21. for cluster_id in np.unique(affprop.labels_):
  22. exemplar = merged_items[affprop.cluster_centers_indices_[cluster_id]]
  23. cluster = np.unique(merged_items[np.nonzero(affprop.labels_ == cluster_id)])
  24. cluster_str = ", ".join(cluster)
  25. print(" - *%s:* %s" % (exemplar, cluster_str))
  26. print("--- %s seconds ---" % (time.time() - start_time))