clustering_based_k_anon.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. # -*- coding: utf-8 -*-
  2. """
  3. main module for cluster_based_k_anon
  4. """
  5. import operator
  6. import random
  7. import time
  8. from functools import cmp_to_key
  9. from basic_mondrian.models.numrange import NumRange
  10. from basic_mondrian.utils.utility import (cmp_str, get_num_list_from_str,
  11. qid_to_key)
# When True, print the final NCP percentage at the end of a run.
__DEBUG = False
# ATT_TREES: per-attribute generalization hierarchy root
# (a tree of nodes for categorical attributes, a NumRange for numeric ones);
# populated by init().
ATT_TREES = []
# LEN_DATA: number of records in the dataset being anonymized.
LEN_DATA = 0
# QI_LEN: number of quasi-identifier attributes (leading columns of each record).
QI_LEN = 0
# QI_RANGE: per-QI normalization width (numeric range, or leaf count under '*').
QI_RANGE = []
# IS_CAT: per-QI flag — True for categorical attributes, False for numeric.
IS_CAT = []
# get_LCA, generalization and NCP dominate running time and are called with
# many duplicate arguments, so their results are memoized in these caches.
LCA_CACHE = []  # one dict per QI attribute: item pair -> LCA value
NCP_CACHE = {}  # qid key (see qid_to_key) -> NCP value
  24. class Cluster(object):
  25. """Cluster is for cluster based k-anonymity
  26. self.member: record list in cluster
  27. self.gen_result: generlized value for one cluster
  28. """
  29. def __init__(self, member, gen_result, information_loss=0.0):
  30. self.information_loss = information_loss
  31. self.member = member
  32. self.gen_result = gen_result[:]
  33. self.center = gen_result[:]
  34. for i in range(QI_LEN):
  35. if IS_CAT[i] is False:
  36. self.center[i] = str(sum([float(t[i]) for t in self.member]) * 1.0 / len(self.member))
  37. def add_record(self, record):
  38. """
  39. add record to cluster
  40. """
  41. self.member.append(record)
  42. self.update_gen_result(record, record)
  43. def update_cluster(self):
  44. """update cluster information when member is changed
  45. """
  46. self.gen_result = cluster_generalization(self.member)
  47. for i in range(QI_LEN):
  48. if IS_CAT[i]:
  49. self.center[i] = self.gen_result[i]
  50. else:
  51. self.center[i] = str(sum([float(t[i]) for t in self.member]) * 1.0 / len(self.member))
  52. self.information_loss = len(self.member) * NCP(self.gen_result)
  53. def update_gen_result(self, merge_gen_result, center, num=1):
  54. """
  55. update gen_result and information_loss after adding record or merging cluster
  56. :param merge_gen_result:
  57. :return:
  58. """
  59. self.gen_result = generalization(self.gen_result, merge_gen_result)
  60. current_len = len(self.member)
  61. for i in range(QI_LEN):
  62. if IS_CAT[i]:
  63. self.center[i] = self.gen_result[i]
  64. else:
  65. self.center[i] = str((float(self.center[i]) * (current_len - num) +
  66. float(center[i]) * num) / current_len)
  67. self.information_loss = len(self.member) * NCP(self.gen_result)
  68. def add_same_record(self, record):
  69. """
  70. add record with same qid to cluster
  71. """
  72. self.member.append(record)
  73. def merge_cluster(self, cluster):
  74. """merge cluster into self and do not delete cluster elements.
  75. update self.gen_result
  76. """
  77. self.member.extend(cluster.member)
  78. self.update_gen_result(cluster.gen_result, cluster.center, len(cluster))
  79. def __getitem__(self, item):
  80. """
  81. :param item: index number
  82. :return: gen_result[item]
  83. """
  84. return self.gen_result[item]
  85. def __len__(self):
  86. """
  87. return number of records in cluster
  88. """
  89. return len(self.member)
  90. def __str__(self):
  91. return str(self.gen_result)
  92. def r_distance(source, target):
  93. """
  94. Return distance between source (cluster or record)
  95. and target (cluster or record). The distance is based on
  96. NCP (Normalized Certainty Penalty) on relational part.
  97. If source or target are cluster, func need to multiply
  98. source_len (or target_len).
  99. """
  100. source_gen = source
  101. target_gen = target
  102. source_len = 1
  103. target_len = 1
  104. # check if target is Cluster
  105. if isinstance(target, Cluster):
  106. target_gen = target.gen_result
  107. target_len = len(target)
  108. # check if souce is Cluster
  109. if isinstance(source, Cluster):
  110. source_gen = source.gen_result
  111. source_len = len(source)
  112. if source_gen == target_gen:
  113. return 0
  114. gen = generalization(source_gen, target_gen)
  115. # len should be taken into account
  116. distance = (source_len + target_len) * NCP(gen)
  117. return distance
  118. def diff_distance(record, cluster):
  119. """
  120. Return IL(cluster and record) - IL(cluster).
  121. """
  122. gen_after = generalization(record, cluster.gen_result)
  123. return NCP(gen_after) * (len(cluster) + 1) - cluster.information_loss
  124. def NCP(record):
  125. """Compute NCP (Normalized Certainty Penalty)
  126. when generate record to gen_result.
  127. """
  128. ncp = 0.0
  129. # exclude SA values(last one type [])
  130. list_key = qid_to_key(record)
  131. try:
  132. return NCP_CACHE[list_key]
  133. except KeyError:
  134. pass
  135. for i in range(QI_LEN):
  136. # if leaf_num of numerator is 1, then NCP is 0
  137. width = 0.0
  138. if IS_CAT[i] is False:
  139. try:
  140. float(record[i])
  141. except ValueError:
  142. temp = record[i].split(',')
  143. width = float(temp[1]) - float(temp[0])
  144. else:
  145. width = len(ATT_TREES[i][record[i]]) * 1.0
  146. width /= QI_RANGE[i]
  147. ncp += width
  148. NCP_CACHE[list_key] = ncp
  149. return ncp
  150. def get_LCA(index, item1, item2):
  151. """Get lowest commmon ancestor (including themselves)"""
  152. # get parent list from
  153. if item1 == item2:
  154. return item1
  155. try:
  156. return LCA_CACHE[index][item1 + item2]
  157. except KeyError:
  158. pass
  159. parent1 = ATT_TREES[index][item1].parent[:]
  160. parent2 = ATT_TREES[index][item2].parent[:]
  161. parent1.insert(0, ATT_TREES[index][item1])
  162. parent2.insert(0, ATT_TREES[index][item2])
  163. min_len = min(len(parent1), len(parent2))
  164. last_LCA = parent1[-1]
  165. # note here: when trying to access list reversely, take care of -0
  166. for i in range(1, min_len + 1):
  167. if parent1[-i].value == parent2[-i].value:
  168. last_LCA = parent1[-i]
  169. else:
  170. break
  171. LCA_CACHE[index][item1 + item2] = last_LCA.value
  172. return last_LCA.value
  173. def generalization(record1, record2):
  174. """
  175. Compute relational generalization result of record1 and record2
  176. """
  177. gen = []
  178. for i in range(QI_LEN):
  179. if IS_CAT[i] is False:
  180. split_number = []
  181. split_number.extend(get_num_list_from_str(record1[i]))
  182. split_number.extend(get_num_list_from_str(record2[i]))
  183. split_number = list(set(split_number))
  184. if len(split_number) == 1:
  185. gen.append(split_number[0])
  186. else:
  187. split_number.sort(key=cmp_to_key(cmp_str))
  188. gen.append(split_number[0] + ',' + split_number[-1])
  189. else:
  190. gen.append(get_LCA(i, record1[i], record2[i]))
  191. return gen
  192. def cluster_generalization(records):
  193. """
  194. calculat gen_result of records(list) recursively.
  195. Compute both relational gen_result for records (list).
  196. """
  197. len_r = len(records)
  198. gen = records[0]
  199. for i in range(1, len_r):
  200. gen = generalization(gen, records[i])
  201. return gen
  202. def find_best_knn(index, k, data):
  203. """key fuction of KNN. Find k nearest neighbors of record, remove them from data"""
  204. dist_dict = {}
  205. record = data[index]
  206. # add random seed to cluster
  207. for i, t in enumerate(data):
  208. if i == index:
  209. continue
  210. dist = r_distance(record, t)
  211. dist_dict[i] = dist
  212. sorted_dict = sorted(dist_dict.items(), key=operator.itemgetter(1))
  213. knn = sorted_dict[:k - 1]
  214. knn.append((index, 0))
  215. record_index = [t[0] for t in knn]
  216. elements = [data[t[0]] for t in knn]
  217. gen = cluster_generalization(elements)
  218. cluster = Cluster(elements, gen, k * NCP(gen))
  219. # delete multiple elements from data according to knn index list
  220. return cluster, record_index
  221. def find_best_cluster_iloss(record, clusters):
  222. """residual assignment. Find best cluster for record."""
  223. min_distance = 1000000000000
  224. min_index = 0
  225. best_cluster = clusters[0]
  226. for i, t in enumerate(clusters):
  227. distance = r_distance(record, t.gen_result)
  228. if distance < min_distance:
  229. min_distance = distance
  230. min_index = i
  231. best_cluster = t
  232. # add record to best cluster
  233. return min_index
  234. def find_best_cluster_iloss_increase(record, clusters):
  235. """residual assignment. Find best cluster for record."""
  236. min_diff = 1000000000000
  237. min_index = 0
  238. best_cluster = clusters[0]
  239. for i, t in enumerate(clusters):
  240. IF_diff = diff_distance(record, t)
  241. if IF_diff < min_diff:
  242. min_distance = IF_diff
  243. min_index = i
  244. best_cluster = t
  245. # add record to best cluster
  246. return min_index
  247. def find_furthest_record(record, data):
  248. """
  249. :param record: the latest record be added to cluster
  250. :param data: remain records in data
  251. :return: the index of the furthest record from r_index
  252. """
  253. max_distance = 0
  254. max_index = -1
  255. for index in range(len(data)):
  256. current_distance = r_distance(record, data[index])
  257. if current_distance >= max_distance:
  258. max_distance = current_distance
  259. max_index = index
  260. return max_index
  261. def find_best_record_iloss_increase(cluster, data):
  262. """
  263. :param cluster: current
  264. :param data: remain dataset
  265. :return: index of record with min diff on information loss
  266. """
  267. min_diff = 1000000000000
  268. min_index = 0
  269. for index, record in enumerate(data):
  270. # IL(cluster and record) and |cluster| + 1 is a constant
  271. # so IL(record, cluster.gen_result) is enough
  272. IF_diff = diff_distance(record, cluster)
  273. if IF_diff < min_diff:
  274. min_diff = IF_diff
  275. min_index = index
  276. return min_index
  277. def clustering_knn(data, k=25):
  278. """
  279. Group record according to QID distance. KNN
  280. """
  281. clusters = []
  282. # randomly choose seed and find k-1 nearest records to form cluster with size k
  283. while len(data) >= k:
  284. index = random.randrange(len(data))
  285. cluster, record_index = find_best_knn(index, k, data)
  286. data = [t for i, t in enumerate(data[:]) if i not in set(record_index)]
  287. clusters.append(cluster)
  288. # residual assignment
  289. while len(data) > 0:
  290. t = data.pop()
  291. cluster_index = find_best_cluster_iloss(t, clusters)
  292. clusters[cluster_index].add_record(t)
  293. return clusters
  294. def clustering_kmember(data, k=25):
  295. """
  296. Group record according to NCP. K-member
  297. """
  298. clusters = []
  299. # randomly choose seed and find k-1 nearest records to form cluster with size k
  300. r_pos = random.randrange(len(data))
  301. r_i = data[r_pos]
  302. while len(data) >= k:
  303. r_pos = find_furthest_record(r_i, data)
  304. r_i = data.pop(r_pos)
  305. cluster = Cluster([r_i], r_i)
  306. while len(cluster) < k:
  307. r_pos = find_best_record_iloss_increase(cluster, data)
  308. r_j = data.pop(r_pos)
  309. cluster.add_record(r_j)
  310. clusters.append(cluster)
  311. # residual assignment
  312. while len(data) > 0:
  313. t = data.pop()
  314. cluster_index = find_best_cluster_iloss_increase(t, clusters)
  315. clusters[cluster_index].add_record(t)
  316. return clusters
  317. def adjust_cluster(cluster, residual, k):
  318. center = cluster.center
  319. dist_dict = {}
  320. # add random seed to cluster
  321. for i, t in enumerate(cluster.member):
  322. dist = r_distance(center, t)
  323. dist_dict[i] = dist
  324. sorted_dict = sorted(dist_dict.items(), key=operator.itemgetter(1))
  325. need_adjust_index = [t[0] for t in sorted_dict[k:]]
  326. need_adjust = [cluster.member[t] for t in need_adjust_index]
  327. residual.extend(need_adjust)
  328. # update cluster
  329. cluster.member = [t for i, t in enumerate(cluster.member)
  330. if i not in set(need_adjust_index)]
  331. cluster.update_cluster()
  332. def clustering_oka(data, k=25):
  333. """
  334. Group record according to NCP. OKA: one time pass k-means
  335. """
  336. clusters = []
  337. can_clusters = []
  338. less_clusters = []
  339. # randomly choose seed and find k-1 nearest records to form cluster with size k
  340. # MODIFIED: int cast
  341. seed_index = random.sample(range(len(data)), int(len(data) / k))
  342. for index in seed_index:
  343. record = data[index]
  344. can_clusters.append(Cluster([record], record))
  345. data = [t for i, t in enumerate(data[:]) if i not in set(seed_index)]
  346. while len(data) > 0:
  347. record = data.pop()
  348. index = find_best_cluster_iloss(record, can_clusters)
  349. can_clusters[index].add_record(record)
  350. residual = []
  351. for cluster in can_clusters:
  352. if len(cluster) < k:
  353. less_clusters.append(cluster)
  354. else:
  355. if len(cluster) > k:
  356. adjust_cluster(cluster, residual, k)
  357. clusters.append(cluster)
  358. while len(residual) > 0:
  359. record = residual.pop()
  360. if len(less_clusters) > 0:
  361. index = find_best_cluster_iloss(record, less_clusters)
  362. less_clusters[index].add_record(record)
  363. if len(less_clusters[index]) >= k:
  364. clusters.append(less_clusters.pop(index))
  365. else:
  366. index = find_best_cluster_iloss(record, clusters)
  367. clusters[index].add_record(record)
  368. return clusters
  369. def init(att_trees, data, SA_num, QI_num=-1):
  370. """
  371. init global variables
  372. """
  373. global ATT_TREES, DATA_BACKUP, LEN_DATA, QI_RANGE, IS_CAT, QI_LEN, LCA_CACHE, NCP_CACHE, SA_INDEX
  374. SA_INDEX = SA_num
  375. ATT_TREES = att_trees
  376. QI_RANGE = []
  377. IS_CAT = []
  378. LEN_DATA = len(data)
  379. LCA_CACHE = []
  380. NCP_CACHE = {}
  381. if QI_num <= 0:
  382. QI_LEN = len(data[0]) - 1
  383. else:
  384. QI_LEN = QI_num
  385. for i in range(QI_LEN):
  386. LCA_CACHE.append(dict())
  387. if isinstance(ATT_TREES[i], NumRange):
  388. IS_CAT.append(False)
  389. QI_RANGE.append(ATT_TREES[i].range)
  390. else:
  391. IS_CAT.append(True)
  392. QI_RANGE.append(len(ATT_TREES[i]['*']))
  393. def clustering_based_k_anon(att_trees, data, k, QI_num, SA_num, type_alg):
  394. """
  395. the main function of clustering based k-anon
  396. """
  397. init(att_trees, data, SA_num, QI_num)
  398. result = []
  399. start_time = time.time()
  400. if type_alg == 'knn':
  401. print("Begin to KNN Cluster based on NCP")
  402. clusters = clustering_knn(data, k)
  403. elif type_alg == 'kmember':
  404. print("Begin to K-Member Cluster based on NCP")
  405. clusters = clustering_kmember(data, k)
  406. elif type_alg == 'oka':
  407. print("Begin to OKA Cluster based on NCP")
  408. clusters = clustering_oka(data, k)
  409. else:
  410. print("Please choose merge algorithm types")
  411. print("knn | kmember")
  412. return (0, (0, 0))
  413. rtime = float(time.time() - start_time)
  414. ncp = 0.0
  415. for cluster in clusters:
  416. final_result = []
  417. for i in range(len(cluster)):
  418. # Custom! For non QI Values
  419. tmp = []
  420. for s in range(len(cluster.member[i]) - len(SA_INDEX), len(cluster.member[i])):
  421. tmp += [cluster.member[i][s]]
  422. final_result.append(cluster.gen_result + tmp)
  423. result.extend(final_result)
  424. ncp += cluster.information_loss
  425. ncp /= LEN_DATA
  426. ncp /= QI_LEN
  427. ncp *= 100
  428. if __DEBUG:
  429. print("NCP=", ncp)
  430. return (result, (ncp, rtime))