# top_down_greedy_anonymization.py
# -*- coding: utf-8 -*-
"""
Main module of the top-down greedy anonymization algorithm.
"""
import operator
import pdb
import random
import time
from functools import cmp_to_key

from basic_mondrian.models.numrange import NumRange
from basic_mondrian.utils.utility import cmp_str, get_num_list_from_str

__DEBUG = False  # print run statistics when True
QI_LEN = 5       # number of quasi-identifier attributes
GL_K = 0         # the k of k-anonymity for the current run
RESULT = []      # finished (non-splittable) partitions collected by anonymize()
ATT_TREES = []   # per-attribute generalization hierarchy or NumRange
QI_RANGE = []    # per-attribute normalization range used by NCP
ROUNDS = 3       # iterations of the heuristic far-pair search in get_pair()
IS_CAT = []      # IS_CAT[i] is True when attribute i is categorical
SA_INDEX = []    # indices of the sensitive attributes
  21. class Partition(object):
  22. """
  23. Class for Group, which is used to keep records
  24. Store tree node in instances.
  25. self.member: records in group
  26. self.middle: save the generalization result of this partition
  27. """
  28. def __init__(self, data, middle):
  29. """
  30. initialize with data and middle
  31. """
  32. self.can_split = True
  33. self.member = data[:]
  34. self.middle = middle[:]
  35. def __len__(self):
  36. """
  37. return the number of records in partition
  38. """
  39. return len(self.member)
  40. def NCP(record):
  41. """
  42. compute Certainlty Penalty of records
  43. """
  44. record_ncp = 0.0
  45. for i in range(QI_LEN):
  46. if IS_CAT[i] is False:
  47. value_ncp = 0
  48. try:
  49. float(record[i])
  50. except ValueError:
  51. split_number = record[i].split(',')
  52. value_ncp = float(split_number[1]) - float(split_number[0])
  53. value_ncp = value_ncp * 1.0 / QI_RANGE[i]
  54. record_ncp += value_ncp
  55. else:
  56. record_ncp += len(ATT_TREES[i][record[i]]) * 1.0 / QI_RANGE[i]
  57. return record_ncp
  58. def NCP_dis(record1, record2):
  59. """
  60. use the NCP of generalization record1 and record2 as distance
  61. """
  62. mid = middle_record(record1, record2)
  63. return NCP(mid), mid
  64. def NCP_dis_merge(partition, addition_set):
  65. """
  66. merge addition_set to current partition,
  67. update current partition.middle
  68. """
  69. mid = middle_group(addition_set)
  70. mid = middle_record(mid, partition.middle)
  71. return (len(addition_set) + len(partition)) * NCP(mid), mid
  72. def NCP_dis_group(record, partition):
  73. """
  74. compute the NCP of record and partition
  75. """
  76. mid = middle_record(record, partition.middle)
  77. ncp = NCP(mid)
  78. return (1 + len(partition)) * ncp
  79. def middle_record(record1, record2):
  80. """
  81. get the generalization result of record1 and record2
  82. """
  83. mid = []
  84. for i in range(QI_LEN):
  85. if IS_CAT[i] is False:
  86. split_number = []
  87. split_number.extend(get_num_list_from_str(record1[i]))
  88. split_number.extend(get_num_list_from_str(record2[i]))
  89. split_number.sort(key=cmp_to_key(cmp_str))
  90. # avoid 2,2 problem
  91. if split_number[0] == split_number[-1]:
  92. mid.append(split_number[0])
  93. else:
  94. mid.append(split_number[0] + ',' + split_number[-1])
  95. else:
  96. mid.append(LCA(record1[i], record2[i], i))
  97. return mid
  98. def middle_group(group_set):
  99. """
  100. get the generalization result of the group
  101. """
  102. len_group_set = len(group_set)
  103. mid = group_set[0]
  104. for i in range(1, len_group_set):
  105. mid = middle_record(mid, group_set[i])
  106. return mid
  107. def LCA(u, v, index):
  108. """
  109. get lowest common ancestor of u, v on generalization hierarchy (index)
  110. """
  111. gen_tree = ATT_TREES[index]
  112. # don't forget to add themselves (other the level will be higher)
  113. u_parent = list(gen_tree[u].parent)
  114. u_parent.insert(0, gen_tree[u])
  115. v_parent = list(gen_tree[v].parent)
  116. v_parent.insert(0, gen_tree[v])
  117. min_len = min(len(u_parent), len(v_parent))
  118. if min_len == 0:
  119. return '*'
  120. last = -1
  121. for i in range(min_len):
  122. pos = - 1 - i
  123. if u_parent[pos] != v_parent[pos]:
  124. break
  125. last = pos
  126. return u_parent[last].value
  127. def get_pair(partition):
  128. """
  129. To get max distance pair in partition, we need O(n^2) running time.
  130. The author proposed a heuristic method: random pick u and get max_dis(u, v)
  131. with O(n) running tiem; then pick max(v, u2)...after run ROUNDS times.
  132. the dis(u, v) is nearly max.
  133. """
  134. len_partition = len(partition)
  135. for i in range(ROUNDS):
  136. if i == 0:
  137. u = random.randrange(len_partition)
  138. else:
  139. u = v
  140. max_dis = -1
  141. max_index = 0
  142. for i in range(len_partition):
  143. if i != u:
  144. rncp, _ = NCP_dis(partition.member[i], partition.member[u])
  145. if rncp > max_dis:
  146. max_dis = rncp
  147. max_index = i
  148. v = max_index
  149. return (u, v)
  150. def distribute_record(u, v, partition):
  151. """
  152. Distribute records based on NCP distance.
  153. records will be assigned to nearer group.
  154. """
  155. record_u = partition.member[u][:]
  156. record_v = partition.member[v][:]
  157. u_partition = [record_u]
  158. v_partition = [record_v]
  159. remain_records = [item for index, item in enumerate(partition.member) if index not in set([u, v])]
  160. for record in remain_records:
  161. u_dis, _ = NCP_dis(record_u, record)
  162. v_dis, _ = NCP_dis(record_v, record)
  163. if u_dis > v_dis:
  164. v_partition.append(record)
  165. else:
  166. u_partition.append(record)
  167. return [Partition(u_partition, middle_group(u_partition)),
  168. Partition(v_partition, middle_group(v_partition))]
  169. def balance(sub_partitions, index):
  170. """
  171. Two kinds of balance methods.
  172. 1) Move some records from other groups
  173. 2) Merge with nearest group
  174. The algorithm will choose one of them with minimal NCP
  175. index store the sub_partition with less than k records
  176. """
  177. less = sub_partitions.pop(index)
  178. more = sub_partitions.pop()
  179. all_length = len(less) + len(more)
  180. require = GL_K - len(less)
  181. # First method
  182. dist = {}
  183. for i, record in enumerate(more.member):
  184. dist[i], _ = NCP_dis(less.middle, record)
  185. sorted_dist = sorted(dist.items(),
  186. key=operator.itemgetter(1))
  187. nearest_index = [t[0] for t in sorted_dist[:require]]
  188. addition_set = [t for i, t in enumerate(more.member) if i in set(nearest_index)]
  189. remain_set = [t for i, t in enumerate(more.member) if i not in set(nearest_index)]
  190. first_ncp, first_mid = NCP_dis_merge(less, addition_set)
  191. r_middle = middle_group(remain_set)
  192. first_ncp += len(remain_set) * NCP(r_middle)
  193. # Second method
  194. second_ncp, second_mid = NCP_dis(less.middle, more.middle)
  195. second_ncp *= all_length
  196. if first_ncp <= second_ncp:
  197. less.member.extend(addition_set)
  198. less.middle = first_mid
  199. more.member = remain_set
  200. more.middle = r_middle
  201. sub_partitions.append(more)
  202. else:
  203. less.member.extend(more.member)
  204. less.middle = second_mid
  205. less.can_split = False
  206. sub_partitions.append(less)
  207. def can_split(partition):
  208. """
  209. check if partition can be further splited.
  210. """
  211. if partition.can_split is False:
  212. return False
  213. if len(partition) < 2 * GL_K:
  214. return False
  215. return True
  216. def anonymize(partition):
  217. """
  218. Main procedure of top_down_greedy_anonymization.
  219. recursively partition groups until not allowable.
  220. """
  221. if can_split(partition) is False:
  222. RESULT.append(partition)
  223. return
  224. u, v = get_pair(partition)
  225. sub_partitions = distribute_record(u, v, partition)
  226. if len(sub_partitions[0]) < GL_K:
  227. balance(sub_partitions, 0)
  228. elif len(sub_partitions[1]) < GL_K:
  229. balance(sub_partitions, 1)
  230. # watch dog
  231. p_sum = len(partition)
  232. c_sum = 0
  233. for sub_partition in sub_partitions:
  234. c_sum += len(sub_partition)
  235. if p_sum != c_sum:
  236. pdb.set_trace()
  237. for sub_partition in sub_partitions:
  238. anonymize(sub_partition)
  239. def init(att_trees, data, k, QI_num, SA_num):
  240. """
  241. reset all gloabl variables
  242. """
  243. global GL_K, RESULT, QI_LEN, ATT_TREES, QI_RANGE, IS_CAT, SA_INDEX
  244. ATT_TREES = att_trees
  245. for t in att_trees:
  246. if isinstance(t, NumRange):
  247. IS_CAT.append(False)
  248. else:
  249. IS_CAT.append(True)
  250. QI_LEN = QI_num
  251. SA_INDEX = SA_num
  252. GL_K = k
  253. RESULT = []
  254. QI_RANGE = []
  255. def Top_Down_Greedy_Anonymization(att_trees, data, k, QI_num, SA_num):
  256. """
  257. Top Down Greedy Anonymization is a heuristic algorithm
  258. for relational dataset with numeric and categorical attbitues
  259. """
  260. init(att_trees, data, k, QI_num, SA_num)
  261. result = []
  262. middle = []
  263. for i in range(QI_LEN):
  264. if IS_CAT[i] is False:
  265. QI_RANGE.append(ATT_TREES[i].range)
  266. middle.append(ATT_TREES[i].value)
  267. else:
  268. QI_RANGE.append(len(ATT_TREES[i]['*']))
  269. middle.append('*')
  270. whole_partition = Partition(data, middle)
  271. start_time = time.time()
  272. anonymize(whole_partition)
  273. rtime = float(time.time() - start_time)
  274. ncp = 0.0
  275. dp = 0.0
  276. for sub_partition in RESULT:
  277. rncp = 0.0
  278. gen_result = sub_partition.middle
  279. rncp = NCP(gen_result)
  280. for i in range(len(sub_partition)):
  281. temp_for_SA = []
  282. for s in range(len(sub_partition.member[i]) - len(SA_INDEX), len(sub_partition.member[i])):
  283. temp_for_SA = temp_for_SA + [sub_partition.member[i][s]]
  284. result.append(gen_result[:] + temp_for_SA)
  285. rncp *= len(sub_partition)
  286. dp += len(sub_partition) ** 2
  287. ncp += rncp
  288. # covert NCP to percentage
  289. ncp /= len(data)
  290. ncp /= QI_LEN
  291. ncp *= 100
  292. if __DEBUG:
  293. from decimal import Decimal
  294. print("Discernability Penalty=%.2E" % Decimal(str(dp)))
  295. print("K=%d" % k)
  296. print("size of partitions")
  297. print(len(RESULT))
  298. print([len(partition) for partition in RESULT])
  299. print("NCP = %.2f %%" % ncp)
  300. print("Total running time = %.2f" % rtime)
  301. return (result, (ncp, rtime))