mondrian.py 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358
  1. # -*- coding: utf-8 -*-
  2. """
  3. main module of basic Mondrian
  4. """
  5. import pdb
  6. import time
  7. from functools import cmp_to_key
  8. from basic_mondrian.models.numrange import NumRange
  9. from basic_mondrian.utils.utility import cmp_str
  10. __DEBUG = False
  11. QI_LEN = 8
  12. SA_INDEX = []
  13. GL_K = 0
  14. RESULT = []
  15. ATT_TREES = []
  16. QI_RANGE = []
  17. IS_CAT = []
  18. class Partition(object):
  19. """Class for Group, which is used to keep records
  20. Store tree node in instances.
  21. self.member: records in group
  22. self.width: width of this partition on each domain. For categoric attribute, it equal
  23. the number of leaf node, for numeric attribute, it equal to number range
  24. self.middle: save the generalization result of this partition
  25. self.allow: 0 donate that not allow to split, 1 donate can be split
  26. """
  27. def __init__(self, data, width, middle):
  28. """
  29. initialize with data, width and middle
  30. """
  31. self.member = list(data)
  32. self.width = list(width)
  33. self.middle = list(middle)
  34. self.allow = [1] * QI_LEN
  35. def __len__(self):
  36. """
  37. return the number of records in partition
  38. """
  39. return len(self.member)
  40. def get_normalized_width(partition, index):
  41. """
  42. return Normalized width of partition
  43. similar to NCP
  44. """
  45. if IS_CAT[index] is False:
  46. low = partition.width[index][0]
  47. high = partition.width[index][1]
  48. width = float(ATT_TREES[index].sort_value[high]) - float(ATT_TREES[index].sort_value[low])
  49. else:
  50. width = partition.width[index]
  51. return width * 1.0 / QI_RANGE[index]
  52. def choose_dimension(partition):
  53. """
  54. chooss dim with largest normlized Width
  55. return dim index.
  56. """
  57. max_width = -1
  58. max_dim = -1
  59. for i in range(QI_LEN):
  60. if partition.allow[i] == 0:
  61. continue
  62. normWidth = get_normalized_width(partition, i)
  63. if normWidth > max_width:
  64. max_width = normWidth
  65. max_dim = i
  66. if max_width > 1:
  67. print("Error: max_width > 1")
  68. pdb.set_trace()
  69. if max_dim == -1:
  70. print("cannot find the max dim")
  71. pdb.set_trace()
  72. return max_dim
  73. def frequency_set(partition, dim):
  74. """
  75. get the frequency_set of partition on dim
  76. return dict{key: str values, values: count}
  77. """
  78. frequency = {}
  79. for record in partition.member:
  80. try:
  81. frequency[record[dim]] += 1
  82. except KeyError:
  83. frequency[record[dim]] = 1
  84. return frequency
  85. def find_median(partition, dim):
  86. """
  87. find the middle of the partition
  88. return splitVal
  89. """
  90. frequency = frequency_set(partition, dim)
  91. splitVal = ''
  92. value_list = list(frequency)
  93. value_list.sort(key=cmp_to_key(cmp_str))
  94. total = sum(frequency.values())
  95. middle = total / 2
  96. if middle < GL_K or len(value_list) <= 1:
  97. return ('', '', value_list[0], value_list[-1])
  98. index = 0
  99. split_index = 0
  100. for i, t in enumerate(value_list):
  101. index += frequency[t]
  102. if index >= middle:
  103. splitVal = t
  104. split_index = i
  105. break
  106. else:
  107. print("Error: cannot find splitVal")
  108. try:
  109. nextVal = value_list[split_index + 1]
  110. except IndexError:
  111. nextVal = splitVal
  112. return (splitVal, nextVal, value_list[0], value_list[-1])
  113. def split_numerical_value(numeric_value, splitVal):
  114. """
  115. split numeric value on splitVal
  116. return sub ranges
  117. """
  118. split_num = numeric_value.split(',')
  119. if len(split_num) <= 1:
  120. return split_num[0], split_num[0]
  121. else:
  122. low = split_num[0]
  123. high = split_num[1]
  124. # Fix 2,2 problem
  125. if low == splitVal:
  126. lvalue = low
  127. else:
  128. lvalue = low + ',' + splitVal
  129. if high == splitVal:
  130. rvalue = high
  131. else:
  132. rvalue = splitVal + ',' + high
  133. return lvalue, rvalue
  134. def split_numerical(partition, dim, pwidth, pmiddle):
  135. """
  136. strict split numeric attribute by finding a median,
  137. lhs = [low, means], rhs = (mean, high]
  138. """
  139. sub_partitions = []
  140. # numeric attributes
  141. (splitVal, nextVal, low, high) = find_median(partition, dim)
  142. #print(ATT_TREES[dim].dict, type(low))
  143. p_low = ATT_TREES[dim].dict[low]
  144. p_high = ATT_TREES[dim].dict[high]
  145. # update middle
  146. if low == high:
  147. pmiddle[dim] = low
  148. else:
  149. pmiddle[dim] = low + ',' + high
  150. pwidth[dim] = (p_low, p_high)
  151. if splitVal == '' or splitVal == nextVal:
  152. # update middle
  153. return []
  154. middle_pos = ATT_TREES[dim].dict[splitVal]
  155. lmiddle = pmiddle[:]
  156. rmiddle = pmiddle[:]
  157. lmiddle[dim], rmiddle[dim] = split_numerical_value(pmiddle[dim], splitVal)
  158. lhs = []
  159. rhs = []
  160. for temp in partition.member:
  161. pos = ATT_TREES[dim].dict[temp[dim]]
  162. if pos <= middle_pos:
  163. # lhs = [low, means]
  164. lhs.append(temp)
  165. else:
  166. # rhs = (mean, high]
  167. rhs.append(temp)
  168. lwidth = pwidth[:]
  169. rwidth = pwidth[:]
  170. lwidth[dim] = (pwidth[dim][0], middle_pos)
  171. rwidth[dim] = (ATT_TREES[dim].dict[nextVal], pwidth[dim][1])
  172. sub_partitions.append(Partition(lhs, lwidth, lmiddle))
  173. sub_partitions.append(Partition(rhs, rwidth, rmiddle))
  174. return sub_partitions
  175. def split_categorical(partition, dim, pwidth, pmiddle):
  176. """
  177. split categorical attribute using generalization hierarchy
  178. """
  179. sub_partitions = []
  180. # categoric attributes
  181. splitVal = ATT_TREES[dim][partition.middle[dim]]
  182. sub_node = [t for t in splitVal.child]
  183. sub_groups = []
  184. for i in range(len(sub_node)):
  185. sub_groups.append([])
  186. if len(sub_groups) == 0:
  187. # split is not necessary
  188. return []
  189. for temp in partition.member:
  190. qid_value = temp[dim]
  191. for i, node in enumerate(sub_node):
  192. try:
  193. node.cover[qid_value]
  194. sub_groups[i].append(temp)
  195. break
  196. except KeyError:
  197. continue
  198. else:
  199. print("Generalization hierarchy error!: " + qid_value)
  200. flag = True
  201. for index, sub_group in enumerate(sub_groups):
  202. if len(sub_group) == 0:
  203. continue
  204. if len(sub_group) < GL_K:
  205. flag = False
  206. break
  207. if flag:
  208. for i, sub_group in enumerate(sub_groups):
  209. if len(sub_group) == 0:
  210. continue
  211. wtemp = pwidth[:]
  212. mtemp = pmiddle[:]
  213. wtemp[dim] = len(sub_node[i])
  214. mtemp[dim] = sub_node[i].value
  215. sub_partitions.append(Partition(sub_group, wtemp, mtemp))
  216. return sub_partitions
  217. def split_partition(partition, dim):
  218. """
  219. split partition and distribute records to different sub-partitions
  220. """
  221. pwidth = partition.width
  222. pmiddle = partition.middle
  223. if IS_CAT[dim] is False:
  224. return split_numerical(partition, dim, pwidth, pmiddle)
  225. else:
  226. return split_categorical(partition, dim, pwidth, pmiddle)
  227. def anonymize(partition):
  228. """
  229. Main procedure of Half_Partition.
  230. recursively partition groups until not allowable.
  231. """
  232. if check_splitable(partition) is False:
  233. RESULT.append(partition)
  234. return
  235. # Choose dim
  236. dim = choose_dimension(partition)
  237. if dim == -1:
  238. print("Error: dim=-1")
  239. pdb.set_trace()
  240. sub_partitions = split_partition(partition, dim)
  241. if len(sub_partitions) == 0:
  242. partition.allow[dim] = 0
  243. anonymize(partition)
  244. else:
  245. for sub_p in sub_partitions:
  246. anonymize(sub_p)
  247. def check_splitable(partition):
  248. """
  249. Check if the partition can be further splited while satisfying k-anonymity.
  250. """
  251. temp = sum(partition.allow)
  252. if temp == 0:
  253. return False
  254. return True
  255. def init(att_trees, data, k, QI_num, SA_num):
  256. """
  257. reset all global variables
  258. """
  259. global GL_K, RESULT, QI_LEN, ATT_TREES, QI_RANGE, IS_CAT, SA_INDEX
  260. ATT_TREES = att_trees
  261. IS_CAT = []
  262. for t in att_trees:
  263. if isinstance(t, NumRange):
  264. IS_CAT.append(False)
  265. else:
  266. IS_CAT.append(True)
  267. QI_LEN = QI_num
  268. SA_INDEX = SA_num
  269. GL_K = k
  270. RESULT = []
  271. QI_RANGE = []
  272. def mondrian(att_trees, data, k, QI_num, SA_num):
  273. """
  274. basic Mondrian for k-anonymity.
  275. This fuction support both numeric values and categoric values.
  276. For numeric values, each iterator is a mean split.
  277. For categoric values, each iterator is a split on GH.
  278. The final result is returned in 2-dimensional list.
  279. """
  280. init(att_trees, data, k, QI_num, SA_num)
  281. result = []
  282. middle = []
  283. wtemp = []
  284. for i in range(QI_LEN):
  285. if IS_CAT[i] is False:
  286. QI_RANGE.append(ATT_TREES[i].range)
  287. wtemp.append((0, len(ATT_TREES[i].sort_value) - 1))
  288. middle.append(ATT_TREES[i].value)
  289. else:
  290. QI_RANGE.append(len(ATT_TREES[i]['*']))
  291. wtemp.append(len(ATT_TREES[i]['*']))
  292. middle.append('*')
  293. whole_partition = Partition(data, wtemp, middle)
  294. start_time = time.time()
  295. anonymize(whole_partition)
  296. rtime = float(time.time() - start_time)
  297. ncp = 0.0
  298. for partition in RESULT:
  299. r_ncp = 0.0
  300. for i in range(QI_LEN):
  301. r_ncp += get_normalized_width(partition, i)
  302. temp = partition.middle
  303. for i in range(len(partition)):
  304. temp_for_SA = []
  305. for s in range(len(partition.member[i]) - len(SA_INDEX), len(partition.member[i])):
  306. temp_for_SA = temp_for_SA + [partition.member[i][s]]
  307. result.append(temp + temp_for_SA)
  308. r_ncp *= len(partition)
  309. ncp += r_ncp
  310. # covert to NCP percentage
  311. ncp /= QI_LEN
  312. ncp /= len(data)
  313. ncp *= 100
  314. if len(result) != len(data):
  315. print("Losing records during anonymization!!")
  316. pdb.set_trace()
  317. if __DEBUG:
  318. print("K=%d" % k)
  319. print("size of partitions")
  320. print(len(RESULT))
  321. temp = [len(t) for t in RESULT]
  322. print(sorted(temp))
  323. print("NCP = %.2f %%" % ncp)
  324. return (result, (ncp, rtime))