# -*- coding: utf-8 -*-
# kanon.py
  2. class AnonCheck:
  3. def __init__(self, raw_data, max_gen, gen_strat, allowed_suppressed, k):
  4. """Creates the initial generalization Values
  5. Parameters:
  6. raw_data: list
  7. The data read from a csv file (data[col][row])
  8. max_gen: list
  9. Array of max level of each quasi-identifier (in order)
  10. gen_strat: list
  11. Array of functions to generalize each quasi-identifier (in order)
  12. allowed_suppressed: int
  13. Number of rows that are allowed to be suppressed
  14. k: int
  15. Number that defines what k-Anonymity is desired
  16. """
  17. self.gtime = 0
  18. self.k = k
  19. self.allowed_suppressed = allowed_suppressed
  20. self.raw_rows_count = len(raw_data[0])
  21. self.raw_cols_count = len(raw_data)
  22. self.hier_array = []
  23. self.data_array = []
  24. dict_array = []
  25. self.buffer = [[0 for _ in range(self.raw_cols_count)] for _ in range(self.raw_rows_count)]
  26. self.prev_gen_to_apply = None
  27. self.eq_classes_dict = None
  28. self.is_transformed = True
  29. key_rows = range(self.raw_rows_count)
  30. vals = [1] * self.raw_rows_count
  31. self.dummy_raw_eq_classes = list(zip(vals, key_rows))
  32. for col in range(self.raw_cols_count):
  33. # New list entries for this column
  34. self.hier_array.append([])
  35. self.data_array.append([])
  36. quasi_identifier = raw_data[col]
  37. quasi_identifier_set = set(quasi_identifier)
  38. # Create dictionary of quasi_identifier with index as key
  39. dict_array.append({k: v for k, v in enumerate(quasi_identifier_set)})
  40. # Switch key and value
  41. dict_reverse = dict((v, k) for k, v in dict_array[col].items())
  42. # Create data_array with numerical references to original data in dict_array
  43. for v in quasi_identifier:
  44. self.data_array[col].append(dict_reverse[v])
  45. self.test = []
  46. for r in range(len(self.data_array[0])):
  47. self.test.append([])
  48. for c in range(len(self.data_array)):
  49. self.test[-1].append(self.data_array[c][r])
  50. # Fill first generalization level with the numerical reference values
  51. self.hier_array[col].append(list(dict_array[-1].keys()))
  52. # Iterate generalization levels
  53. for level in range(0, max_gen[col]):
  54. idexes = apply_generalization(quasi_identifier_set, gen_strat[col], level, dict_array[-1])
  55. # Append numerical reference values for this level
  56. self.hier_array[col].append(idexes)
  57. def calculate_kanon(self, node):
  58. gen_to_apply = node.attributes
  59. is_rollup_allowed = False
  60. raw_eq_classes = self.dummy_raw_eq_classes
  61. if self.eq_classes_dict is not None:
  62. prev_level = sum(self.prev_gen_to_apply)
  63. level = sum(gen_to_apply)
  64. if level > prev_level:
  65. is_rollup_allowed = True
  66. for num in range(len(gen_to_apply)):
  67. if gen_to_apply[num] < self.prev_gen_to_apply[num]:
  68. is_rollup_allowed = False
  69. break
  70. if is_rollup_allowed:
  71. self.is_transformed = False
  72. # Roll-up
  73. raw_eq_classes = self.eq_classes_dict.values()
  74. # Projection
  75. cols_to_iterate = range(self.raw_cols_count)
  76. if self.is_transformed or is_rollup_allowed:
  77. if self.prev_gen_to_apply is not None:
  78. cols_to_iterate = [i for i, value in enumerate(gen_to_apply) if value != self.prev_gen_to_apply[i]]
  79. if not is_rollup_allowed:
  80. self.is_transformed = True
  81. self.eq_classes_dict = {}
  82. update = self.eq_classes_dict.update
  83. for val, key_row in raw_eq_classes:
  84. tmp = self.buffer[key_row]
  85. rawr = self.test[key_row]
  86. for col in cols_to_iterate:
  87. tmp[col] = self.hier_array[col][gen_to_apply[col]][rawr[col]]
  88. tup = tuple(tmp)
  89. try:
  90. self.eq_classes_dict[tup][0] += val
  91. except KeyError:
  92. update({tup: [val, key_row]})
  93. self.prev_gen_to_apply = gen_to_apply.copy()
  94. suppressed_count = 0
  95. eq_classes = sorted(list(zip(*self.eq_classes_dict.values()))[0])
  96. eqsum = 0
  97. amount = 0
  98. for v in eq_classes:
  99. if v < self.k:
  100. suppressed_count += v
  101. if not suppressed_count > self.allowed_suppressed:
  102. node.DM_penalty += v * self.raw_rows_count
  103. else:
  104. eqsum += v
  105. amount += 1
  106. node.DM_penalty += v*v
  107. node.DMs_penalty += v*v
  108. if amount == 0:
  109. node.eqclasses = 0
  110. else:
  111. node.eqclasses = eqsum/amount
  112. suppressed_count = 0
  113. for v in eq_classes:
  114. if v < self.k:
  115. if self.allowed_suppressed == 0:
  116. return False
  117. suppressed_count += v
  118. if suppressed_count > self.allowed_suppressed:
  119. return False
  120. else:
  121. return True
  122. return True
  123. def apply_generalization(data, strat, level, dictionary):
  124. tmp_array = []
  125. gen_index = []
  126. count = len(dictionary)-1
  127. for v in data:
  128. if isinstance(strat, list):
  129. args = []
  130. for arg_len in range(1, len(strat)):
  131. args.append(strat[arg_len])
  132. vg = strat[0](v, level, *tuple(args))
  133. else:
  134. vg = strat(v, level)
  135. if isinstance(vg, list):
  136. vg = vg[0]
  137. if vg not in tmp_array:
  138. count += 1
  139. dictionary.update({count: vg})
  140. tmp_array.append(vg)
  141. gen_index.append(count)
  142. else:
  143. for key, val in dictionary.items():
  144. if vg == val:
  145. gen_index.append(key)
  146. break
  147. return gen_index