main.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import csv
  4. import math
  5. import os
  6. import sys
  7. from elemam.algorithm import calculate, sort
  8. from elemam.kanon import AnonCheck
  9. from elemam.node import Node
  10. sys.path.insert(1, os.path.join(sys.path[0], '..'))
  11. from utils.data import transform_columns
  12. def create_anon_data(best_nodes, raw_data, qi_index, gen_strat, kanon, res_folder):
  13. """Create anonymized data in 2d list format
  14. :param qi_index: List of QI column indexes
  15. :param best_node: Node that is used to anonymized the data
  16. :param raw_data: Raw data read from the csv file
  17. :param gen_strat: List containing the generalization strategies for the QI
  18. :return: 2d list: anon_data[row][col]
  19. """
  20. anon_data_list = {}
  21. for node in best_nodes:
  22. anon_data = [[0 for _ in range(len(raw_data))] for _ in range(len(raw_data[0]))]
  23. for row in range(len(raw_data[0])):
  24. gen_strat_iter = iter(gen_strat)
  25. attributes_iter = iter(node.attributes)
  26. for col in range(len(raw_data)):
  27. raw_value = raw_data[col][row]
  28. if col in qi_index:
  29. attribute = next(attributes_iter)
  30. if attribute == 0:
  31. next(gen_strat_iter)
  32. anon_data[row][col] = raw_value
  33. continue
  34. strat = next(gen_strat_iter)
  35. level = attribute - 1
  36. # Generate the anonymized value
  37. if isinstance(strat, list):
  38. args = []
  39. for arg_len in range(1, len(strat)):
  40. args.append(strat[arg_len])
  41. vg = strat[0](raw_value, level, *tuple(args))
  42. else:
  43. vg = strat(raw_value, level)
  44. else:
  45. vg = [raw_value]
  46. anon_data[row][col] = vg[0]
  47. eq_classes_dict = {}
  48. for row in range(len(raw_data[0])):
  49. key = tuple(anon_data[row][col] for col in qi_index)
  50. try:
  51. eq_classes_dict[key][0] += 1
  52. eq_classes_dict[key][1].append(row)
  53. except KeyError:
  54. eq_classes_dict.update({key: [1, [row]]})
  55. suppressed_count = 0
  56. suppressed_rows = []
  57. for vals in eq_classes_dict.values():
  58. if vals[0] < kanon:
  59. suppressed_count += vals[0]
  60. for row in vals[1]:
  61. suppressed_rows.append(row)
  62. for col in qi_index:
  63. anon_data[row][col] = "*"
  64. print("Suppressed: " + str(suppressed_count) + " rows")
  65. writer = csv.writer(open(os.path.join(res_folder, "supprarray.csv"), "a+"))
  66. writer.writerow(str(suppressed_count))
  67. anon_data_list.update({tuple(node.attributes): anon_data})
  68. return anon_data_list
  69. def main(raw_data, kanon, gen_strat, max_gen_level, qi_index, metric, res_folder, suppression_rate=0):
  70. raw_data = transform_columns(raw_data)
  71. qi_data = [raw_data[i] for i in qi_index]
  72. # Get number of QI
  73. quasi_ident_count = len(qi_index)
  74. # Define the suppression limit
  75. allowed_suppressed = int(len(qi_data[0]) * (suppression_rate / 100))
  76. # Create object for checking anonymity
  77. ac = AnonCheck(qi_data, max_gen_level, gen_strat, allowed_suppressed, kanon)
  78. node_array = []
  79. level_nodes = {}
  80. # Creating Node-structure starting with root-node
  81. rootnode = Node([0] * quasi_ident_count, node_array, max_gen_level, level_nodes)
  82. # Sorting list by height in the generalization lattice
  83. sorted_array = sort(node_array)
  84. # Evaluate generalization hierarchy with the OLA(ELEmam) algorithm
  85. # Returns lowest Nodes in a generalization path
  86. min_k = calculate(sorted_array, ac)
  87. # Calculation the best Node by the percision metric
  88. prec = 1
  89. lvl = 999999999
  90. eqcount = 9999999
  91. penalty = None
  92. loss = 999999999
  93. best_nodes = []
  94. if metric == "ent":
  95. test = create_anon_data(min_k, raw_data, qi_index, gen_strat, kanon, res_folder)
  96. for node in min_k:
  97. if metric == "prec":
  98. if node.prec < prec:
  99. prec = node.prec
  100. best_nodes = [node]
  101. elif metric == "gweight":
  102. if node.level < lvl:
  103. lvl = node.level
  104. best_nodes = [node]
  105. elif metric == "aecs":
  106. if node.eqclasses == eqcount:
  107. print("Bad")
  108. if node.eqclasses != 0 and node.eqclasses < eqcount:
  109. eqcount = node.eqclasses
  110. best_nodes = [node]
  111. elif metric == "dm":
  112. if penalty is None or node.DM_penalty < penalty:
  113. penalty = node.DM_penalty
  114. best_nodes = [node]
  115. elif metric == "dms":
  116. if node.DMs_penalty < penalty:
  117. penalty = node.DMs_penalty
  118. best_nodes = [node]
  119. elif metric == "ent":
  120. print("Metric ENT")
  121. new_loss = 0
  122. dictarray_r = []
  123. dictarray_g = []
  124. for col in range(len(qi_data)):
  125. d_r = {}
  126. d_g = {}
  127. for row in range(len(qi_data[0])):
  128. i_r = qi_data[col][row]
  129. i_g = test[tuple(node.attributes)][row][qi_index[col]]
  130. if i_r in d_r:
  131. d_r[i_r] += 1
  132. else:
  133. d_r.update({i_r: 1})
  134. if i_g in d_g:
  135. d_g[i_g] += 1
  136. else:
  137. d_g.update({i_g: 1})
  138. dictarray_r.append(d_r)
  139. dictarray_g.append(d_g)
  140. for k in range(len(qi_data[0])):
  141. for j in range(len(qi_data)):
  142. r_val = qi_data[j][k]
  143. g_val = test[tuple(node.attributes)][k][qi_index[j]]
  144. r_count = dictarray_r[j][r_val]
  145. g_count = dictarray_g[j][g_val]
  146. new_loss += math.log(r_count / g_count, 2)
  147. if new_loss < loss:
  148. loss = new_loss
  149. best_nodes = [node]
  150. # Print best generalisation
  151. print(best_nodes[0].attributes)
  152. with open(os.path.join(res_folder, 'genarray.csv'), 'a+') as gen_file:
  153. writer = csv.writer(gen_file)
  154. writer.writerow(best_nodes[0].attributes)
  155. if metric == "none":
  156. best_nodes = min_k
  157. return create_anon_data(best_nodes, raw_data, qi_index, gen_strat, kanon, res_folder), best_nodes[0].attributes