# anonymization.py
import argparse
import json
import os
from datetime import datetime, timezone

import numpy as np
import pandas as pd

from basic_mondrian.anonymizer import get_result_one
from basic_mondrian.utils.read_adult_data import read_tree
from clustering_based.anonymizer import get_result_one as cb_get_result_one
from datasets.categorical import DATASET_ATTRIBUTES_DICT
from utils.data import write_anon, read_raw_fromdf
from utils.types import AnonMethod
  13. def main(conf):
  14. anon_method = conf['algo_conf']['algo']
  15. k = conf['algo_conf']['k']
  16. dataset = conf['dataset']
  17. batch_size = conf['dataset_conf']['batch_size']
  18. attrs = conf['dataset_conf']['attrs']
  19. # define necessary paths
  20. # Data path
  21. path = os.path.join('datasets', dataset, '') # trailing /
  22. # Dataset path
  23. data_path = os.path.join(path, f'{dataset}.csv')
  24. # Generalization hierarchies path
  25. gen_path = os.path.join('generalization', 'hierarchies', dataset, '') # trailing /
  26. # folder for all results
  27. res_folder = os.path.join('results', dataset, anon_method, str(batch_size), str(k),
  28. datetime.utcnow().isoformat().replace(':', '_'))
  29. # path for anonymized datasets
  30. anon_folder = os.path.join(res_folder, 'anon_dataset', '') # trailing /
  31. # path for pickled numeric values
  32. numeric_folder = os.path.join(res_folder, 'numeric')
  33. # create path needed for results recursively
  34. os.makedirs(anon_folder)
  35. os.makedirs(numeric_folder)
  36. # reading in the data
  37. data = pd.read_csv(data_path, delimiter=';')
  38. if dataset == "adult":
  39. data = data.drop(labels=["ID"], axis="columns")
  40. print(
  41. 'Original Data: ' + str(data.shape[0]) + ' entries, ' + str(data.shape[1]) + ' attributes')
  42. # Get batch
  43. if batch_size > 0:
  44. np.random.seed(30)
  45. rand_idx = np.random.randint(data.shape[0], size=batch_size)
  46. data = data.iloc[rand_idx, :]
  47. # Get the specific columns
  48. if attrs:
  49. data = data[attrs]
  50. print(f'Batch to process: {data.shape[0]} users, {data.shape[1]} attributes')
  51. ATT_NAMES = list(data.columns)
  52. ATTRIBUTES_DICT = DATASET_ATTRIBUTES_DICT[dataset]
  53. QI_INDEX = [i for i,attr in enumerate(ATT_NAMES) if ATTRIBUTES_DICT[attr][0]]
  54. IS_CAT = [ATTRIBUTES_DICT[ATT_NAMES[idx]][1] for idx in QI_INDEX]
  55. SA_INDEX = [index for index in range(len(ATT_NAMES)) if index not in QI_INDEX]
  56. QI_NAMES = list(np.array(ATT_NAMES)[QI_INDEX])
  57. SA_var = [ATT_NAMES[i] for i in SA_INDEX]
  58. raw_data, header = read_raw_fromdf(data, numeric_folder, dataset, QI_INDEX, IS_CAT)
  59. #raw_data, header = read_raw(path, numeric_folder, dataset, QI_INDEX, IS_CAT)
  60. ATT_TREES = read_tree(gen_path, numeric_folder, dataset, ATT_NAMES, QI_INDEX, IS_CAT)
  61. anon_data, data_util, run_time = None, None, None
  62. s = 0
  63. s_folder = os.path.join(anon_folder, 's_' + str(s))
  64. os.mkdir(s_folder)
  65. rnd = 36
  66. np.random.seed(rnd)
  67. if anon_method == AnonMethod.MONDRIAN:
  68. anon_data, data_util, run_time, = get_result_one(ATT_TREES, raw_data, k, path, QI_INDEX, SA_INDEX)
  69. elif anon_method == 'oka':
  70. anon_data, data_util, run_time, = cb_get_result_one(ATT_TREES, raw_data, k, path, QI_INDEX, SA_INDEX, 'oka')
  71. elif anon_method == 'kmember':
  72. anon_data, data_util, run_time, = cb_get_result_one(ATT_TREES, raw_data, k, path, QI_INDEX, SA_INDEX, 'kmember')
  73. elif anon_method == 'knn':
  74. anon_data, data_util, run_time, = cb_get_result_one(ATT_TREES, raw_data, k, path, QI_INDEX,
  75. SA_INDEX, 'knn')
  76. # Write anonymized data in csv file
  77. nodes_count = write_anon(s_folder, anon_data, header, k, s, dataset)
  78. res = f"{anon_method},{batch_size},{k},{len(QI_INDEX)},{data_util},{run_time}"
  79. return res
  80. if __name__ == "__main__":
  81. parser = argparse.ArgumentParser()
  82. parser.add_argument('filename')
  83. args = parser.parse_args()
  84. with open(args.filename) as f:
  85. conf_list = json.load(f)
  86. results = []
  87. for k in conf_list['algo_conf']['k']:
  88. if k < 2:
  89. print("invalid k value")
  90. exit(1)
  91. for algo in conf_list['algo_conf']['algo']:
  92. for batch_size in conf_list['dataset_conf']['batch_size']:
  93. for attrs in conf_list['dataset_conf']['attrs']:
  94. conf = {
  95. "dataset": conf_list['dataset'],
  96. "algo_conf": {
  97. "k": k,
  98. "algo": algo
  99. },
  100. "dataset_conf": {
  101. "batch_size": batch_size,
  102. "attrs": attrs,
  103. }
  104. }
  105. res = main(conf)
  106. results.append(res)
  107. print(f"Algorithm,BatchSize,k,n_att,ncp,run_time")
  108. for res in results:
  109. print(res)