# -*- coding: utf-8 -*-
"""
Main module of the top-down greedy anonymization algorithm.
"""

import operator
import pdb
import random
import time
from functools import cmp_to_key

from basic_mondrian.models.numrange import NumRange
from basic_mondrian.utils.utility import cmp_str, get_num_list_from_str

__DEBUG = False
QI_LEN = 5      # number of quasi-identifier (QI) attributes
GL_K = 0        # the k of k-anonymity
RESULT = []     # anonymized partitions collected during the run
ATT_TREES = []  # generalization hierarchy (categorical) or NumRange (numeric) per attribute
QI_RANGE = []   # numeric range or leaf count of each QI, used to normalize NCP
ROUNDS = 3      # iterations of the seed-picking heuristic in get_pair()
IS_CAT = []     # IS_CAT[i] is True if the i-th QI is categorical
SA_INDEX = []   # indexes of the sensitive attributes


class Partition(object):
    """
    Group of records that share one generalization.
    self.member: records in the group
    self.middle: the generalized (anonymized) representation of the group
    self.can_split: False once the group may no longer be divided
    """

    def __init__(self, data, middle):
        """
        Initialize with a list of records and their generalization.
        """
        self.can_split = True
        self.member = data[:]
        self.middle = middle[:]

    def __len__(self):
        """
        Return the number of records in the partition.
        """
        return len(self.member)
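

# Record layout assumed throughout this module (the concrete values below are
# purely illustrative): each record is a list of strings whose first QI_LEN
# entries are quasi-identifier values ('39', a generalized interval such as
# '30,40' for numeric attributes, or a hierarchy node value for categorical
# ones), followed by the sensitive attribute values, e.g.
#
#   ['39', 'Private', 'Bachelors', 'White', '<=50K']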


def NCP(record):
    """
    Compute the Normalized Certainty Penalty (NCP) of a generalized record.
    """
    record_ncp = 0.0
    for i in range(QI_LEN):
        if IS_CAT[i] is False:
            value_ncp = 0
            try:
                # a single number has not been generalized, so it costs nothing
                float(record[i])
            except ValueError:
                # an interval 'low,high' costs its width relative to the QI range
                split_number = record[i].split(',')
                value_ncp = float(split_number[1]) - float(split_number[0])
            value_ncp = value_ncp * 1.0 / QI_RANGE[i]
            record_ncp += value_ncp
        else:
            # categorical cost: leaves covered by the node, relative to all leaves
            record_ncp += len(ATT_TREES[i][record[i]]) * 1.0 / QI_RANGE[i]
    return record_ncp
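

# Worked example (hypothetical numbers, not tied to any dataset): for a numeric
# QI with QI_RANGE[i] = 50, the generalized interval '30,40' contributes
# (40 - 30) / 50 = 0.2 to the record's NCP, while an un-generalized value such
# as '35' parses as a float and contributes 0. For a categorical QI whose
# hierarchy has 10 leaf values (QI_RANGE[i] = 10), a node covering 4 of them
# contributes 4 / 10 = 0.4.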


def NCP_dis(record1, record2):
    """
    Use the NCP of the generalization of record1 and record2 as their distance.
    """
    mid = middle_record(record1, record2)
    return NCP(mid), mid


def NCP_dis_merge(partition, addition_set):
    """
    Compute the cost of merging addition_set into partition:
    the NCP of the merged generalization weighted by the merged size.
    """
    mid = middle_group(addition_set)
    mid = middle_record(mid, partition.middle)
    return (len(addition_set) + len(partition)) * NCP(mid), mid


def NCP_dis_group(record, partition):
    """
    Compute the NCP cost of adding a record to a partition.
    """
    mid = middle_record(record, partition.middle)
    ncp = NCP(mid)
    return (1 + len(partition)) * ncp


def middle_record(record1, record2):
    """
    Return the generalization of record1 and record2.
    """
    mid = []
    for i in range(QI_LEN):
        if IS_CAT[i] is False:
            split_number = []
            split_number.extend(get_num_list_from_str(record1[i]))
            split_number.extend(get_num_list_from_str(record2[i]))
            split_number.sort(key=cmp_to_key(cmp_str))
            # collapse equal endpoints so we emit '2' instead of '2,2'
            if split_number[0] == split_number[-1]:
                mid.append(split_number[0])
            else:
                mid.append(split_number[0] + ',' + split_number[-1])
        else:
            mid.append(LCA(record1[i], record2[i], i))
    return mid
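

# Example (hypothetical values): merging '25' with '32,38' on a numeric QI
# yields '25,38'; merging '30' with '30' yields just '30'. Categorical values
# are replaced by their lowest common ancestor in the hierarchy.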


def middle_group(group_set):
    """
    Get the generalization result of the whole group.
    """
    len_group_set = len(group_set)
    mid = group_set[0]
    for i in range(1, len_group_set):
        mid = middle_record(mid, group_set[i])
    return mid


def LCA(u, v, index):
    """
    Return the lowest common ancestor of u and v in the
    generalization hierarchy of attribute `index`.
    """
    gen_tree = ATT_TREES[index]
    # include the nodes themselves, otherwise the result is one level too high
    u_parent = list(gen_tree[u].parent)
    u_parent.insert(0, gen_tree[u])
    v_parent = list(gen_tree[v].parent)
    v_parent.insert(0, gen_tree[v])
    min_len = min(len(u_parent), len(v_parent))
    if min_len == 0:
        return '*'
    last = -1
    # walk down from the root until the two ancestor chains diverge
    for i in range(min_len):
        pos = - 1 - i
        if u_parent[pos] != v_parent[pos]:
            break
        last = pos
    return u_parent[last].value
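

# Example (hypothetical hierarchy): with leaves 'France' and 'Spain' under
# 'Europe', and 'Europe' under the root '*', LCA('France', 'Spain', i) returns
# 'Europe', while LCA('France', 'Japan', i) with 'Japan' under 'Asia' climbs to
# the root and returns '*'.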


def get_pair(partition):
    """
    Pick a pair of seed records that are far apart.
    Finding the true maximum-distance pair takes O(n^2) time, so the authors
    use a heuristic: pick u at random, find the record v farthest from u in
    O(n) time, then repeat with v as the new seed. After ROUNDS iterations
    dis(u, v) is close to the maximum.
    """
    len_partition = len(partition)
    for i in range(ROUNDS):
        if i == 0:
            u = random.randrange(len_partition)
        else:
            u = v
        max_dis = -1
        max_index = 0
        for j in range(len_partition):
            if j != u:
                rncp, _ = NCP_dis(partition.member[j], partition.member[u])
                if rncp > max_dis:
                    max_dis = rncp
                    max_index = j
        v = max_index
    return (u, v)


def distribute_record(u, v, partition):
    """
    Distribute the records of partition around the seeds u and v:
    each record is assigned to the group whose seed is nearer (by NCP).
    """
    record_u = partition.member[u][:]
    record_v = partition.member[v][:]
    u_partition = [record_u]
    v_partition = [record_v]
    remain_records = [item for index, item in enumerate(partition.member)
                      if index not in set([u, v])]
    for record in remain_records:
        u_dis, _ = NCP_dis(record_u, record)
        v_dis, _ = NCP_dis(record_v, record)
        if u_dis > v_dis:
            v_partition.append(record)
        else:
            u_partition.append(record)
    return [Partition(u_partition, middle_group(u_partition)),
            Partition(v_partition, middle_group(v_partition))]


def balance(sub_partitions, index):
    """
    Rebalance a sub-partition that ended up with fewer than k records
    (the one at position `index`). Two strategies are compared and the one
    with the smaller NCP is applied:
    1) move the nearest records over from the other group;
    2) merge the two groups into one (which then cannot be split again).
    """
    less = sub_partitions.pop(index)
    more = sub_partitions.pop()
    all_length = len(less) + len(more)
    require = GL_K - len(less)
    # First strategy: move the `require` records closest to the small group
    dist = {}
    for i, record in enumerate(more.member):
        dist[i], _ = NCP_dis(less.middle, record)
    sorted_dist = sorted(dist.items(), key=operator.itemgetter(1))
    nearest_index = [t[0] for t in sorted_dist[:require]]
    addition_set = [t for i, t in enumerate(more.member)
                    if i in set(nearest_index)]
    remain_set = [t for i, t in enumerate(more.member)
                  if i not in set(nearest_index)]
    first_ncp, first_mid = NCP_dis_merge(less, addition_set)
    r_middle = middle_group(remain_set)
    first_ncp += len(remain_set) * NCP(r_middle)
    # Second strategy: merge both groups
    second_ncp, second_mid = NCP_dis(less.middle, more.middle)
    second_ncp *= all_length
    if first_ncp <= second_ncp:
        less.member.extend(addition_set)
        less.middle = first_mid
        more.member = remain_set
        more.middle = r_middle
        sub_partitions.append(more)
    else:
        less.member.extend(more.member)
        less.middle = second_mid
        less.can_split = False
    sub_partitions.append(less)
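

# Illustrative comparison (hypothetical numbers): if moving the `require`
# nearest records costs first_ncp = 3.2 while merging the two groups outright
# costs second_ncp = 4.8, the records are moved; otherwise the merged group is
# kept and flagged as no longer splittable.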


def can_split(partition):
    """
    Check whether the partition can be split any further.
    """
    if partition.can_split is False:
        return False
    if len(partition) < 2 * GL_K:
        return False
    return True


def anonymize(partition):
    """
    Main procedure of top-down greedy anonymization:
    recursively split partitions until no further split is allowed.
    """
    if can_split(partition) is False:
        RESULT.append(partition)
        return
    u, v = get_pair(partition)
    sub_partitions = distribute_record(u, v, partition)
    if len(sub_partitions[0]) < GL_K:
        balance(sub_partitions, 0)
    elif len(sub_partitions[1]) < GL_K:
        balance(sub_partitions, 1)
    # watch dog: the split must neither lose nor duplicate records
    p_sum = len(partition)
    c_sum = 0
    for sub_partition in sub_partitions:
        c_sum += len(sub_partition)
    if p_sum != c_sum:
        pdb.set_trace()
    for sub_partition in sub_partitions:
        anonymize(sub_partition)


def init(att_trees, data, k, QI_num, SA_num):
    """
    Reset all global variables.
    """
    global GL_K, RESULT, QI_LEN, ATT_TREES, QI_RANGE, IS_CAT, SA_INDEX
    ATT_TREES = att_trees
    # rebuild IS_CAT from scratch so repeated calls do not accumulate entries
    IS_CAT = []
    for t in att_trees:
        if isinstance(t, NumRange):
            IS_CAT.append(False)
        else:
            IS_CAT.append(True)
    QI_LEN = QI_num
    SA_INDEX = SA_num
    GL_K = k
    RESULT = []
    QI_RANGE = []


def Top_Down_Greedy_Anonymization(att_trees, data, k, QI_num, SA_num):
    """
    Top-down greedy anonymization: a heuristic k-anonymization algorithm
    for relational datasets with numeric and categorical attributes.
    """
    init(att_trees, data, k, QI_num, SA_num)
    result = []
    middle = []
    for i in range(QI_LEN):
        if IS_CAT[i] is False:
            QI_RANGE.append(ATT_TREES[i].range)
            middle.append(ATT_TREES[i].value)
        else:
            QI_RANGE.append(len(ATT_TREES[i]['*']))
            middle.append('*')
    whole_partition = Partition(data, middle)
    start_time = time.time()
    anonymize(whole_partition)
    rtime = float(time.time() - start_time)
    ncp = 0.0
    dp = 0.0
    for sub_partition in RESULT:
        gen_result = sub_partition.middle
        rncp = NCP(gen_result)
        # emit one generalized record per member, keeping its sensitive values
        for i in range(len(sub_partition)):
            temp_for_SA = []
            for s in range(len(sub_partition.member[i]) - len(SA_INDEX),
                           len(sub_partition.member[i])):
                temp_for_SA = temp_for_SA + [sub_partition.member[i][s]]
            result.append(gen_result[:] + temp_for_SA)
        rncp *= len(sub_partition)
        dp += len(sub_partition) ** 2
        ncp += rncp
    # convert NCP to a percentage
    ncp /= len(data)
    ncp /= QI_LEN
    ncp *= 100
    if __DEBUG:
        from decimal import Decimal
        print("Discernability Penalty=%.2E" % Decimal(str(dp)))
        print("K=%d" % k)
        print("size of partitions")
        print(len(RESULT))
        print([len(partition) for partition in RESULT])
        print("NCP = %.2f %%" % ncp)
        print("Total running time = %.2f" % rtime)
    return (result, (ncp, rtime))
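

# A minimal usage sketch (illustrative only; the loader names below are
# assumptions about the surrounding project, not functions defined here):
#
#   att_trees = load_hierarchies()   # NumRange per numeric QI, value->node dict per categorical QI
#   data = load_records()            # list of records: QI values first, sensitive values last
#   result, (ncp, rtime) = Top_Down_Greedy_Anonymization(
#       att_trees, data, k=10, QI_num=5, SA_num=[-1])
#   # `result` holds the generalized records, `ncp` the information loss as a
#   # percentage, and `rtime` the running time in seconds.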