123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358 |
- # -*- coding: utf-8 -*-
- """
- main module of basic Mondrian
- """
- import pdb
- import time
- from functools import cmp_to_key
- from basic_mondrian.models.numrange import NumRange
- from basic_mondrian.utils.utility import cmp_str
- __DEBUG = False
- QI_LEN = 8
- SA_INDEX = []
- GL_K = 0
- RESULT = []
- ATT_TREES = []
- QI_RANGE = []
- IS_CAT = []
- class Partition(object):
- """Class for Group, which is used to keep records
- Store tree node in instances.
- self.member: records in group
- self.width: width of this partition on each domain. For categoric attribute, it equal
- the number of leaf node, for numeric attribute, it equal to number range
- self.middle: save the generalization result of this partition
- self.allow: 0 donate that not allow to split, 1 donate can be split
- """
- def __init__(self, data, width, middle):
- """
- initialize with data, width and middle
- """
- self.member = list(data)
- self.width = list(width)
- self.middle = list(middle)
- self.allow = [1] * QI_LEN
- def __len__(self):
- """
- return the number of records in partition
- """
- return len(self.member)
- def get_normalized_width(partition, index):
- """
- return Normalized width of partition
- similar to NCP
- """
- if IS_CAT[index] is False:
- low = partition.width[index][0]
- high = partition.width[index][1]
- width = float(ATT_TREES[index].sort_value[high]) - float(ATT_TREES[index].sort_value[low])
- else:
- width = partition.width[index]
- return width * 1.0 / QI_RANGE[index]
- def choose_dimension(partition):
- """
- chooss dim with largest normlized Width
- return dim index.
- """
- max_width = -1
- max_dim = -1
- for i in range(QI_LEN):
- if partition.allow[i] == 0:
- continue
- normWidth = get_normalized_width(partition, i)
- if normWidth > max_width:
- max_width = normWidth
- max_dim = i
- if max_width > 1:
- print("Error: max_width > 1")
- pdb.set_trace()
- if max_dim == -1:
- print("cannot find the max dim")
- pdb.set_trace()
- return max_dim
- def frequency_set(partition, dim):
- """
- get the frequency_set of partition on dim
- return dict{key: str values, values: count}
- """
- frequency = {}
- for record in partition.member:
- try:
- frequency[record[dim]] += 1
- except KeyError:
- frequency[record[dim]] = 1
- return frequency
- def find_median(partition, dim):
- """
- find the middle of the partition
- return splitVal
- """
- frequency = frequency_set(partition, dim)
- splitVal = ''
- value_list = list(frequency)
- value_list.sort(key=cmp_to_key(cmp_str))
- total = sum(frequency.values())
- middle = total / 2
- if middle < GL_K or len(value_list) <= 1:
- return ('', '', value_list[0], value_list[-1])
- index = 0
- split_index = 0
- for i, t in enumerate(value_list):
- index += frequency[t]
- if index >= middle:
- splitVal = t
- split_index = i
- break
- else:
- print("Error: cannot find splitVal")
- try:
- nextVal = value_list[split_index + 1]
- except IndexError:
- nextVal = splitVal
- return (splitVal, nextVal, value_list[0], value_list[-1])
- def split_numerical_value(numeric_value, splitVal):
- """
- split numeric value on splitVal
- return sub ranges
- """
- split_num = numeric_value.split(',')
- if len(split_num) <= 1:
- return split_num[0], split_num[0]
- else:
- low = split_num[0]
- high = split_num[1]
- # Fix 2,2 problem
- if low == splitVal:
- lvalue = low
- else:
- lvalue = low + ',' + splitVal
- if high == splitVal:
- rvalue = high
- else:
- rvalue = splitVal + ',' + high
- return lvalue, rvalue
- def split_numerical(partition, dim, pwidth, pmiddle):
- """
- strict split numeric attribute by finding a median,
- lhs = [low, means], rhs = (mean, high]
- """
- sub_partitions = []
- # numeric attributes
- (splitVal, nextVal, low, high) = find_median(partition, dim)
- #print(ATT_TREES[dim].dict, type(low))
- p_low = ATT_TREES[dim].dict[low]
- p_high = ATT_TREES[dim].dict[high]
- # update middle
- if low == high:
- pmiddle[dim] = low
- else:
- pmiddle[dim] = low + ',' + high
- pwidth[dim] = (p_low, p_high)
- if splitVal == '' or splitVal == nextVal:
- # update middle
- return []
- middle_pos = ATT_TREES[dim].dict[splitVal]
- lmiddle = pmiddle[:]
- rmiddle = pmiddle[:]
- lmiddle[dim], rmiddle[dim] = split_numerical_value(pmiddle[dim], splitVal)
- lhs = []
- rhs = []
- for temp in partition.member:
- pos = ATT_TREES[dim].dict[temp[dim]]
- if pos <= middle_pos:
- # lhs = [low, means]
- lhs.append(temp)
- else:
- # rhs = (mean, high]
- rhs.append(temp)
- lwidth = pwidth[:]
- rwidth = pwidth[:]
- lwidth[dim] = (pwidth[dim][0], middle_pos)
- rwidth[dim] = (ATT_TREES[dim].dict[nextVal], pwidth[dim][1])
- sub_partitions.append(Partition(lhs, lwidth, lmiddle))
- sub_partitions.append(Partition(rhs, rwidth, rmiddle))
- return sub_partitions
- def split_categorical(partition, dim, pwidth, pmiddle):
- """
- split categorical attribute using generalization hierarchy
- """
- sub_partitions = []
- # categoric attributes
- splitVal = ATT_TREES[dim][partition.middle[dim]]
- sub_node = [t for t in splitVal.child]
- sub_groups = []
- for i in range(len(sub_node)):
- sub_groups.append([])
- if len(sub_groups) == 0:
- # split is not necessary
- return []
- for temp in partition.member:
- qid_value = temp[dim]
- for i, node in enumerate(sub_node):
- try:
- node.cover[qid_value]
- sub_groups[i].append(temp)
- break
- except KeyError:
- continue
- else:
- print("Generalization hierarchy error!: " + qid_value)
- flag = True
- for index, sub_group in enumerate(sub_groups):
- if len(sub_group) == 0:
- continue
- if len(sub_group) < GL_K:
- flag = False
- break
- if flag:
- for i, sub_group in enumerate(sub_groups):
- if len(sub_group) == 0:
- continue
- wtemp = pwidth[:]
- mtemp = pmiddle[:]
- wtemp[dim] = len(sub_node[i])
- mtemp[dim] = sub_node[i].value
- sub_partitions.append(Partition(sub_group, wtemp, mtemp))
- return sub_partitions
- def split_partition(partition, dim):
- """
- split partition and distribute records to different sub-partitions
- """
- pwidth = partition.width
- pmiddle = partition.middle
- if IS_CAT[dim] is False:
- return split_numerical(partition, dim, pwidth, pmiddle)
- else:
- return split_categorical(partition, dim, pwidth, pmiddle)
- def anonymize(partition):
- """
- Main procedure of Half_Partition.
- recursively partition groups until not allowable.
- """
- if check_splitable(partition) is False:
- RESULT.append(partition)
- return
- # Choose dim
- dim = choose_dimension(partition)
- if dim == -1:
- print("Error: dim=-1")
- pdb.set_trace()
- sub_partitions = split_partition(partition, dim)
- if len(sub_partitions) == 0:
- partition.allow[dim] = 0
- anonymize(partition)
- else:
- for sub_p in sub_partitions:
- anonymize(sub_p)
- def check_splitable(partition):
- """
- Check if the partition can be further splited while satisfying k-anonymity.
- """
- temp = sum(partition.allow)
- if temp == 0:
- return False
- return True
- def init(att_trees, data, k, QI_num, SA_num):
- """
- reset all global variables
- """
- global GL_K, RESULT, QI_LEN, ATT_TREES, QI_RANGE, IS_CAT, SA_INDEX
- ATT_TREES = att_trees
- IS_CAT = []
- for t in att_trees:
- if isinstance(t, NumRange):
- IS_CAT.append(False)
- else:
- IS_CAT.append(True)
- QI_LEN = QI_num
- SA_INDEX = SA_num
- GL_K = k
- RESULT = []
- QI_RANGE = []
- def mondrian(att_trees, data, k, QI_num, SA_num):
- """
- basic Mondrian for k-anonymity.
- This fuction support both numeric values and categoric values.
- For numeric values, each iterator is a mean split.
- For categoric values, each iterator is a split on GH.
- The final result is returned in 2-dimensional list.
- """
- init(att_trees, data, k, QI_num, SA_num)
- result = []
- middle = []
- wtemp = []
- for i in range(QI_LEN):
- if IS_CAT[i] is False:
- QI_RANGE.append(ATT_TREES[i].range)
- wtemp.append((0, len(ATT_TREES[i].sort_value) - 1))
- middle.append(ATT_TREES[i].value)
- else:
- QI_RANGE.append(len(ATT_TREES[i]['*']))
- wtemp.append(len(ATT_TREES[i]['*']))
- middle.append('*')
- whole_partition = Partition(data, wtemp, middle)
- start_time = time.time()
- anonymize(whole_partition)
- rtime = float(time.time() - start_time)
- ncp = 0.0
- for partition in RESULT:
- r_ncp = 0.0
- for i in range(QI_LEN):
- r_ncp += get_normalized_width(partition, i)
- temp = partition.middle
- for i in range(len(partition)):
- temp_for_SA = []
- for s in range(len(partition.member[i]) - len(SA_INDEX), len(partition.member[i])):
- temp_for_SA = temp_for_SA + [partition.member[i][s]]
- result.append(temp + temp_for_SA)
- r_ncp *= len(partition)
- ncp += r_ncp
- # covert to NCP percentage
- ncp /= QI_LEN
- ncp /= len(data)
- ncp *= 100
- if len(result) != len(data):
- print("Losing records during anonymization!!")
- pdb.set_trace()
- if __DEBUG:
- print("K=%d" % k)
- print("size of partitions")
- print(len(RESULT))
- temp = [len(t) for t in RESULT]
- print(sorted(temp))
- print("NCP = %.2f %%" % ncp)
- return (result, (ncp, rtime))
|