# top_down_greedy_anonymization.py
# -*- coding: utf-8 -*-
"""
Main module of the top-down greedy anonymization algorithm.
"""
import operator
import pdb
import random
import time
from functools import cmp_to_key

from basic_mondrian.models.numrange import NumRange
from basic_mondrian.utils.utility import cmp_str, get_num_list_from_str

__DEBUG = False  # print run statistics when True
QI_LEN = 5       # number of quasi-identifier attributes
GL_K = 0         # the k of k-anonymity for the current run
RESULT = []      # finished (non-splittable) partitions collected by anonymize()
ATT_TREES = []   # per-attribute generalization hierarchy or NumRange
QI_RANGE = []    # per-attribute normalization range used by NCP
ROUNDS = 3       # iterations of the heuristic far-pair search in get_pair()
IS_CAT = []      # IS_CAT[i] is True when attribute i is categorical
SA_INDEX = []    # indices of the sensitive attributes
  21. class Partition(object):
  22. """
  23. Class for Group, which is used to keep records
  24. Store tree node in instances.
  25. self.member: records in group
  26. self.middle: save the generalization result of this partition
  27. """
  28. def __init__(self, data, middle):
  29. """
  30. initialize with data and middle
  31. """
  32. self.can_split = True
  33. self.member = data[:]
  34. self.middle = middle[:]
  35. def __len__(self):
  36. """
  37. return the number of records in partition
  38. """
  39. return len(self.member)
  40. def NCP(record):
  41. """
  42. compute Certainlty Penalty of records
  43. """
  44. record_ncp = 0.0
  45. for i in range(QI_LEN):
  46. if IS_CAT[i] is False:
  47. value_ncp = 0
  48. try:
  49. float(record[i])
  50. except ValueError:
  51. split_number = record[i].split(',')
  52. value_ncp = float(split_number[1]) - float(split_number[0])
  53. value_ncp = value_ncp * 1.0 / QI_RANGE[i]
  54. record_ncp += value_ncp
  55. else:
  56. record_ncp += len(ATT_TREES[i][record[i]]) * 1.0 / QI_RANGE[i]
  57. return record_ncp
  58. def NCP_dis(record1, record2):
  59. """
  60. use the NCP of generalization record1 and record2 as distance
  61. """
  62. mid = middle_record(record1, record2)
  63. return NCP(mid), mid
  64. def NCP_dis_merge(partition, addition_set):
  65. """
  66. merge addition_set to current partition,
  67. update current partition.middle
  68. """
  69. mid = middle_group(addition_set)
  70. mid = middle_record(mid, partition.middle)
  71. return (len(addition_set) + len(partition)) * NCP(mid), mid
  72. def NCP_dis_group(record, partition):
  73. """
  74. compute the NCP of record and partition
  75. """
  76. mid = middle_record(record, partition.middle)
  77. ncp = NCP(mid)
  78. return (1 + len(partition)) * ncp
  79. def middle_record(record1, record2):
  80. """
  81. get the generalization result of record1 and record2
  82. """
  83. mid = []
  84. for i in range(QI_LEN):
  85. if IS_CAT[i] is False:
  86. split_number = []
  87. split_number.extend(get_num_list_from_str(record1[i]))
  88. split_number.extend(get_num_list_from_str(record2[i]))
  89. split_number.sort(key=cmp_to_key(cmp_str))
  90. # avoid 2,2 problem
  91. if split_number[0] == split_number[-1]:
  92. mid.append(split_number[0])
  93. else:
  94. mid.append(split_number[0] + ',' + split_number[-1])
  95. else:
  96. mid.append(LCA(record1[i], record2[i], i))
  97. return mid
  98. def middle_group(group_set):
  99. """
  100. get the generalization result of the group
  101. """
  102. len_group_set = len(group_set)
  103. mid = group_set[0]
  104. for i in range(1, len_group_set):
  105. mid = middle_record(mid, group_set[i])
  106. return mid
  107. def LCA(u, v, index):
  108. """
  109. get lowest common ancestor of u, v on generalization hierarchy (index)
  110. """
  111. gen_tree = ATT_TREES[index]
  112. # don't forget to add themselves (other the level will be higher)
  113. u_parent = list(gen_tree[u].parent)
  114. u_parent.insert(0, gen_tree[u])
  115. v_parent = list(gen_tree[v].parent)
  116. v_parent.insert(0, gen_tree[v])
  117. min_len = min(len(u_parent), len(v_parent))
  118. if min_len == 0:
  119. return '*'
  120. last = -1
  121. for i in range(min_len):
  122. pos = - 1 - i
  123. if u_parent[pos] != v_parent[pos]:
  124. break
  125. last = pos
  126. return u_parent[last].value
  127. def get_pair(partition):
  128. """
  129. To get max distance pair in partition, we need O(n^2) running time.
  130. The author proposed a heuristic method: random pick u and get max_dis(u, v)
  131. with O(n) running tiem; then pick max(v, u2)...after run ROUNDS times.
  132. the dis(u, v) is nearly max.
  133. """
  134. len_partition = len(partition)
  135. for i in range(ROUNDS):
  136. if i == 0:
  137. u = random.randrange(len_partition)
  138. else:
  139. u = v
  140. max_dis = -1
  141. max_index = 0
  142. for i in range(len_partition):
  143. if i != u:
  144. rncp, _ = NCP_dis(partition.member[i], partition.member[u])
  145. if rncp > max_dis:
  146. max_dis = rncp
  147. max_index = i
  148. v = max_index
  149. return (u, v)
  150. def distribute_record(u, v, partition):
  151. """
  152. Distribute records based on NCP distance.
  153. records will be assigned to nearer group.
  154. """
  155. record_u = partition.member[u][:]
  156. record_v = partition.member[v][:]
  157. u_partition = [record_u]
  158. v_partition = [record_v]
  159. remain_records = [item for index, item in enumerate(partition.member) if index not in set([u, v])]
  160. for record in remain_records:
  161. u_dis, _ = NCP_dis(record_u, record)
  162. v_dis, _ = NCP_dis(record_v, record)
  163. if u_dis > v_dis:
  164. v_partition.append(record)
  165. else:
  166. u_partition.append(record)
  167. return [Partition(u_partition, middle_group(u_partition)),
  168. Partition(v_partition, middle_group(v_partition))]
  169. def balance(sub_partitions, index):
  170. """
  171. Two kinds of balance methods.
  172. 1) Move some records from other groups
  173. 2) Merge with nearest group
  174. The algorithm will choose one of them with minimal NCP
  175. index store the sub_partition with less than k records
  176. """
  177. less = sub_partitions.pop(index)
  178. more = sub_partitions.pop()
  179. all_length = len(less) + len(more)
  180. require = GL_K - len(less)
  181. # First method
  182. dist = {}
  183. for i, record in enumerate(more.member):
  184. dist[i], _ = NCP_dis(less.middle, record)
  185. sorted_dist = sorted(dist.items(),
  186. key=operator.itemgetter(1))
  187. nearest_index = [t[0] for t in sorted_dist[:require]]
  188. addition_set = [t for i, t in enumerate(more.member) if i in set(nearest_index)]
  189. remain_set = [t for i, t in enumerate(more.member) if i not in set(nearest_index)]
  190. first_ncp, first_mid = NCP_dis_merge(less, addition_set)
  191. r_middle = middle_group(remain_set)
  192. first_ncp += len(remain_set) * NCP(r_middle)
  193. # Second method
  194. second_ncp, second_mid = NCP_dis(less.middle, more.middle)
  195. second_ncp *= all_length
  196. if first_ncp <= second_ncp:
  197. less.member.extend(addition_set)
  198. less.middle = first_mid
  199. more.member = remain_set
  200. more.middle = r_middle
  201. sub_partitions.append(more)
  202. else:
  203. less.member.extend(more.member)
  204. less.middle = second_mid
  205. less.can_split = False
  206. sub_partitions.append(less)
  207. def can_split(partition):
  208. """
  209. check if partition can be further splited.
  210. """
  211. if partition.can_split is False:
  212. return False
  213. if len(partition) < 2 * GL_K:
  214. return False
  215. return True
  216. def anonymize(partition):
  217. """
  218. Main procedure of top_down_greedy_anonymization.
  219. recursively partition groups until not allowable.
  220. """
  221. if can_split(partition) is False:
  222. RESULT.append(partition)
  223. return
  224. u, v = get_pair(partition)
  225. sub_partitions = distribute_record(u, v, partition)
  226. if len(sub_partitions[0]) < GL_K:
  227. balance(sub_partitions, 0)
  228. elif len(sub_partitions[1]) < GL_K:
  229. balance(sub_partitions, 1)
  230. # watch dog
  231. p_sum = len(partition)
  232. c_sum = 0
  233. for sub_partition in sub_partitions:
  234. c_sum += len(sub_partition)
  235. if p_sum != c_sum:
  236. pdb.set_trace()
  237. for sub_partition in sub_partitions:
  238. anonymize(sub_partition)
  239. def init(att_trees, data, k, QI_num, SA_num):
  240. """
  241. reset all gloabl variables
  242. """
  243. global GL_K, RESULT, QI_LEN, ATT_TREES, QI_RANGE, IS_CAT, SA_INDEX
  244. ATT_TREES = att_trees
  245. for t in att_trees:
  246. if isinstance(t, NumRange):
  247. IS_CAT.append(False)
  248. else:
  249. IS_CAT.append(True)
  250. QI_LEN = QI_num
  251. SA_INDEX = SA_num
  252. GL_K = k
  253. RESULT = []
  254. QI_RANGE = []
  255. def Top_Down_Greedy_Anonymization(att_trees, data, k, QI_num, SA_num):
  256. """
  257. Top Down Greedy Anonymization is a heuristic algorithm
  258. for relational dataset with numeric and categorical attbitues
  259. """
  260. init(att_trees, data, k, QI_num, SA_num)
  261. result = []
  262. middle = []
  263. for i in range(QI_LEN):
  264. if IS_CAT[i] is False:
  265. QI_RANGE.append(ATT_TREES[i].range)
  266. middle.append(ATT_TREES[i].value)
  267. else:
  268. QI_RANGE.append(len(ATT_TREES[i]['*']))
  269. middle.append('*')
  270. whole_partition = Partition(data, middle)
  271. start_time = time.time()
  272. anonymize(whole_partition)
  273. rtime = float(time.time() - start_time)
  274. ncp = 0.0
  275. dp = 0.0
  276. for sub_partition in RESULT:
  277. rncp = 0.0
  278. gen_result = sub_partition.middle
  279. rncp = NCP(gen_result)
  280. for i in range(len(sub_partition)):
  281. temp_for_SA = []
  282. for s in range(len(sub_partition.member[i]) - len(SA_INDEX), len(sub_partition.member[i])):
  283. temp_for_SA = temp_for_SA + [sub_partition.member[i][s]]
  284. result.append(gen_result[:] + temp_for_SA)
  285. rncp *= len(sub_partition)
  286. dp += len(sub_partition) ** 2
  287. ncp += rncp
  288. # covert NCP to percentage
  289. ncp /= len(data)
  290. ncp /= QI_LEN
  291. ncp *= 100
  292. if __DEBUG:
  293. from decimal import Decimal
  294. print("Discernability Penalty=%.2E" % Decimal(str(dp)))
  295. print("K=%d" % k)
  296. print("size of partitions")
  297. print(len(RESULT))
  298. print([len(partition) for partition in RESULT])
  299. print("NCP = %.2f %%" % ncp)
  300. print("Total running time = %.2f" % rtime)
  301. return (result, (ncp, rtime))