# -*- coding: utf-8 -*-
# Source repository: long.nguyen/thesis-medical-data-collection

import math
from generalization.hierarchy_utilities import read_gen_hierarchy


def hierarchy(path: str, qi_name: str) -> list:
    """Build the generalization entry for a hierarchy-based QI.

    The hierarchy is loaded through ``read_gen_hierarchy``; the file is
    expected to be named ``gen_hier_<qi_name>.csv``.

    :param path: Path to where the hierarchy file is stored
    :param qi_name: Name of the QI (header in dataset)
    :return: list: Two-element list holding the ``substitution`` function and
             the loaded hierarchy that is meant to be passed to it
    """
    loaded_levels = read_gen_hierarchy(path, qi_name)
    return [substitution, loaded_levels]


def age(data, level):
    """Generalize age values using the fixed age segmentation scheme.

    :param data: list or string: Data containing one or more age values
    :param level: int: Generalization step to which the data should be transformed
    :return: List of generalized age data
    """
    # Ages are grouped over the range 1..100 in widths of 5, 10 and 20;
    # the final level replaces every value with "*".
    lowest_age, highest_age = 1, 100
    widths_per_level = [5, 10, 20, "*"]
    return segmentation(data, level, lowest_age, highest_age, widths_per_level)


def segmentation(data, level, min_num, max_num, div_list):
    """Transforms numerical data to a segmented state ("<begin>-<end>").

    The numeric range starting at ``min_num`` is cut into consecutive
    segments of width ``div_list[level]``. Every input value is mapped to
    the label of the segment it falls into, so the result always has exactly
    one entry per input value.

    Parameters:
        data: list, range or single value
            Data containing one or more numerical values (anything ``int()``
            accepts)
        level: int
            Generalization step to which the data should be transformed
        min_num: int
            Start of numeric range
        max_num: int
            End of numeric range
        div_list: list
            Segment width for each generalization step. The last entry may be
            a non-integer (e.g. "*"), in which case every value is replaced
            by that placeholder.
    Returns:
        List of generalized numerical data (strings)
    """
    # Normalize the input to a list of integers
    if isinstance(data, (list, range)):
        values = [int(item) for item in data]
    else:
        values = [int(data)]

    seg = div_list[level]

    # Check if the last level is not an integer segmentation and thus a substitution
    if level == len(div_list) - 1 and not isinstance(seg, int):
        return l1sub(values, seg)

    # Number of whole segments that fit into the range. At least one segment
    # is always used so a width larger than the range does not crash.
    group_count = max(1, (max_num + 1 - min_num) // seg)
    last_index = group_count - 1

    ret = []
    for value in values:
        # Compute the segment index directly and clamp it into the valid
        # range: values above the last segment are cut down to it, values
        # below min_num are lifted into the first one instead of being
        # dropped (which would misalign the output with the input).
        index = min(max((value - min_num) // seg, 0), last_index)
        begin = min_num + seg * index
        ret.append(str(begin) + "-" + str(begin + seg - 1))
    return ret


def zip_code(data, level):
    """Generalize zipcode values by masking trailing characters.

    Parameters:
        data: list or string
            Data containing one or more zipcode values
        level: int
            Generalization step to which the data should be transformed
    Returns:
        List of generalized zipcode data
    """
    # Zipcodes lose one additional character per generalization level
    chars_per_level = 1
    return removeal(data, level, chars_per_level)


def removeal(data, level, steps):
    """Transforms data to a generalization state with masked trailing characters.

    For each value the last ``(level + 1) * steps`` characters are replaced
    with ``*``.

    Parameters:
        data: list or single value
            Data containing one or more generic values (converted with str())
        level: int
            Generalization step to which the data should be transformed
        steps: int
            How many characters should be removed per level
    Returns:
        List of generalized data. An empty input list yields an empty list.
        If the first value would be masked completely, the result of
        ``l1sub`` for the whole input is returned instead.
    """
    # Check if data is already a list or if it is a single value
    values = data if isinstance(data, list) else [data]

    # Nothing to generalize; also avoids indexing into an empty list below
    if not values:
        return []

    # How many characters to remove this level
    char_num = (level + 1) * steps

    # Check if every character would be removed (decided on the first value)
    if char_num >= len(str(values[0])):
        return l1sub(values, level)

    ret = []
    for value in values:
        text = str(value)
        # Clamp so values shorter than the first one are fully masked
        # instead of raising an IndexError.
        keep = max(len(text) - char_num, 0)
        ret.append(text[:keep] + "*" * (len(text) - keep))
    return ret


def birthdate(data, level, min_year, max_year):
    """Generalize birthdate values by dropping leading date components.

    Each level strips one more dot-separated component from the front of the
    date string. From level 2 on, the remaining year is additionally grouped
    into 10-year segments via ``segmentation``.

    Parameters:
        data: list or string
            Data containing one or more birthdate values (DD.MM.YYYY)
        level: int
            Generalization step to which the data should be transformed
        min_year: int
            First year of dataset range
        max_year: int
            Last year of dataset range
    Returns:
        List of generalized birthdate data
    """
    dates = data if isinstance(data, list) else [data]

    # Keep only what follows the first (level + 1) separators
    max_splits = level + 1
    shortened = [date.split(".", max_splits)[-1] for date in dates]

    if level < 2:
        return shortened

    # Last generalization level: only the year is left, group it by decade
    years = [int(year) for year in shortened]
    return segmentation(years, 0, min_year, max_year, [10])


def l1sub(data, placeholder):
    """Replace every value with a single substitution character.

    :param data: list or single value: Data containing one or more values;
                 anything that is not a list counts as exactly one value
    :param placeholder: int or string:
                        <br />If int: ignored, the default character * is used; this allows
                        calling the function with the standard (data, level) format
                        <br />If string: used instead of the default character *
    :return: List with one substitution entry per input value
    """
    # Only the number of values matters, their content is discarded
    count = len(data) if isinstance(data, list) else 1
    symbol = '*' if isinstance(placeholder, int) else placeholder
    return [symbol for _ in range(count)]


def substitution(data, level, wordlists):
    """Transforms data to a generalization state with substituted values.

    Parameters:
        data: list or string
            Data containing one or more values
        level: int
            Generalization step to which the data should be transformed
        wordlists:
            List of dictionaries, one per generalization level. In each
            dictionary a key is the substitute and its value is the
            collection of data entries that are replaced by that key.
    Returns:
        List of generalized data. For every input value, the key of each
        dictionary entry containing it is emitted; values found in no entry
        contribute nothing. If ``level`` exceeds the available dictionaries,
        the result of ``l1sub`` is returned.
    """
    # No dictionary left for this level: fall back to plain substitution
    if level >= len(wordlists):
        return l1sub(data, level)

    entries = data if isinstance(data, list) else [data]
    mapping = wordlists[level]

    generalized = []
    for entry in entries:
        # Collect every substitute whose member collection contains the entry
        generalized.extend(
            substitute for substitute, members in mapping.items() if entry in members
        )
    return generalized


if __name__ == '__main__':
    # This file only provides generalization helpers; running it directly
    # is reported as an error.
    print("This is a module!")
    # Raise SystemExit directly instead of calling the bare exit() helper,
    # which is injected by the site module and is not guaranteed to exist
    # (e.g. when Python is started with -S).
    raise SystemExit(1)