# generalization.py (6.6 KB)
  1. # -*- coding: utf-8 -*-
  2. import math
  3. from generalization.hierarchy_utilities import read_gen_hierarchy
  4. def hierarchy(path: str, qi_name: str)-> list:
  5. """Read QI hierarchy from file.
  6. Filenames follow a specific format: gen_hier_<gi_name>.csv
  7. :param path: Path to where the hierarchy file is stored
  8. :param qi_name: Name of the QI (header in dataset)
  9. :return: list: List of function+argument for that type of generalization
  10. """
  11. return [substitution, read_gen_hierarchy(path, qi_name)]
  12. def age(data, level):
  13. """Transforms age data to a predefined generalization state.
  14. :param data: list or string: Data containing one or more age values
  15. :param level: int: Generalization step to which the data should be trasformed
  16. :return: List of generalized age data
  17. """
  18. # Use the generic function "segmentation" with predefined arguments
  19. return segmentation(data, level, 1, 100, [5, 10, 20, "*"])
  20. def segmentation(data, level, min_num, max_num, div_list):
  21. """Transforms numerical data to a segementated state
  22. Parameters:
  23. data: list or string
  24. Data containing one or more numerical values
  25. level: int
  26. Generalization step to which the data should be trasformed
  27. min_num: int
  28. Start of numeric range
  29. max_num: int
  30. End of numeric range
  31. div_list: list
  32. Contains value in what range data should be grouped for each generalization step
  33. Returns:
  34. List of generalized numerical data
  35. """
  36. ret = []
  37. # Check if data is already a list/range or if it is a single value
  38. if not isinstance(data, list) and not isinstance(data, range):
  39. values = [int(data)]
  40. else:
  41. values = list(map(int, data))
  42. seg = div_list[level]
  43. # Check if the last level is not an integer segmentation and thus a substitution
  44. if len(div_list)-1 == level and not isinstance(seg, int):
  45. return l1sub(values, seg)
  46. groups = range(0, math.floor((max_num+1-min_num)/seg))
  47. div_max = min_num + seg + seg * groups[-1]
  48. for value in values:
  49. # Check if a value is bigger than the calculated max segementation value
  50. if value >= div_max:
  51. # Cut larger value to fit the segementation
  52. value = div_max - 1
  53. # Check in what group the value belongs
  54. for i in groups:
  55. b = min_num + seg * i
  56. e = b + seg
  57. if b <= value < e:
  58. e -= 1
  59. ret.append(str(b) + "-" + str(e))
  60. break
  61. return ret
  62. def zip_code(data, level):
  63. """Transforms zipcode data to a predefined generalization state.
  64. Parameters:
  65. data: list or string
  66. Data containing one or more zipcode values
  67. level: int
  68. Generalization step to which the data should be trasformed
  69. Returns:
  70. List of generalized zipcode data
  71. """
  72. # Use the generic function "removeal" with predefined arguments
  73. return removeal(data, level, 1)
  74. def removeal(data, level, steps):
  75. """Transforms zipcode data to a generalization state with removed characters.
  76. Parameters:
  77. data: list or string
  78. Data containing one or more generic values
  79. level: int
  80. Generalization step to which the data should be trasformed
  81. steps: int
  82. How many characters should be removed per level
  83. Returns:
  84. List of generalized data
  85. """
  86. ret = []
  87. # Check if data is already a list or if it is a single value
  88. if not isinstance(data, list):
  89. values = [data]
  90. else:
  91. values = data
  92. # How many characters to remove this level
  93. char_num = (level+1)*steps
  94. # Check if every character would be removed
  95. if char_num >= len(str(values[0])):
  96. return l1sub(values, level)
  97. for v in values:
  98. v = list(str(v))
  99. # Replace every character that gets removed with *
  100. for n in range(char_num):
  101. v[(-1-n)] = '*'
  102. ret.append("".join(v))
  103. return ret
  104. def birthdate(data, level, min_year, max_year):
  105. """Transforms birthdate data to a predefined generalization state.
  106. Parameters:
  107. data: list or string
  108. Data containing one or more birthdate values (DD.MM.YYYY)
  109. level: int
  110. Generalization step to which the data should be trasformed
  111. min_year: int
  112. First year of dataset range
  113. max_year: int
  114. Last year of dataset range
  115. Returns:
  116. List of generalized birthdate data
  117. """
  118. ret = []
  119. if not isinstance(data, list):
  120. values = [data]
  121. else:
  122. values = data
  123. # Remove parts of date string
  124. for v in values:
  125. ret.append(v.split(".", level + 1)[-1])
  126. # If last generalization level is reached, apply segementation of the year
  127. if level >= 2:
  128. ret = list(map(int, ret))
  129. ret = segmentation(ret, 0, min_year, max_year, [10])
  130. return ret
  131. def l1sub(data, placeholder):
  132. """Substitutes data with a character (default: *).
  133. :param data: list or string: Data containing one or more values
  134. :param placeholder: int or string:
  135. <br />If int: Not used in code but allows to call this function with the standard (data,level) format
  136. <br />If string: used to replace the default sub character *
  137. :return: List of generalized data
  138. """
  139. if isinstance(placeholder, int):
  140. sub_char = '*'
  141. else:
  142. sub_char = placeholder
  143. if not isinstance(data, list):
  144. values = [data]
  145. else:
  146. values = data
  147. return [sub_char]*len(values)
  148. def substitution(data, level, wordlists):
  149. """Transforms birthdate data to a generalization state with substituted values.
  150. Parameters:
  151. data: list or string
  152. Data containing one or more values
  153. level: int
  154. Generalization step to which the data should be trasformed
  155. wordlists:
  156. List of dictionaries with subsition keys for each dataentry as values of the key
  157. Each dictionary represents a generalization level
  158. Returns:
  159. List of generalized data
  160. """
  161. ret = []
  162. if not isinstance(data, list):
  163. values = [data]
  164. else:
  165. values = data
  166. # Check if no more substitution is found
  167. if level > len(wordlists)-1:
  168. return l1sub(data, level)
  169. # Select right dictionary
  170. wordlist = wordlists[level]
  171. for value in values:
  172. # Search for value in dictionary
  173. for k, v in wordlist.items():
  174. if value in v:
  175. ret.append(k)
  176. return ret
  177. if __name__ == '__main__':
  178. print("This is a module!")
  179. exit(1)