read_adult_data.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import pickle
  4. from functools import cmp_to_key
  5. from basic_mondrian.models.gentree import GenTree
  6. from basic_mondrian.models.numrange import NumRange
  7. from basic_mondrian.utils.utility import cmp_str
  8. __DEBUG = False  # when truthy, read_tree_file prints "Reading Tree..." and "Nodes No. = ..." diagnostics
  9. def read_data(path, dataset, ATT_NAMES, QI_INDEX, IS_CAT, SA_INDEX):
  10. """
  11. read microda for *.txt and return read data
  12. """
  13. QI_num = len(QI_INDEX)
  14. data = []
  15. numeric_dict = []
  16. delimiter = ';'
  17. for i in range(QI_num):
  18. numeric_dict.append(dict())
  19. # or categorical attributes in intuitive order
  20. # here, we use the appear number
  21. with open(os.path.join(path, dataset + '.csv')) as data_file:
  22. next(data_file)
  23. for line in data_file:
  24. line = line.strip()
  25. # remove double spaces
  26. line = line.replace(' ', '')
  27. temp = line.split(delimiter)
  28. ltemp = []
  29. for i in range(QI_num):
  30. index = QI_INDEX[i]
  31. if IS_CAT[i] is False:
  32. try:
  33. numeric_dict[i][temp[index]] += 1
  34. except KeyError:
  35. numeric_dict[i][temp[index]] = 1
  36. ltemp.append(temp[index])
  37. for i in SA_INDEX:
  38. ltemp.append(temp[i])
  39. data.append(ltemp)
  40. # pickle numeric attributes and get NumRange
  41. for i in range(QI_num):
  42. if IS_CAT[i] is False:
  43. with open(os.path.join(path, dataset + '_' + ATT_NAMES[QI_INDEX[i]] + '_static.pickle'), 'wb') as static_file:
  44. sort_value = list(numeric_dict[i].keys())
  45. sort_value.sort(key=cmp_to_key(cmp_str))
  46. pickle.dump((numeric_dict[i], sort_value), static_file)
  47. return data
  48. def read_tree(path, numeric_path, dataset, ATT_NAMES, QI_INDEX, IS_CAT):
  49. """read tree from data/tree_*.txt, store them in att_tree
  50. """
  51. att_names = []
  52. att_trees = []
  53. for t in QI_INDEX:
  54. att_names.append(ATT_NAMES[t])
  55. for i in range(len(att_names)):
  56. if IS_CAT[i]:
  57. att_trees.append(read_tree_file(path, dataset, att_names[i]))
  58. else:
  59. att_trees.append(read_pickle_file(numeric_path, dataset, att_names[i]))
  60. return att_trees
  61. def read_pickle_file(path, dataset, att_name):
  62. """
  63. read pickle file for numeric attributes
  64. return numrange object
  65. """
  66. try:
  67. with open(os.path.join(path, dataset + '_' + att_name + '_static.pickle'), 'rb') as static_file:
  68. numeric_dict, sort_value = pickle.load(static_file)
  69. except OSError:
  70. print("Pickle file not exists!!")
  71. print(os.path.join(path, dataset + '_' + att_name + '_static.pickle'))
  72. exit(2)
  73. result = NumRange(sort_value, numeric_dict)
  74. return result
  75. def read_tree_file(path, dataset, treename):
  76. """read tree data from treename
  77. """
  78. att_tree = {}
  79. prefix = os.path.join(path, dataset + '_hierarchy_')
  80. postfix = ".csv"
  81. with open(prefix + treename + postfix) as treefile:
  82. att_tree['*'] = GenTree('*')
  83. if __DEBUG:
  84. print("Reading Tree" + treename)
  85. for line in treefile:
  86. # delete \n
  87. if len(line) <= 1:
  88. break
  89. line = line.strip()
  90. temp = line.split(';')
  91. # copy temp
  92. temp.reverse()
  93. for i, t in enumerate(temp):
  94. isleaf = False
  95. if i == len(temp) - 1:
  96. isleaf = True
  97. # try and except is more efficient than 'in'
  98. try:
  99. att_tree[t]
  100. except KeyError:
  101. att_tree[t] = GenTree(t, att_tree[temp[i - 1]], isleaf)
  102. if __DEBUG:
  103. print("Nodes No. = %d" % att_tree['*'].support)
  104. return att_tree