data.py

# -*- coding: utf-8 -*-
import csv
import os
import pickle


def read_raw_fromdf(df, numeric_path, dataset, qi_index, is_cat, sort_count=False):
    """Read a dataset from a pandas DataFrame. Split into data and header.

    For every non-categorical quasi-identifier, the value counts and a sorted
    value list are pickled to ``numeric_path``.

    :return: 2d list data[row][col], list header[col]
    """
    # count the occurrences of each value in the non-categorical QI columns
    numeric_dict = [df.astype(str).iloc[:, idx].value_counts(ascending=True).to_dict() if not cat
                    else {} for idx, cat in zip(qi_index, is_cat)]
    data = df.to_numpy().astype(str).tolist()
    header = df.columns.to_numpy().astype(str).tolist()
    # pickle (value counts, sorted values) per non-categorical QI column
    for i, qii, cat in zip(range(len(qi_index)), qi_index, is_cat):
        if not cat:
            with open(os.path.join(numeric_path, dataset + '_' + header[qii] + '_static.pickle'),
                      'wb') as static_file:
                if sort_count:
                    sort_value = [elem[0] for elem in
                                  sorted(numeric_dict[i].items(), key=lambda x: x[1])]
                else:
                    sort_value = sorted(numeric_dict[i])
                pickle.dump((numeric_dict[i], sort_value), static_file)
    return data, header


def read_raw(path, numeric_path, dataset, qi_index, is_cat, delimiter=';', sort_count=False):
    """Read a dataset from a csv file. Split into data and header.

    :param path: Directory containing the csv file
    :param numeric_path: Directory in which the statistics pickles are written
    :param dataset: Name of the dataset (without the .csv extension)
    :param qi_index: Column indices of the quasi-identifiers
    :param is_cat: Flags marking each quasi-identifier as categorical or not
    :param delimiter: Character used as delimiter in the csv file
    :param sort_count: Sort numeric values by count instead of by value
    :return: 2d list data[row][col], list header[col]
    """
    numeric_dict = [{} for _ in qi_index]
    data = []
    with open(os.path.join(path, f'{dataset}.csv')) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=delimiter)
        header = next(csv_reader)
        for row in csv_reader:
            data.append(row)
            # save the count of each value
            for i, qii, cat in zip(range(len(qi_index)), qi_index, is_cat):
                if not cat:
                    try:
                        numeric_dict[i][row[qii]] += 1
                    except KeyError:
                        numeric_dict[i][row[qii]] = 1
    # sort non-categorical values by count or by value and pickle the statistics
    for i, qii, cat in zip(range(len(qi_index)), qi_index, is_cat):
        if not cat:
            with open(os.path.join(numeric_path, dataset + '_' + header[qii] + '_static.pickle'),
                      'wb') as static_file:
                if sort_count:
                    sort_value = [elem[0] for elem in
                                  sorted(numeric_dict[i].items(), key=lambda x: x[1])]
                else:
                    sort_value = sorted(numeric_dict[i])
                pickle.dump((numeric_dict[i], sort_value), static_file)
    return data, header


def reorder_columns(data, qi_index):
    """Move the quasi-identifier columns to the front of every row."""
    res = []
    for row in data:
        qi = [elem for i, elem in enumerate(row) if i in qi_index]
        non_qi = [elem for i, elem in enumerate(row) if i not in qi_index]
        res.append([*qi, *non_qi])
    return res


def restore_column_order(data, qi_index):
    """Undo reorder_columns: insert the quasi-identifiers back at their original indices."""
    res = []
    for row in data:
        new_row = row[len(qi_index):]
        for i, elem in zip(qi_index, row[:len(qi_index)]):
            new_row.insert(i, elem)
        res.append(new_row)
    return res
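
# Example (illustrative only, values made up): with qi_index = [0, 2],
# reorder_columns([['a', 'b', 'c', 'd']], [0, 2]) returns [['a', 'c', 'b', 'd']]
# and restore_column_order([['a', 'c', 'b', 'd']], [0, 2]) gives back
# [['a', 'b', 'c', 'd']], i.e. the two functions are inverses of each other
# as long as qi_index is sorted in ascending order.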


def transform_columns(data):
    """Transpose the row-oriented data into a list of columns."""
    res = [[] for _ in range(len(data[0]))]
    for row in data:
        for i, column in enumerate(row):
            res[i].append(column)
    return res


def write_anon(path, anon_data, header, k, s, dataset, delimiter=';'):
    """Write the anonymized dataset(s) to csv and return the number of files written."""
    if isinstance(anon_data, dict):
        anon_data = anon_data.values()
    else:
        anon_data = [anon_data]
    for i, data in enumerate(anon_data):
        with open(os.path.join(path, dataset + "_anonymized_" + str(k) + '_' + str(i) + ".csv"),
                  mode='w', newline='') as anon_file:
            anon_writer = csv.writer(anon_file, delimiter=delimiter)
            anon_writer.writerow(header)
            anon_writer.writerows(data)
    return len(anon_data)
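

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): the dataset name, paths, column
    # indices and k value below are placeholders, and the anonymization step that
    # would normally sit between reorder_columns and restore_column_order lives
    # outside this module.
    qi_index = [0, 2]        # hypothetical quasi-identifier columns
    is_cat = [True, False]   # hypothetical: first QI categorical, second numeric
    data, header = read_raw('data', 'data/numeric', 'example', qi_index, is_cat)
    reordered = reorder_columns(data, qi_index)
    # ... anonymize `reordered` here (e.g. with a k-anonymity algorithm) ...
    restored = restore_column_order(reordered, qi_index)
    n_files = write_anon('results', restored, header, k=10, s=None, dataset='example')
    print(f'wrote {n_files} file(s)')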