federated_main.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python version: 3.6
import copy
import os
import pickle
import sys
import time

import numpy as np
import torch
from tensorboardX import SummaryWriter
from tqdm import tqdm

from models import MLP, CNNMnist, CNNFashion_Mnist, CNNCifar
from options import args_parser
from update import LocalUpdate, test_inference
from utils import get_dataset, average_weights, exp_details, set_device
  16. if __name__ == '__main__':
  17. start_time = time.time()
  18. # define paths
  19. path_project = os.path.abspath('..')
  20. logger = SummaryWriter('../logs')
  21. args = args_parser()
  22. exp_details(args)
  23. # if args.gpu:
  24. # torch.cuda.set_device(args.gpu)
  25. # # torch.cuda.set_device(0)
  26. # device = 'cuda' if args.gpu else 'cpu'
  27. # Select CPU or GPU
  28. device = set_device(args)
  29. # load dataset and user groups
  30. train_dataset, test_dataset, user_groups = get_dataset(args)
  31. # BUILD MODEL
  32. if args.model == 'cnn':
  33. # Convolutional neural netork
  34. if args.dataset == 'mnist':
  35. global_model = CNNMnist(args=args)
  36. elif args.dataset == 'fmnist':
  37. global_model = CNNFashion_Mnist(args=args)
  38. elif args.dataset == 'cifar':
  39. global_model = CNNCifar(args=args)
  40. elif args.model == 'mlp':
  41. # Multi-layer preceptron
  42. img_size = train_dataset[0][0].shape
  43. len_in = 1
  44. for x in img_size:
  45. len_in *= x
  46. global_model = MLP(dim_in=len_in, dim_hidden=args.mlpdim,
  47. dim_out=args.num_classes)
  48. else:
  49. exit('Error: unrecognized model')
  50. # Set the model to train and send it to device.
  51. global_model.to(device)
  52. global_model.train()
  53. print(global_model)
  54. # MODEL PARAM SUMMARY
  55. pytorch_total_params = sum(p.numel() for p in global_model.parameters())
  56. print("Model total number of parameters: ", pytorch_total_params)
  57. # print(global_model.parameters())
  58. # copy weights
  59. global_weights = global_model.state_dict()
  60. # Training
  61. train_loss, train_accuracy = [], []
  62. val_acc_list, net_list = [], []
  63. cv_loss, cv_acc = [], []
  64. print_every = 1
  65. val_loss_pre, counter = 0, 0
  66. testacc_check, epoch = 0, 0
  67. # for epoch in tqdm(range(args.epochs)): # global training epochs
  68. for epoch in range(args.epochs):
  69. # while testacc_check < args.test_acc or epoch < args.epochs:
  70. # while testacc_check < args.test_acc:
  71. local_weights, local_losses = [], [] # init empty local weights and local losses
  72. print(f'\n | Global Training Round : {epoch+1} |\n') # starting with | Global Training Round : 1 |
  73. """
  74. model.train() tells your model that you are training the model. So effectively layers like dropout, batchnorm etc. which behave different on the train and test procedures know what is going on and hence can behave accordingly.
  75. More details: It sets the mode to train (see source code). You can call either model.eval() or model.train(mode=False) to tell that you are testing. It is somewhat intuitive to expect train function to train model but it does not do that. It just sets the mode.
  76. """
  77. # ============== TRAIN ==============
  78. global_model.train()
  79. m = max(int(args.frac * args.num_users), 1) # C = args.frac. Setting number of clients m for training
  80. idxs_users = np.random.choice(range(args.num_users), m, replace=False) # args.num_users=100 total clients. Choosing a random array of indices. Subset of clients.
  81. for idx in idxs_users: # For each client in the subset.
  82. local_model = LocalUpdate(args=args, dataset=train_dataset,
  83. idxs=user_groups[idx], logger=logger)
  84. w, loss = local_model.update_weights( # update_weights() contain multiple prints
  85. model=copy.deepcopy(global_model), global_round=epoch) # w = local model weights
  86. local_weights.append(copy.deepcopy(w))
  87. local_losses.append(copy.deepcopy(loss))
  88. # Averaging m local client weights
  89. global_weights = average_weights(local_weights)
  90. # update global weights
  91. global_model.load_state_dict(global_weights)
  92. loss_avg = sum(local_losses) / len(local_losses)
  93. train_loss.append(loss_avg) # Performance measure
  94. # ============== EVAL ==============
  95. # Calculate avg training accuracy over all users at every epoch
  96. list_acc, list_loss = [], []
  97. global_model.eval() # must set your model into evaluation mode when computing model output values if dropout or bach norm used for training.
  98. for c in range(args.num_users): # 0 to 99
  99. # local_model = LocalUpdate(args=args, dataset=train_dataset,
  100. # idxs=user_groups[idx], logger=logger)
  101. # Fix error idxs=user_groups[idx] to idxs=user_groups[c]
  102. local_model = LocalUpdate(args=args, dataset=train_dataset,
  103. idxs=user_groups[c], logger=logger)
  104. acc, loss = local_model.inference(model=global_model)
  105. list_acc.append(acc)
  106. list_loss.append(loss)
  107. train_accuracy.append(sum(list_acc)/len(list_acc)) # Performance measure
  108. # Add
  109. testacc_check = 100*train_accuracy[-1]
  110. epoch = epoch + 1
  111. # print global training loss after every 'i' rounds
  112. if (epoch+1) % print_every == 0: # If print_every=2, => print every 2 rounds
  113. print(f' \nAvg Training Stats after {epoch+1} global rounds:')
  114. print(f'Training Loss : {np.mean(np.array(train_loss))}')
  115. print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))
  116. # Test inference after completion of training
  117. test_acc, test_loss = test_inference(args, global_model, test_dataset)
  118. print(f' \n Results after {epoch} global rounds of training:')
  119. print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
  120. print("|---- Test Accuracy: {:.2f}%".format(100*test_acc))
  121. # Saving the objects train_loss and train_accuracy:
  122. file_name = '../save/objects/FL_{}_{}_{}_lr[{}]_C[{}]_iid[{}]_E[{}]_B[{}].pkl'.\
  123. format(args.dataset, args.model, epoch, args.lr, args.frac, args.iid,
  124. args.local_ep, args.local_bs)
  125. with open(file_name, 'wb') as f:
  126. pickle.dump([train_loss, train_accuracy], f)
  127. print('\n Total Run Time: {0:0.4f}'.format(time.time()-start_time))
    # PLOTTING (optional)
    # import matplotlib
    # import matplotlib.pyplot as plt
    # matplotlib.use('Agg')
    # Plot Loss curve
    # plt.figure()
    # plt.title('Training Loss vs Communication rounds')
    # plt.plot(range(len(train_loss)), train_loss, color='r')
    # plt.ylabel('Training loss')
    # plt.xlabel('Communication Rounds')
    # plt.savefig('../save/fed_{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}]_loss.png'.
    #             format(args.dataset, args.model, args.epochs, args.frac,
    #                    args.iid, args.local_ep, args.local_bs))
    #
    # # Plot Average Accuracy vs Communication rounds
    # plt.figure()
    # plt.title('Average Accuracy vs Communication rounds')
    # plt.plot(range(len(train_accuracy)), train_accuracy, color='k')
    # plt.ylabel('Average Accuracy')
    # plt.xlabel('Communication Rounds')
    # plt.savefig('../save/fed_{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}]_acc.png'.
    #             format(args.dataset, args.model, args.epochs, args.frac,
    #                    args.iid, args.local_ep, args.local_bs))