
fix bug and update readme

wesleyjtann · 5 years ago
commit 06e8f098a6

README.md  (+8, −6)

@@ -27,11 +27,11 @@ The baseline experiment trains the model in the conventional way.
 
 * To run the baseline experiment with MNIST on MLP using CPU:
 ```
-python baseline_main.py --model=mlp --dataset=mnist --gpu=None --epochs=10
+python baseline_main.py --model=mlp --dataset=mnist --epochs=10
 ```
 * Or to run it on GPU (eg: if gpu:0 is available):
 ```
-python baseline_main.py --model=mlp --dataset=mnist --gpu=0 --epochs=10
+python baseline_main.py --model=mlp --dataset=mnist --gpu=1 --epochs=10
 ```
 -----
 
@@ -39,11 +39,11 @@ Federated experiment involves training a global model using many local models.
 
 * To run the federated experiment with CIFAR on CNN (IID):
 ```
-python federated_main.py --model=cnn --dataset=cifar --gpu=0 --iid=1 --epochs=10
+python federated_main.py --local_ep=1 --local_bs=10 --frac=0.1 --model=cnn --dataset=cifar --iid=1 --test_acc=99 --gpu=1
 ```
 * To run the same experiment under non-IID condition:
 ```
-python federated_main.py --model=cnn --dataset=cifar --gpu=0 --iid=0 --epochs=10
+python federated_main.py --local_ep=1 --local_bs=10 --frac=0.1 --model=cnn --dataset=cifar --iid=0 --test_acc=99 --gpu=1
 ```
 -----
 
@@ -51,11 +51,11 @@ Hierarchical Federated experiments involve training a global model using differe
 
 * To run the hierarchical federated experiment with MNIST on MLP (IID):
 ```
-python federated-hierarchical_main.py --model=mlp --dataset=mnist --iid=1 --epochs=10 --local_ep=3
+python federated-hierarchical_main.py --local_ep=1 --local_bs=10 --frac=0.1 --Cepochs=5 --model=mlp --dataset=mnist --iid=1 --num_clusters=2 --test_acc=97 --gpu=1
 ```
 * To run the same experiment under non-IID condition:
 ```
-python federated-hierarchical_main.py --model=mlp --dataset=mnist --iid=0 --epochs=10 --local_ep=3
+python federated-hierarchical_main.py --local_ep=1 --local_bs=10 --frac=0.1 --Cepochs=5 --model=mlp --dataset=mnist --iid=0 --num_clusters=2 --test_acc=97 --gpu=1
 ```
 
 You can change the default values of other parameters to simulate different conditions. Refer to the options section.
@@ -78,6 +78,8 @@ The default values for various paramters parsed to the experiment are given in `
 * ```--local_ep:``` Number of local training epochs in each user. Default is 10.
 * ```--local_bs:``` Batch size of local updates in each user. Default is 10.
 * ```--unequal:```  Used in non-iid setting. Option to split the data amongst users equally or unequally. Default set to 0 for equal splits. Set to 1 for unequal splits.
+* ```--num_clusters:```  Number of clusters in the hierarchy. Default is 2.
+* ```--Cepochs:```  Number of rounds of training in each cluster. Default is 5.
 
 ## Results on MNIST
 #### Baseline Experiment:

BIN  save/objects/FL1_mnist_mlp_5_C[0.1]_iid[1]_E[1]_B[10].pkl
BIN  save/objects/FL2_mnist_mlp_5_C[0.1]_iid[1]_E[1]_B[10].pkl
BIN  save/objects/mnist_mlp_5_C[0.1]_iid[1]_E[1]_B[10].pkl
BIN  src/__pycache__/options.cpython-37.pyc

src/federated-hierarchical_main.py  (+41, −14)

@@ -36,7 +36,7 @@ def build_model(args, train_dataset):
         len_in = 1
         for x in img_size:
             len_in *= x
-            global_model = MLP(dim_in=len_in, dim_hidden=200,
+            global_model = MLP(dim_in=len_in, dim_hidden=64,
                                dim_out=args.num_classes)
     else:
         exit('Error: unrecognized model')
@@ -67,7 +67,7 @@ def fl_train(args, train_dataset, cluster_global_model, cluster, usergrp, epochs
             cluster_w, cluster_loss = cluster_local_model.update_weights(model=copy.deepcopy(cluster_global_model), global_round=epoch)
             cluster_local_weights.append(copy.deepcopy(cluster_w))
             cluster_local_losses.append(copy.deepcopy(cluster_loss))
-            print('| Global Round : {} | User : {} | \tLoss: {:.6f}'.format(epoch, idx, cluster_loss))
+            # print('| Global Round : {} | User : {} | \tLoss: {:.6f}'.format(epoch, idx, cluster_loss))
 
         # averaging global weights
         cluster_global_weights = average_weights(cluster_local_weights)
@@ -113,16 +113,23 @@ if __name__ == '__main__':
     B1 = np.arange(cluster_size, cluster_size+cluster_size, dtype=int)
     user_groupsB = {k:user_groups[k] for k in B1 if k in user_groups}
     print("Size of cluster 2: ", len(user_groupsB))
+    # Cluster 3
+    C1 = np.arange(2*cluster_size, 3*cluster_size, dtype=int)
+    user_groupsC = {k:user_groups[k] for k in C1 if k in user_groups}
+    print("Size of cluster 3: ", len(user_groupsC))
+    # Cluster 4
+    D1 = np.arange(3*cluster_size, 4*cluster_size, dtype=int)
+    user_groupsD = {k:user_groups[k] for k in D1 if k in user_groups}
+    print("Size of cluster 4: ", len(user_groupsD))
 
     # MODEL PARAM SUMMARY
     global_model = build_model(args, train_dataset)
     pytorch_total_params = sum(p.numel() for p in global_model.parameters())
-    print(pytorch_total_params)
+    print("Model total number of parameters: ", pytorch_total_params)
 
-    from torchsummary import summary
-
-    summary(global_model, (1, 28, 28))
-    global_model.parameters()
+    # from torchsummary import summary
+    # summary(global_model, (1, 28, 28))
+    # global_model.parameters()
 
     # Set the model to train and send it to device.
     global_model.to(device)
@@ -134,18 +141,31 @@ if __name__ == '__main__':
 
 
     # ======= Set the cluster models to train and send it to device. =======
+    # Cluster A
     cluster_modelA = build_model(args, train_dataset)
     cluster_modelA.to(device)
     cluster_modelA.train()
     # copy weights
     cluster_modelA_weights = cluster_modelA.state_dict()
-
-    # Set the cluster models to train and send it to device.
+    # Cluster B
     cluster_modelB = build_model(args, train_dataset)
     cluster_modelB.to(device)
     cluster_modelB.train()
     # copy weights
-    cluster_modelB_weights = cluster_modelA.state_dict()
+    cluster_modelB_weights = cluster_modelB.state_dict()
+    # Cluster C
+    cluster_modelC = build_model(args, train_dataset)
+    cluster_modelC.to(device)
+    cluster_modelC.train()
+    # copy weights
+    cluster_modelC_weights = cluster_modelC.state_dict()
+    # Cluster D
+    cluster_modelD = build_model(args, train_dataset)
+    cluster_modelD.to(device)
+    cluster_modelD.train()
+    # copy weights
+    cluster_modelD_weights = cluster_modelD.state_dict()
+
 
     train_loss, train_accuracy = [], []
     val_acc_list, net_list = [], []
@@ -163,14 +183,21 @@ if __name__ == '__main__':
         global_model.train()
         
         # Cluster A
-        A_weights, A_losses = fl_train(args, train_dataset, cluster_modelA, A1, user_groupsA, args.epochs)
+        A_weights, A_losses = fl_train(args, train_dataset, cluster_modelA, A1, user_groupsA, args.Cepochs)
         local_weights.append(copy.deepcopy(A_weights))
-        local_losses.append(copy.deepcopy(A_losses))
-        
+        local_losses.append(copy.deepcopy(A_losses))        
         # Cluster B
-        B_weights, B_losses = fl_train(args, train_dataset, cluster_modelB, B1, user_groupsB, args.epochs)
+        B_weights, B_losses = fl_train(args, train_dataset, cluster_modelB, B1, user_groupsB, args.Cepochs)
         local_weights.append(copy.deepcopy(B_weights))
         local_losses.append(copy.deepcopy(B_losses))
+        # Cluster C
+        C_weights, C_losses = fl_train(args, train_dataset, cluster_modelC, C1, user_groupsC, args.Cepochs)
+        local_weights.append(copy.deepcopy(C_weights))
+        local_losses.append(copy.deepcopy(C_losses))        
+        # Cluster D
+        D_weights, D_losses = fl_train(args, train_dataset, cluster_modelD, D1, user_groupsD, args.Cepochs)
+        local_weights.append(copy.deepcopy(D_weights))
+        local_losses.append(copy.deepcopy(D_losses))
         
         
         # averaging global weights
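The four per-cluster blocks added above (A–D) are structurally identical, so one way to keep the file short as `--num_clusters` grows would be to build the user groups and cluster models in a loop. The following is only a sketch of that refactor, reusing `build_model`, `fl_train`, `user_groups`, and `device` as they appear in the diff; it assumes `num_users` is split evenly across clusters and is not code from this commit:

```python
# Sketch only: fold the repeated Cluster A/B/C/D setup into a loop driven by
# args.num_clusters. Assumes num_users divides evenly across clusters and
# that build_model / fl_train behave as shown in the diff above.
cluster_size = args.num_users // args.num_clusters
cluster_idxs, cluster_user_groups, cluster_models = [], [], []
for c in range(args.num_clusters):
    idxs = np.arange(c * cluster_size, (c + 1) * cluster_size, dtype=int)
    cluster_idxs.append(idxs)
    cluster_user_groups.append({k: user_groups[k] for k in idxs if k in user_groups})

    model = build_model(args, train_dataset)
    model.to(device)
    model.train()
    cluster_models.append(model)

# Per global round: train every cluster and collect its averaged weights/losses.
for c in range(args.num_clusters):
    w, loss = fl_train(args, train_dataset, cluster_models[c],
                       cluster_idxs[c], cluster_user_groups[c], args.Cepochs)
    local_weights.append(copy.deepcopy(w))
    local_losses.append(copy.deepcopy(loss))
```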

src/federated_main.py  (+15, −4)

@@ -61,6 +61,11 @@ if __name__ == '__main__':
     global_model.to(device)
     global_model.train()
     print(global_model)
+    
+    # MODEL PARAM SUMMARY
+    pytorch_total_params = sum(p.numel() for p in global_model.parameters())
+    print("Model total number of parameters: ", pytorch_total_params)
+    # print(global_model.parameters())
 
     # copy weights
     global_weights = global_model.state_dict()
@@ -69,10 +74,12 @@ if __name__ == '__main__':
     train_loss, train_accuracy = [], []
     val_acc_list, net_list = [], []
     cv_loss, cv_acc = [], []
-    print_every = 2
+    print_every = 1
     val_loss_pre, counter = 0, 0
+    testacc_check, epoch = 0, 0
 
-    for epoch in tqdm(range(args.epochs)):  # global training epochs
+    # for epoch in tqdm(range(args.epochs)):  # global training epochs
+    while testacc_check < args.test_acc:
         local_weights, local_losses = [], [] # init empty local weights and local losses
         print(f'\n | Global Training Round : {epoch+1} |\n') # starting with | Global Training Round : 1 |
 
@@ -110,7 +117,7 @@ if __name__ == '__main__':
 
         for c in range(args.num_users): # 0 to 99
             local_model = LocalUpdate(args=args, dataset=train_dataset,
-                                      # idxs=user_groups[idx], logger=logger)
+                                      idxs=user_groups[idx], logger=logger)
             # Fix error idxs=user_groups[idx] to idxs=user_groups[c]                                      
             local_model = LocalUpdate(args=args, dataset=train_dataset,
                                       idxs=user_groups[idx], logger=logger)
@@ -119,6 +126,10 @@ if __name__ == '__main__':
             list_loss.append(loss)
         train_accuracy.append(sum(list_acc)/len(list_acc)) # Performance measure
 
+        # Add
+        testacc_check = 100*train_accuracy[-1]
+        epoch = epoch + 1
+
         # print global training loss after every 'i' rounds
         if (epoch+1) % print_every == 0: # If print_every=2, => print every 2 rounds
             print(f' \nAvg Training Stats after {epoch+1} global rounds:')
@@ -128,7 +139,7 @@ if __name__ == '__main__':
     # Test inference after completion of training
     test_acc, test_loss = test_inference(args, global_model, test_dataset)
 
-    print(f' \n Results after {args.epochs} global rounds of training:')
+    print(f' \n Results after {epoch} global rounds of training:')
     print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
     print("|---- Test Accuracy: {:.2f}%".format(100*test_acc))
 

src/options.py  (+3, −2)

@@ -15,7 +15,7 @@ def args_parser():
                         help="number of users: K")
     parser.add_argument('--frac', type=float, default=0.1,
                         help='the fraction of clients: C')
-    parser.add_argument('--local_ep', type=int, default=10,
+    parser.add_argument('--local_ep', type=int, default=1,
                         help="the number of local epochs: E")
     parser.add_argument('--local_bs', type=int, default=10,
                         help="local batch size: B")
@@ -47,7 +47,7 @@ def args_parser():
                         of dataset")
     parser.add_argument('--num_classes', type=int, default=10, help="number \
                         of classes")
-    parser.add_argument('--gpu', default=None, help="To use cuda, set \
+    parser.add_argument('--gpu', type=int, default=0, help="To use cuda, set \
                         to a specific GPU ID. Default set to use CPU.")
     parser.add_argument('--optimizer', type=str, default='sgd', help="type \
                         of optimizer")
@@ -64,6 +64,7 @@ def args_parser():
     # Add arguments
     parser.add_argument('--num_clusters', type=int, default=2, help='the number of clusters')
     parser.add_argument('--test_acc', type=int, default=95, help='target test accuracy')
+    parser.add_argument('--Cepochs', type=int, default=5,help="number of rounds of training in each cluster")
 
     args = parser.parse_args()
     return args
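
Taken together with the README commands above, the newly added options can be combined in a single run; an illustrative invocation (values are examples only, not prescribed by this commit):

```
python federated-hierarchical_main.py --model=mlp --dataset=mnist --iid=1 --num_clusters=2 --Cepochs=5 --test_acc=95 --gpu=1
```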