
fix bug and update readme

wesleyjtann 5 years ago
commit 06e8f098a6

+ 8 - 6
README.md

@@ -27,11 +27,11 @@ The baseline experiment trains the model in the conventional way.
 
 * To run the baseline experiment with MNIST on MLP using CPU:
 ```
-python baseline_main.py --model=mlp --dataset=mnist --gpu=None --epochs=10
+python baseline_main.py --model=mlp --dataset=mnist --epochs=10
 ```
 * Or to run it on GPU (eg: if gpu:0 is available):
 ```
-python baseline_main.py --model=mlp --dataset=mnist --gpu=0 --epochs=10
+python baseline_main.py --model=mlp --dataset=mnist --gpu=1 --epochs=10
 ```
 -----
 
@@ -39,11 +39,11 @@ Federated experiment involves training a global model using many local models.
 
 * To run the federated experiment with CIFAR on CNN (IID):
 ```
-python federated_main.py --model=cnn --dataset=cifar --gpu=0 --iid=1 --epochs=10
+python federated_main.py --local_ep=1 --local_bs=10 --frac=0.1 --model=cnn --dataset=cifar --iid=1 --test_acc=99 --gpu=1
 ```
 * To run the same experiment under non-IID condition:
 ```
-python federated_main.py --model=cnn --dataset=cifar --gpu=0 --iid=0 --epochs=10
+python federated_main.py --local_ep=1 --local_bs=10 --frac=0.1 --model=cnn --dataset=cifar --iid=0 --test_acc=99 --gpu=1
 ```
 -----
 
@@ -51,11 +51,11 @@ Hierarchical Federated experiments involve training a global model using differe
 
 * To run the hierarchical federated experiment with MNIST on MLP (IID):
 ```
-python federated-hierarchical_main.py --model=mlp --dataset=mnist --iid=1 --epochs=10 --local_ep=3
+python federated-hierarchical_main.py --local_ep=1 --local_bs=10 --frac=0.1 --Cepochs=5 --model=mlp --dataset=mnist --iid=1 --num_cluster=2 --test_acc=97  --gpu=1
 ```
 * To run the same experiment under non-IID condition:
 ```
-python federated-hierarchical_main.py --model=mlp --dataset=mnist --iid=0 --epochs=10 --local_ep=3
+python federated-hierarchical_main.py --local_ep=1 --local_bs=10 --frac=0.1 --Cepochs=5 --model=mlp --dataset=mnist --iid=0 --num_cluster=2 --test_acc=97  --gpu=1
 ```
 
 You can change the default values of other parameters to simulate different conditions. Refer to the options section.
@@ -78,6 +78,8 @@ The default values for various paramters parsed to the experiment are given in `
 * ```--local_ep:``` Number of local training epochs in each user. Default is 10.
 * ```--local_bs:``` Batch size of local updates in each user. Default is 10.
 * ```--unequal:```  Used in non-iid setting. Option to split the data amongst users equally or unequally. Default set to 0 for equal splits. Set to 1 for unequal splits.
+* ```--num_clusters:```  Number of clusters in the hierarchy.
+* ```--Cepochs:```  Number of rounds of training in each cluster.
 
 ## Results on MNIST
 #### Baseline Experiment:

BIN
save/objects/FL1_mnist_mlp_5_C[0.1]_iid[1]_E[1]_B[10].pkl


BIN
save/objects/FL2_mnist_mlp_5_C[0.1]_iid[1]_E[1]_B[10].pkl


BIN
save/objects/mnist_mlp_5_C[0.1]_iid[1]_E[1]_B[10].pkl


BIN
src/__pycache__/options.cpython-37.pyc


+ 41 - 14
src/federated-hierarchical_main.py

@@ -36,7 +36,7 @@ def build_model(args, train_dataset):
         len_in = 1
         for x in img_size:
             len_in *= x
-            global_model = MLP(dim_in=len_in, dim_hidden=200,
+            global_model = MLP(dim_in=len_in, dim_hidden=64,
                                dim_out=args.num_classes)
     else:
         exit('Error: unrecognized model')
@@ -67,7 +67,7 @@ def fl_train(args, train_dataset, cluster_global_model, cluster, usergrp, epochs
             cluster_w, cluster_loss = cluster_local_model.update_weights(model=copy.deepcopy(cluster_global_model), global_round=epoch)
             cluster_local_weights.append(copy.deepcopy(cluster_w))
             cluster_local_losses.append(copy.deepcopy(cluster_loss))
-            print('| Global Round : {} | User : {} | \tLoss: {:.6f}'.format(epoch, idx, cluster_loss))
+            # print('| Global Round : {} | User : {} | \tLoss: {:.6f}'.format(epoch, idx, cluster_loss))
 
         # averaging global weights
         cluster_global_weights = average_weights(cluster_local_weights)
@@ -113,16 +113,23 @@ if __name__ == '__main__':
     B1 = np.arange(cluster_size, cluster_size+cluster_size, dtype=int)
     user_groupsB = {k:user_groups[k] for k in B1 if k in user_groups}
     print("Size of cluster 2: ", len(user_groupsB))
+    # Cluster 3
+    C1 = np.arange(2*cluster_size, 3*cluster_size, dtype=int)
+    user_groupsC = {k:user_groups[k] for k in C1 if k in user_groups}
+    print("Size of cluster 3: ", len(user_groupsC))
+    # Cluster 4
+    D1 = np.arange(3*cluster_size, 4*cluster_size, dtype=int)
+    user_groupsD = {k:user_groups[k] for k in D1 if k in user_groups}
+    print("Size of cluster 4: ", len(user_groupsD))
 
     # MODEL PARAM SUMMARY
     global_model = build_model(args, train_dataset)
     pytorch_total_params = sum(p.numel() for p in global_model.parameters())
-    print(pytorch_total_params)
+    print("Model total number of parameters: ", pytorch_total_params)
 
-    from torchsummary import summary
-
-    summary(global_model, (1, 28, 28))
-    global_model.parameters()
+    # from torchsummary import summary
+    # summary(global_model, (1, 28, 28))
+    # global_model.parameters()
 
     # Set the model to train and send it to device.
     global_model.to(device)
@@ -134,18 +141,31 @@ if __name__ == '__main__':
 
 
     # ======= Set the cluster models to train and send it to device. =======
+    # Cluster A
     cluster_modelA = build_model(args, train_dataset)
     cluster_modelA.to(device)
     cluster_modelA.train()
     # copy weights
     cluster_modelA_weights = cluster_modelA.state_dict()
-
-    # Set the cluster models to train and send it to device.
+    # Cluster B
     cluster_modelB = build_model(args, train_dataset)
     cluster_modelB.to(device)
     cluster_modelB.train()
     # copy weights
-    cluster_modelB_weights = cluster_modelA.state_dict()
+    cluster_modelB_weights = cluster_modelB.state_dict()
+    # Cluster C
+    cluster_modelC = build_model(args, train_dataset)
+    cluster_modelC.to(device)
+    cluster_modelC.train()
+    # copy weights
+    cluster_modelC_weights = cluster_modelC.state_dict()
+    # Cluster D
+    cluster_modelD = build_model(args, train_dataset)
+    cluster_modelD.to(device)
+    cluster_modelD.train()
+    # copy weights
+    cluster_modelD_weights = cluster_modelD.state_dict()
+
 
     train_loss, train_accuracy = [], []
     val_acc_list, net_list = [], []
@@ -163,14 +183,21 @@ if __name__ == '__main__':
         global_model.train()
         
         # Cluster A
-        A_weights, A_losses = fl_train(args, train_dataset, cluster_modelA, A1, user_groupsA, args.epochs)
+        A_weights, A_losses = fl_train(args, train_dataset, cluster_modelA, A1, user_groupsA, args.Cepochs)
         local_weights.append(copy.deepcopy(A_weights))
-        local_losses.append(copy.deepcopy(A_losses))
-        
+        local_losses.append(copy.deepcopy(A_losses))        
         # Cluster B
-        B_weights, B_losses = fl_train(args, train_dataset, cluster_modelB, B1, user_groupsB, args.epochs)
+        B_weights, B_losses = fl_train(args, train_dataset, cluster_modelB, B1, user_groupsB, args.Cepochs)
         local_weights.append(copy.deepcopy(B_weights))
         local_losses.append(copy.deepcopy(B_losses))
+        # Cluster C
+        C_weights, C_losses = fl_train(args, train_dataset, cluster_modelC, C1, user_groupsC, args.Cepochs)
+        local_weights.append(copy.deepcopy(C_weights))
+        local_losses.append(copy.deepcopy(C_losses))        
+        # Cluster D
+        D_weights, D_losses = fl_train(args, train_dataset, cluster_modelD, D1, user_groupsD, args.Cepochs)
+        local_weights.append(copy.deepcopy(D_weights))
+        local_losses.append(copy.deepcopy(D_losses))
         
         
         # averaging global weights

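The changes above hard-wire four clusters (A-D), each with its own index range, user group, and model. For reference, a minimal sketch, not part of this commit, of how the same setup could be driven by the new `--num_clusters` flag; it assumes `build_model`, `fl_train`, `user_groups`, `cluster_size`, `train_dataset`, `device`, and `args` exactly as they appear in `federated-hierarchical_main.py`:

```python
# Illustrative sketch only (not part of this commit): drive the per-cluster setup
# and training from args.num_clusters instead of hard-coded clusters A-D.
# build_model, fl_train, user_groups, cluster_size, train_dataset, device and args
# are assumed to be defined as in federated-hierarchical_main.py.
import copy
import numpy as np

cluster_idxs, cluster_users, cluster_models = [], [], []
for c in range(args.num_clusters):
    idxs = np.arange(c * cluster_size, (c + 1) * cluster_size, dtype=int)  # users in cluster c
    users = {k: user_groups[k] for k in idxs if k in user_groups}
    model = build_model(args, train_dataset)
    model.to(device)
    model.train()
    cluster_idxs.append(idxs)
    cluster_users.append(users)
    cluster_models.append(model)

# One global round: train each cluster for args.Cepochs rounds and collect its
# averaged weights and loss, mirroring the A/B/C/D blocks above.
local_weights, local_losses = [], []
for idxs, users, model in zip(cluster_idxs, cluster_users, cluster_models):
    w, loss = fl_train(args, train_dataset, model, idxs, users, args.Cepochs)
    local_weights.append(copy.deepcopy(w))
    local_losses.append(copy.deepcopy(loss))
```

Keeping the clusters in lists would also make the later aggregation loop independent of how many clusters are configured.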
+ 15 - 4
src/federated_main.py

@@ -61,6 +61,11 @@ if __name__ == '__main__':
     global_model.to(device)
     global_model.train()
     print(global_model)
+    
+    # MODEL PARAM SUMMARY
+    pytorch_total_params = sum(p.numel() for p in global_model.parameters())
+    print("Model total number of parameters: ", pytorch_total_params)
+    # print(global_model.parameters())
 
     # copy weights
     global_weights = global_model.state_dict()
@@ -69,10 +74,12 @@ if __name__ == '__main__':
     train_loss, train_accuracy = [], []
     val_acc_list, net_list = [], []
     cv_loss, cv_acc = [], []
-    print_every = 2
+    print_every = 1
     val_loss_pre, counter = 0, 0
+    testacc_check, epoch = 0, 0
 
-    for epoch in tqdm(range(args.epochs)):  # global training epochs
+    # for epoch in tqdm(range(args.epochs)):  # global training epochs
+    while testacc_check < args.test_acc:
         local_weights, local_losses = [], [] # init empty local weights and local losses
         print(f'\n | Global Training Round : {epoch+1} |\n') # starting with | Global Training Round : 1 |
 
@@ -110,7 +117,7 @@ if __name__ == '__main__':
 
         for c in range(args.num_users): # 0 to 99
             local_model = LocalUpdate(args=args, dataset=train_dataset,
-                                      # idxs=user_groups[idx], logger=logger)
+                                      idxs=user_groups[idx], logger=logger)
             # Fix error idxs=user_groups[idx] to idxs=user_groups[c]                                      
             local_model = LocalUpdate(args=args, dataset=train_dataset,
                                       idxs=user_groups[idx], logger=logger)
@@ -119,6 +126,10 @@ if __name__ == '__main__':
             list_loss.append(loss)
         train_accuracy.append(sum(list_acc)/len(list_acc)) # Performance measure
 
+        # Add
+        testacc_check = 100*train_accuracy[-1]
+        epoch = epoch + 1
+
         # print global training loss after every 'i' rounds
         if (epoch+1) % print_every == 0: # If print_every=2, => print every 2 rounds
             print(f' \nAvg Training Stats after {epoch+1} global rounds:')
@@ -128,7 +139,7 @@ if __name__ == '__main__':
     # Test inference after completion of training
     test_acc, test_loss = test_inference(args, global_model, test_dataset)
 
-    print(f' \n Results after {args.epochs} global rounds of training:')
+    print(f' \n Results after {epoch} global rounds of training:')
     print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
     print("|---- Test Accuracy: {:.2f}%".format(100*test_acc))
 

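The `federated_main.py` changes above replace the fixed `--epochs` loop with a stop-on-target criterion: global rounds continue until the averaged accuracy of the last round reaches `--test_acc` (despite the flag's name, the value checked is the averaged train accuracy). A self-contained toy sketch of that control flow, with the real round body replaced by a stand-in accuracy:

```python
# Self-contained toy illustrating the new stopping criterion in federated_main.py:
# loop until the averaged train accuracy of the last round reaches args.test_acc.
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument('--test_acc', type=int, default=95, help='target accuracy (percent)')
args = parser.parse_args([])                  # defaults; pass e.g. ['--test_acc=99'] to change

train_accuracy = []                           # fractions in [0, 1], as in the real script
testacc_check, epoch = 0, 0
while testacc_check < args.test_acc:
    # Stand-in for one global round (client sampling, local updates, weight averaging):
    round_acc = min(1.0, 0.5 + 0.05 * epoch + random.uniform(0, 0.02))
    train_accuracy.append(round_acc)
    testacc_check = 100 * train_accuracy[-1]  # percentage of the latest averaged accuracy
    epoch += 1                                # counts completed global rounds

print(f'Results after {epoch} global rounds of training: {testacc_check:.2f}%')
```

Because the round count is no longer fixed, the commit also switches the final `Results after ...` print from `args.epochs` to the `epoch` counter incremented inside the loop.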
+ 3 - 2
src/options.py

@@ -15,7 +15,7 @@ def args_parser():
                         help="number of users: K")
     parser.add_argument('--frac', type=float, default=0.1,
                         help='the fraction of clients: C')
-    parser.add_argument('--local_ep', type=int, default=10,
+    parser.add_argument('--local_ep', type=int, default=1,
                         help="the number of local epochs: E")
     parser.add_argument('--local_bs', type=int, default=10,
                         help="local batch size: B")
@@ -47,7 +47,7 @@ def args_parser():
                         of dataset")
     parser.add_argument('--num_classes', type=int, default=10, help="number \
                         of classes")
-    parser.add_argument('--gpu', default=None, help="To use cuda, set \
+    parser.add_argument('--gpu', type=int, default=0, help="To use cuda, set \
                         to a specific GPU ID. Default set to use CPU.")
     parser.add_argument('--optimizer', type=str, default='sgd', help="type \
                         of optimizer")
@@ -64,6 +64,7 @@ def args_parser():
     # Add arguments
     parser.add_argument('--num_clusters', type=int, default=2, help='the number of clusters')
     parser.add_argument('--test_acc', type=int, default=95, help='target test accuracy')
+    parser.add_argument('--Cepochs', type=int, default=5,help="number of rounds of training in each cluster")
 
     args = parser.parse_args()
     return args
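With `--Cepochs` added and `--gpu` now parsed as an int, a quick, illustrative way to check the updated parser is to feed it the README's hierarchical command. This assumes it is run from `src/` so `options.py` imports, and relies on argparse's default prefix matching to resolve the README's `--num_cluster` to `--num_clusters`:

```python
# Illustrative only: run the patched parser against the README's hierarchical command.
import sys
from options import args_parser

sys.argv = ['federated-hierarchical_main.py',
            '--local_ep=1', '--local_bs=10', '--frac=0.1', '--Cepochs=5',
            '--model=mlp', '--dataset=mnist', '--iid=1',
            '--num_cluster=2', '--test_acc=97', '--gpu=1']
args = args_parser()

# argparse resolves the unambiguous prefix --num_cluster to --num_clusters.
print(args.num_clusters, args.Cepochs, args.test_acc, args.gpu)  # -> 2 5 97 1
```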