@@ -1,186 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Python version: 3.6
-
-
-import numpy as np
-from torchvision import datasets, transforms
-
-
-def mnist_iid(dataset, num_users):
-    """
-    Sample I.I.D. client data from MNIST dataset
-    :param dataset: a torchvision MNIST dataset object
-    :param num_users: number of clients to split the data over
-    :return: dict mapping each client id to a set of image indices
-    """
-    num_items = int(len(dataset)/num_users)
-    dict_users, all_idxs = {}, [i for i in range(len(dataset))]
-    for i in range(num_users):
-        dict_users[i] = set(np.random.choice(all_idxs, num_items, replace=False))
-        all_idxs = list(set(all_idxs) - dict_users[i])
-    return dict_users
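-# Example (sketch): for MNIST's 60,000 training images and num_users=100,
-# every client receives a disjoint random subset of 600 image indices:
-#   user_groups = mnist_iid(dataset_train, 100)
-#   len(user_groups[0])  # -> 600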
-
-
-def mnist_noniid(dataset, num_users):
-    """
-    Sample non-I.I.D client data from MNIST dataset
-    :param dataset: a torchvision MNIST dataset object
-    :param num_users: number of clients to split the data over
-    :return: dict mapping each client id to an array of image indices
-    """
-    # 60,000 training imgs --> 300 imgs/shard X 200 shards
-    num_shards, num_imgs = 200, 300
-    idx_shard = [i for i in range(num_shards)]
-    dict_users = {i: np.array([]) for i in range(num_users)}
-    idxs = np.arange(num_shards*num_imgs)
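-    # NOTE: `train_labels` is the attribute name used by the torchvision
-    # releases this file targets; newer versions expose the same data as
-    # `dataset.targets`.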
-    labels = dataset.train_labels.numpy()
-
-    # sort labels
-    idxs_labels = np.vstack((idxs, labels))
-    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
-    idxs = idxs_labels[0, :]
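-    # `idxs` now lists the image indices sorted by digit label, so each
-    # contiguous run of num_imgs entries (one shard) spans at most two
-    # classes.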
-
-    # divide and assign 2 shards/client
-    for i in range(num_users):
-        rand_set = set(np.random.choice(idx_shard, 2, replace=False))
-        idx_shard = list(set(idx_shard) - rand_set)
-        for rand in rand_set:
-            dict_users[i] = np.concatenate(
-                (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
-    return dict_users
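-# Example (sketch): with num_users=100 the 200 shards are used up exactly,
-# and each client ends up with 2 shards = 600 images covering only a few
-# digit classes:
-#   user_groups = mnist_noniid(dataset_train, 100)
-#   len(user_groups[42])  # -> 600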
-
-
-def mnist_noniid_unequal(dataset, num_users):
-    """
-    Sample non-I.I.D client data from MNIST dataset s.t. clients
-    have unequal amounts of data
-    :param dataset: a torchvision MNIST dataset object
-    :param num_users: number of clients to split the data over
-    :return: a dict of clients with each client assigned a certain
-    number of training imgs
-    """
-    # 60,000 training imgs --> 50 imgs/shard X 1200 shards
-    num_shards, num_imgs = 1200, 50
-    idx_shard = [i for i in range(num_shards)]
-    dict_users = {i: np.array([]) for i in range(num_users)}
-    idxs = np.arange(num_shards*num_imgs)
-    labels = dataset.train_labels.numpy()
-
-    # sort labels
-    idxs_labels = np.vstack((idxs, labels))
-    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
-    idxs = idxs_labels[0, :]
-
-    # Minimum and maximum shards assigned per client:
-    min_shard = 1
-    max_shard = 30
-
-    # Divide the shards into random chunks for every client
-    # s.t. the sum of these chunks = num_shards
-    random_shard_size = np.random.randint(min_shard, max_shard+1, size=num_users)
-    random_shard_size = np.around(random_shard_size/sum(random_shard_size) * num_shards)
-    random_shard_size = random_shard_size.astype(int)
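-    # e.g. (sketch) for num_users=3, draws of [10, 20, 30] normalise to
-    # 1200 * [10/60, 20/60, 30/60] = [200, 400, 600] shards; np.around can
-    # leave the total slightly above or below num_shards, which the two
-    # branches below correct for.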
-
-    # Assign the shards randomly to each client
-    if sum(random_shard_size) > num_shards:
-
-        for i in range(num_users):
-            # First assign each client 1 shard to ensure every client has
-            # at least one shard of data
-            rand_set = set(np.random.choice(idx_shard, 1, replace=False))
-            idx_shard = list(set(idx_shard) - rand_set)
-            for rand in rand_set:
-                dict_users[i] = np.concatenate(
-                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
-
-        random_shard_size = random_shard_size-1
-
-        # Next, randomly assign the remaining shards
-        for i in range(num_users):
-            if len(idx_shard) == 0:
-                continue
-            shard_size = random_shard_size[i]
-            if shard_size > len(idx_shard):
-                shard_size = len(idx_shard)
-            rand_set = set(np.random.choice(idx_shard, shard_size, replace=False))
-            idx_shard = list(set(idx_shard) - rand_set)
-            for rand in rand_set:
-                dict_users[i] = np.concatenate(
-                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
-    else:
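-        # The rounded shard counts fit within num_shards, so every client
-        # receives its full allotment; any leftover shards go to the client
-        # with the fewest images below.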
-        for i in range(num_users):
-            shard_size = random_shard_size[i]
-            rand_set = set(np.random.choice(idx_shard, shard_size, replace=False))
-            idx_shard = list(set(idx_shard) - rand_set)
-            for rand in rand_set:
-                dict_users[i] = np.concatenate(
-                    (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
-
-        if len(idx_shard) > 0:
-            # Add the leftover shards to the client with the fewest images:
-            shard_size = len(idx_shard)
-            k = min(dict_users, key=lambda x: len(dict_users.get(x)))
-            rand_set = set(np.random.choice(idx_shard, shard_size, replace=False))
-            idx_shard = list(set(idx_shard) - rand_set)
-            for rand in rand_set:
-                dict_users[k] = np.concatenate(
-                    (dict_users[k], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
-
-    return dict_users
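-# Example (sketch): unlike the equal splits above, client sizes now vary in
-# rough proportion to the random draws (always a multiple of the 50-image
-# shard size):
-#   user_groups = mnist_noniid_unequal(dataset_train, 100)
-#   {uid: len(v) for uid, v in user_groups.items()}  # unequal sizes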
-
-
-def cifar_iid(dataset, num_users):
-    """
-    Sample I.I.D. client data from CIFAR10 dataset
-    :param dataset: a torchvision CIFAR10 dataset object
-    :param num_users: number of clients to split the data over
-    :return: dict mapping each client id to a set of image indices
-    """
-    num_items = int(len(dataset)/num_users)
-    dict_users, all_idxs = {}, [i for i in range(len(dataset))]
-    for i in range(num_users):
-        dict_users[i] = set(np.random.choice(all_idxs, num_items, replace=False))
-        all_idxs = list(set(all_idxs) - dict_users[i])
-    return dict_users
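-# NOTE: cifar_iid is line-for-line identical to mnist_iid; the two could be
-# collapsed into a single dataset-agnostic helper.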
-
-
-def cifar_noniid(dataset, num_users):
-    """
-    Sample non-I.I.D client data from CIFAR10 dataset
-    :param dataset: a torchvision CIFAR10 dataset object
-    :param num_users: number of clients to split the data over
-    :return: dict mapping each client id to an array of image indices
-    """
-    # 50,000 training imgs --> 250 imgs/shard X 200 shards
-    num_shards, num_imgs = 200, 250
-    idx_shard = [i for i in range(num_shards)]
-    dict_users = {i: np.array([]) for i in range(num_users)}
-    idxs = np.arange(num_shards*num_imgs)
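-    # CIFAR10 stores its labels as a plain Python list rather than a tensor,
-    # so wrap them with np.array() instead of calling .numpy():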
-    labels = np.array(dataset.train_labels)
-
-    # sort labels
-    idxs_labels = np.vstack((idxs, labels))
-    idxs_labels = idxs_labels[:, idxs_labels[1, :].argsort()]
-    idxs = idxs_labels[0, :]
-
-    # divide and assign 2 shards/client
-    for i in range(num_users):
-        rand_set = set(np.random.choice(idx_shard, 2, replace=False))
-        idx_shard = list(set(idx_shard) - rand_set)
-        for rand in rand_set:
-            dict_users[i] = np.concatenate(
-                (dict_users[i], idxs[rand*num_imgs:(rand+1)*num_imgs]), axis=0)
-    return dict_users
-
-
-if __name__ == '__main__':
-    dataset_train = datasets.MNIST('./data/mnist/', train=True, download=True,
-                                   transform=transforms.Compose([
-                                       transforms.ToTensor(),
-                                       transforms.Normalize((0.1307,), (0.3081,))
-                                   ]))
-    num = 100
-    d = mnist_noniid(dataset_train, num)
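-    # Sanity check (added sketch): each non-IID client should hold 600
-    # indices covering only a few digit classes.
-    labels = dataset_train.train_labels.numpy()
-    for uid in range(3):
-        client_idxs = d[uid].astype(int)
-        print(uid, len(client_idxs), np.unique(labels[client_idxs]))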