# # %%writefile app.py # import streamlit as st # import matplotlib.pyplot as plt # import torch # from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW # from datasets import load_dataset, Dataset # from evaluate import load as load_metric # from torch.utils.data import DataLoader # import pandas as pd # import random # from collections import OrderedDict # import flwr as fl # DEVICE = torch.device("cpu") # def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): # raw_datasets = load_dataset(dataset_name) # raw_datasets = raw_datasets.shuffle(seed=42) # del raw_datasets["unsupervised"] # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # def tokenize_function(examples): # return tokenizer(examples["text"], truncation=True) # tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # tokenized_datasets = tokenized_datasets.remove_columns("text") # tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # train_datasets = [] # test_datasets = [] # for _ in range(num_clients): # train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) # test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) # train_datasets.append(train_dataset) # test_datasets.append(test_dataset) # data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # return train_datasets, test_datasets, data_collator # def read_log_file(): # with open("./log.txt", "r") as file: # return file.read() # def train(net, trainloader, epochs): # optimizer = AdamW(net.parameters(), lr=5e-5) # net.train() # for _ in range(epochs): # for batch in trainloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # outputs = net(**batch) # loss = outputs.loss # loss.backward() # optimizer.step() # optimizer.zero_grad() # def test(net, testloader): # metric = load_metric("accuracy") # net.eval() # loss = 0 # for batch in testloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # with torch.no_grad(): # outputs = net(**batch) # logits = outputs.logits # loss += outputs.loss.item() # predictions = torch.argmax(logits, dim=-1) # metric.add_batch(predictions=predictions, references=batch["labels"]) # loss /= len(testloader) # accuracy = metric.compute()["accuracy"] # return loss, accuracy # class CustomClient(fl.client.NumPyClient): # def __init__(self, net, trainloader, testloader, client_id): # self.net = net # self.trainloader = trainloader # self.testloader = testloader # self.client_id = client_id # self.losses = [] # self.accuracies = [] # def get_parameters(self, config): # return [val.cpu().numpy() for _, val in self.net.state_dict().items()] # def set_parameters(self, parameters): # params_dict = zip(self.net.state_dict().keys(), parameters) # state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict}) # self.net.load_state_dict(state_dict, strict=True) # def fit(self, parameters, config): # self.set_parameters(parameters) # train(self.net, self.trainloader, epochs=1) # loss, accuracy = test(self.net, self.testloader) # self.losses.append(loss) # self.accuracies.append(accuracy) # return self.get_parameters(config={}), len(self.trainloader.dataset), {} # def evaluate(self, parameters, config): # self.set_parameters(parameters) # loss, accuracy = test(self.net, self.testloader) # return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy)} # def plot_metrics(self, round_num, plot_placeholder): # if self.losses and self.accuracies: # plot_placeholder.write(f"#### Client {self.client_id} Metrics for Round {round_num}") # plot_placeholder.write(f"Loss: {self.losses[-1]:.4f}") # plot_placeholder.write(f"Accuracy: {self.accuracies[-1]:.4f}") # fig, ax1 = plt.subplots() # color = 'tab:red' # ax1.set_xlabel('Round') # ax1.set_ylabel('Loss', color=color) # ax1.plot(range(1, len(self.losses) + 1), self.losses, color=color) # ax1.tick_params(axis='y', labelcolor=color) # ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis # color = 'tab:blue' # ax2.set_ylabel('Accuracy', color=color) # ax2.plot(range(1, len(self.accuracies) + 1), self.accuracies, color=color) # ax2.tick_params(axis='y', labelcolor=color) # fig.tight_layout() # plot_placeholder.pyplot(fig) # def main(): # st.write("## Federated Learning with Dynamic Models and Datasets for Mobile Devices") # dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"]) # model_name = st.selectbox("Model", ["bert-base-uncased","facebook/hubert-base-ls960", "distilbert-base-uncased"]) # NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) # NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) # train_datasets, test_datasets, data_collator = load_data(dataset_name, num_clients=NUM_CLIENTS) # trainloaders = [] # testloaders = [] # clients = [] # for i in range(NUM_CLIENTS): # st.write(f"### Client {i+1} Datasets") # train_df = pd.DataFrame(train_datasets[i]) # test_df = pd.DataFrame(test_datasets[i]) # st.write("#### Train Dataset") # edited_train_df = st.data_editor(train_df, key=f"train_{i}") # st.write("#### Test Dataset") # edited_test_df = st.data_editor(test_df, key=f"test_{i}") # edited_train_dataset = Dataset.from_pandas(edited_train_df) # edited_test_dataset = Dataset.from_pandas(edited_test_df) # trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator) # testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator) # trainloaders.append(trainloader) # testloaders.append(testloader) # net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # client = CustomClient(net, trainloader, testloader, client_id=i+1) # clients.append(client) # if st.button("Start Training"): # def client_fn(cid): # return clients[int(cid)] # def weighted_average(metrics): # accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] # losses = [num_examples * m["loss"] for num_examples, m in metrics] # examples = [num_examples for num_examples, _ in metrics] # return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)} # strategy = fl.server.strategy.FedAvg( # fraction_fit=1.0, # fraction_evaluate=1.0, # evaluate_metrics_aggregation_fn=weighted_average, # ) # for round_num in range(NUM_ROUNDS): # st.write(f"### Round {round_num + 1}") # plot_placeholders = [st.empty() for _ in range(NUM_CLIENTS)] # fl.common.logger.configure(identifier="myFlowerExperiment", filename="./log.txt") # fl.simulation.start_simulation( # client_fn=client_fn, # num_clients=NUM_CLIENTS, # config=fl.server.ServerConfig(num_rounds=1), # strategy=strategy, # client_resources={"num_cpus": 1, "num_gpus": 0}, # ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0} # ) # for i, client in enumerate(clients): # st.markdown("LOGS : "+ read_log_file()) # client.plot_metrics(round_num + 1, plot_placeholders[i]) # st.write(" ") # st.success("Training completed successfully!") # # Display final metrics # st.write("## Final Client Metrics") # for client in clients: # st.write(f"### Client {client.client_id}") # st.write(f"Final Loss: {client.losses[-1]:.4f}") # st.write(f"Final Accuracy: {client.accuracies[-1]:.4f}") # client.plot_metrics(NUM_ROUNDS, st.empty()) # st.write(" ") # else: # st.write("Click the 'Start Training' button to start the training process.") # if __name__ == "__main__": # main() # %%writefile app.py import streamlit as st import matplotlib.pyplot as plt import torch from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW from datasets import load_dataset, Dataset from evaluate import load as load_metric from torch.utils.data import DataLoader import pandas as pd import random from collections import OrderedDict import flwr as fl from logging import INFO, DEBUG from flwr.common.logger import log DEVICE = torch.device("cpu") def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): raw_datasets = load_dataset(dataset_name) raw_datasets = raw_datasets.shuffle(seed=42) del raw_datasets["unsupervised"] tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") def tokenize_function(examples): return tokenizer(examples["text"], truncation=True) tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) tokenized_datasets = tokenized_datasets.remove_columns("text") tokenized_datasets = tokenized_datasets.rename_column("label", "labels") train_datasets = [] test_datasets = [] for _ in range(num_clients): train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) train_datasets.append(train_dataset) test_datasets.append(test_dataset) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) return train_datasets, test_datasets, data_collator, raw_datasets def train(net, trainloader, epochs): optimizer = AdamW(net.parameters(), lr=5e-5) net.train() for _ in range(epochs): for batch in trainloader: batch = {k: v.to(DEVICE) for k, v in batch.items()} outputs = net(**batch) loss = outputs.loss loss.backward() optimizer.step() optimizer.zero_grad() def test(net, testloader