# %%writefile app.py
import streamlit as st
import matplotlib.pyplot as plt
import torch
from torch.optim import AdamW  # newer transformers versions no longer export AdamW
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
from datasets import load_dataset, Dataset
from evaluate import load as load_metric
from torch.utils.data import DataLoader
import pandas as pd
import random
from collections import OrderedDict
import flwr as fl
from logging import INFO, DEBUG
from flwr.common.logger import log
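# Run on CPU so the demo also works on machines without a GPU.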
DEVICE = torch.device("cpu")
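# Each simulated client gets a small random train/test sample so the whole
# federated run stays fast on CPU.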
def load_data(dataset_name, train_size=20, test_size=20, num_clients=2):
    raw_datasets = load_dataset(dataset_name)
    raw_datasets = raw_datasets.shuffle(seed=42)
    # Only some datasets (e.g. imdb) ship an "unsupervised" split; drop it when present.
    if "unsupervised" in raw_datasets:
        del raw_datasets["unsupervised"]
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    # amazon_polarity stores its text under "content" rather than "text".
    text_column = "text" if "text" in raw_datasets["train"].column_names else "content"
    def tokenize_function(examples):
        return tokenizer(examples[text_column], truncation=True)
    # Drop every raw column except the label so the collator only sees tensor fields.
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        remove_columns=[c for c in raw_datasets["train"].column_names if c != "label"],
    )
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
    train_datasets = []
    test_datasets = []
    for _ in range(num_clients):
        train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size))
        test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size))
        train_datasets.append(train_dataset)
        test_datasets.append(test_dataset)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    return train_datasets, test_datasets, data_collator, raw_datasets
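# Local training loop: one client fine-tunes the model on its own shard.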
def train(net, trainloader, epochs):
    optimizer = AdamW(net.parameters(), lr=5e-5)
    net.train()
    for _ in range(epochs):
        for batch in trainloader:
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = net(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
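# Evaluation: mean loss and accuracy over the client's held-out shard.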
def test(net, testloader):
    metric = load_metric("accuracy")
    net.eval()
    loss = 0
    for batch in testloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = net(**batch)
        logits = outputs.logits
        loss += outputs.loss.item()
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    loss /= len(testloader)
    accuracy = metric.compute()["accuracy"]
    return loss, accuracy
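def read_log_file():
    with open("./log.txt", "r") as file:
        return file.read()

# Flower client wrapping one model / dataloader pair; it records per-round
# loss and accuracy so Streamlit can plot the training history.
class CustomClient(fl.client.NumPyClient):
    def __init__(self, net, trainloader, testloader, client_id):
        self.net = net
        self.trainloader = trainloader
        self.testloader = testloader
        self.client_id = client_id
        self.losses = []
        self.accuracies = []

    def get_parameters(self, config):
        return [val.cpu().numpy() for _, val in self.net.state_dict().items()]

    def set_parameters(self, parameters):
        params_dict = zip(self.net.state_dict().keys(), parameters)
        state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
        self.net.load_state_dict(state_dict, strict=True)

    def fit(self, parameters, config):
        self.set_parameters(parameters)
        train(self.net, self.trainloader, epochs=1)
        loss, accuracy = test(self.net, self.testloader)
        self.losses.append(loss)
        self.accuracies.append(accuracy)
        return self.get_parameters(config={}), len(self.trainloader.dataset), {}

    def evaluate(self, parameters, config):
        self.set_parameters(parameters)
        loss, accuracy = test(self.net, self.testloader)
        # Report loss in the metrics dict as well so weighted_average can aggregate it.
        return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy), "loss": float(loss)}

    def plot_metrics(self, round_num, plot_placeholder):
        if self.losses and self.accuracies:
            plot_placeholder.write(f"#### Client {self.client_id} Metrics for Round {round_num}")
            plot_placeholder.write(f"Loss: {self.losses[-1]:.4f}")
            plot_placeholder.write(f"Accuracy: {self.accuracies[-1]:.4f}")
            fig, ax1 = plt.subplots()
            color = 'tab:red'
            ax1.set_xlabel('Round')
            ax1.set_ylabel('Loss', color=color)
            ax1.plot(range(1, len(self.losses) + 1), self.losses, color=color)
            ax1.tick_params(axis='y', labelcolor=color)
            ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
            color = 'tab:blue'
            ax2.set_ylabel('Accuracy', color=color)
            ax2.plot(range(1, len(self.accuracies) + 1), self.accuracies, color=color)
            ax2.tick_params(axis='y', labelcolor=color)
            fig.tight_layout()
            plot_placeholder.pyplot(fig)

def main():
    st.write("## Federated Learning with Dynamic Models and Datasets for Mobile Devices")
    dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"])
    model_name = st.selectbox("Model", ["bert-base-uncased", "facebook/hubert-base-ls960", "distilbert-base-uncased"])
    NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2)
    NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3)
    train_datasets, test_datasets, data_collator, raw_datasets = load_data(dataset_name, num_clients=NUM_CLIENTS)
    # Infer the label count from the dataset (imdb/amazon_polarity: 2, ag_news: 4).
    num_labels = raw_datasets["train"].features["label"].num_classes
    trainloaders = []
    testloaders = []
    clients = []
    for i in range(NUM_CLIENTS):
        st.write(f"### Client {i+1} Datasets")
        train_df = pd.DataFrame(train_datasets[i])
        test_df = pd.DataFrame(test_datasets[i])
        st.write("#### Train Dataset")
        edited_train_df = st.data_editor(train_df, key=f"train_{i}")
        st.write("#### Test Dataset")
        edited_test_df = st.data_editor(test_df, key=f"test_{i}")
        edited_train_dataset = Dataset.from_pandas(edited_train_df)
        edited_test_dataset = Dataset.from_pandas(edited_test_df)
        trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator)
        testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator)
        trainloaders.append(trainloader)
        testloaders.append(testloader)
        net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(DEVICE)
        client = CustomClient(net, trainloader, testloader, client_id=i+1)
        clients.append(client)
    if st.button("Start Training"):
        def client_fn(cid):
            return clients[int(cid)]

        def weighted_average(metrics):
            accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics]
            losses = [num_examples * m["loss"] for num_examples, m in metrics]
            examples = [num_examples for num_examples, _ in metrics]
            return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)}

        strategy = fl.server.strategy.FedAvg(
            fraction_fit=1.0,
            fraction_evaluate=1.0,
            evaluate_metrics_aggregation_fn=weighted_average,
        )
        for round_num in range(NUM_ROUNDS):
            st.write(f"### Round {round_num + 1}")
            plot_placeholders = [st.empty() for _ in range(NUM_CLIENTS)]
            fl.common.logger.configure(identifier="myFlowerExperiment", filename="./log.txt")
            # Run one Flower round per loop iteration so the UI can refresh between rounds.
            fl.simulation.start_simulation(
                client_fn=client_fn,
                num_clients=NUM_CLIENTS,
                config=fl.server.ServerConfig(num_rounds=1),
                strategy=strategy,
                client_resources={"num_cpus": 1, "num_gpus": 0},
                ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0}
            )
            for i, client in enumerate(clients):
                st.markdown("LOGS : " + read_log_file())
                client.plot_metrics(round_num + 1, plot_placeholders[i])
                st.write(" ")
        st.success("Training completed successfully!")

        # Display final metrics
        st.write("## Final Client Metrics")
        for client in clients:
            st.write(f"### Client {client.client_id}")
            st.write(f"Final Loss: {client.losses[-1]:.4f}")
            st.write(f"Final Accuracy: {client.accuracies[-1]:.4f}")
            client.plot_metrics(NUM_ROUNDS, st.empty())
            st.write(" ")
    else:
        st.write("Click the 'Start Training' button to start the training process.")

if __name__ == "__main__":
    main()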