# %%writefile app.py import streamlit as st import matplotlib.pyplot as plt import torch from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW from datasets import load_dataset, Dataset from evaluate import load as load_metric from torch.utils.data import DataLoader import pandas as pd import random from collections import OrderedDict import flwr as fl DEVICE = torch.device("cpu") def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): raw_datasets = load_dataset(dataset_name) raw_datasets = raw_datasets.shuffle(seed=42) del raw_datasets["unsupervised"] tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") def tokenize_function(examples): return tokenizer(examples["text"], truncation=True) tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) tokenized_datasets = tokenized_datasets.remove_columns("text") tokenized_datasets = tokenized_datasets.rename_column("label", "labels") train_datasets = [] test_datasets = [] for _ in range(num_clients): train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) train_datasets.append(train_dataset) test_datasets.append(test_dataset) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) return train_datasets, test_datasets, data_collator def read_log_file(): with open("./log.txt", "r") as file: return file.read() def train(net, trainloader, epochs): optimizer = AdamW(net.parameters(), lr=5e-5) net.train() for _ in range(epochs): for batch in trainloader: batch = {k: v.to(DEVICE) for k, v in batch.items()} outputs = net(**batch) loss = outputs.loss loss.backward() optimizer.step() optimizer.zero_grad() def test(net, testloader): metric = load_metric("accuracy") net.eval() loss = 0 for batch in testloader: batch = {k: v.to(DEVICE) for k, v in batch.items()} with torch.no_grad(): outputs = net(**batch) logits = outputs.logits loss += outputs.loss.item() predictions = torch.argmax(logits, dim=-1) metric.add_batch(predictions=predictions, references=batch["labels"]) loss /= len(testloader) accuracy = metric.compute()["accuracy"] return loss, accuracy class CustomClient(fl.client.NumPyClient): def __init__(self, net, trainloader, testloader, client_id): self.net = net self.trainloader = trainloader self.testloader = testloader self.client_id = client_id self.losses = [] self.accuracies = [] def get_parameters(self, config): return [val.cpu().numpy() for _, val in self.net.state_dict().items()] def set_parameters(self, parameters): params_dict = zip(self.net.state_dict().keys(), parameters) state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict}) self.net.load_state_dict(state_dict, strict=True) def fit(self, parameters, config): self.set_parameters(parameters) train(self.net, self.trainloader, epochs=1) loss, accuracy = test(self.net, self.testloader) self.losses.append(loss) self.accuracies.append(accuracy) return self.get_parameters(config={}), len(self.trainloader.dataset), {} def evaluate(self, parameters, config): self.set_parameters(parameters) loss, accuracy = test(self.net, self.testloader) return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy)} def plot_metrics(self, round_num, plot_placeholder): if self.losses and self.accuracies: plot_placeholder.write(f"#### Client {self.client_id} Metrics for Round {round_num}") plot_placeholder.write(f"Loss: {self.losses[-1]:.4f}") plot_placeholder.write(f"Accuracy: {self.accuracies[-1]:.4f}") fig, ax1 = plt.subplots() color = 'tab:red' ax1.set_xlabel('Round') ax1.set_ylabel('Loss', color=color) ax1.plot(range(1, len(self.losses) + 1), self.losses, color=color) ax1.tick_params(axis='y', labelcolor=color) ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis color = 'tab:blue' ax2.set_ylabel('Accuracy', color=color) ax2.plot(range(1, len(self.accuracies) + 1), self.accuracies, color=color) ax2.tick_params(axis='y', labelcolor=color) fig.tight_layout() plot_placeholder.pyplot(fig) def main(): st.write("## Federated Learning with Dynamic Models and Datasets for Mobile Devices") dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"]) model_name = st.selectbox("Model", ["bert-base-uncased","facebook/hubert-base-ls960", "distilbert-base-uncased"]) NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) train_datasets, test_datasets, data_collator = load_data(dataset_name, num_clients=NUM_CLIENTS) trainloaders = [] testloaders = [] clients = [] for i in range(NUM_CLIENTS): st.write(f"### Client {i+1} Datasets") train_df = pd.DataFrame(train_datasets[i]) test_df = pd.DataFrame(test_datasets[i]) st.write("#### Train Dataset") edited_train_df = st.data_editor(train_df, key=f"train_{i}") st.write("#### Test Dataset") edited_test_df = st.data_editor(test_df, key=f"test_{i}") edited_train_dataset = Dataset.from_pandas(edited_train_df) edited_test_dataset = Dataset.from_pandas(edited_test_df) trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator) testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator) trainloaders.append(trainloader) testloaders.append(testloader) net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) client = CustomClient(net, trainloader, testloader, client_id=i+1) clients.append(client) if st.button("Start Training"): def client_fn(cid): return clients[int(cid)] def weighted_average(metrics): accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] losses = [num_examples * m["loss"] for num_examples, m in metrics] examples = [num_examples for num_examples, _ in metrics] return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)} strategy = fl.server.strategy.FedAvg( fraction_fit=1.0, fraction_evaluate=1.0, evaluate_metrics_aggregation_fn=weighted_average, ) for round_num in range(NUM_ROUNDS): st.write(f"### Round {round_num + 1}") plot_placeholders = [st.empty() for _ in range(NUM_CLIENTS)] fl.common.logger.configure(identifier="myFlowerExperiment", filename="./log.txt") fl.simulation.start_simulation( client_fn=client_fn, num_clients=NUM_CLIENTS, config=fl.server.ServerConfig(num_rounds=1), strategy=strategy, client_resources={"num_cpus": 1, "num_gpus": 0}, ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0} ) for i, client in enumerate(clients): st.markdown("LOGS : "+ read_log_file()) client.plot_metrics(round_num + 1, plot_placeholders[i]) st.write(" ") st.success("Training completed successfully!") # Display final metrics st.write("## Final Client Metrics") for client in clients: st.write(f"### Client {client.client_id}") st.write(f"Final Loss: {client.losses[-1]:.4f}") st.write(f"Final Accuracy: {client.accuracies[-1]:.4f}") client.plot_metrics(NUM_ROUNDS, st.empty()) st.write(" ") else: st.write("Click the 'Start Training' button to start the training process.") if __name__ == "__main__": main() # ############# # # %%writefile app.py # import streamlit as st # import matplotlib.pyplot as plt # import torch # from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW # from datasets import load_dataset, Dataset # from evaluate import load as load_metric # from torch.utils.data import DataLoader # import pandas as pd # import random # from collections import OrderedDict # import flwr as fl # from logging import INFO, DEBUG # from flwr.common.logger import log # DEVICE = torch.device("cpu") # def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): # raw_datasets = load_dataset(dataset_name) # raw_datasets = raw_datasets.shuffle(seed=42) # del raw_datasets["unsupervised"] # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # def tokenize_function(examples): # return tokenizer(examples["text"], truncation=True) # tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # tokenized_datasets = tokenized_datasets.remove_columns("text") # tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # train_datasets = [] # test_datasets = [] # for _ in range(num_clients): # train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) # test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) # train_datasets.append(train_dataset) # test_datasets.append(test_dataset) # data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # return train_datasets, test_datasets, data_collator # def train(net, trainloader, epochs): # optimizer = AdamW(net.parameters(), lr=5e-5) # net.train() # for _ in range(epochs): # for batch in trainloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # outputs = net(**batch) # loss = outputs.loss # loss.backward() # optimizer.step() # optimizer.zero_grad() # def test(net, testloader): # metric = load_metric("accuracy") # net.eval() # loss = 0 # for batch in testloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # with torch.no_grad(): # outputs = net(**batch) # logits = outputs.logits # loss += outputs.loss.item() # predictions = torch.argmax(logits, dim=-1) # metric.add_batch(predictions=predictions, references=batch["labels"]) # loss /= len(testloader) # accuracy = metric.compute()["accuracy"] # return loss, accuracy # class CustomClient(fl.client.NumPyClient): # def __init__(self, net, trainloader, testloader, client_id): # self.net = net # self.trainloader = trainloader # self.testloader = testloader # self.client_id = client_id # self.losses = [] # self.accuracies = [] # def get_parameters(self, config): # return [val.cpu().numpy() for _, val in self.net.state_dict().items()] # def set_parameters(self, parameters): # params_dict = zip(self.net.state_dict().keys(), parameters) # state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict}) # self.net.load_state_dict(state_dict, strict=True) # def fit(self, parameters, config): # log(INFO, f"Client {self.client_id} is starting fit()") # self.set_parameters(parameters) # train(self.net, self.trainloader, epochs=1) # loss, accuracy = test(self.net, self.testloader) # self.losses.append(loss) # self.accuracies.append(accuracy) # log(INFO, f"Client {self.client_id} finished fit() with loss: {loss:.4f} and accuracy: {accuracy:.4f}") # return self.get_parameters(config={}), len(self.trainloader.dataset), {} # def evaluate(self, parameters, config): # log(INFO, f"Client {self.client_id} is starting evaluate()") # self.set_parameters(parameters) # loss, accuracy = test(self.net, self.testloader) # log(INFO, f"Client {self.client_id} finished evaluate() with loss: {loss:.4f} and accuracy: {accuracy:.4f}") # return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy)} # def plot_metrics(self, round_num, plot_placeholder): # if self.losses and self.accuracies: # plot_placeholder.write(f"#### Client {self.client_id} Metrics for Round {round_num}") # plot_placeholder.write(f"Loss: {self.losses[-1]:.4f}") # plot_placeholder.write(f"Accuracy: {self.accuracies[-1]:.4f}") # fig, ax1 = plt.subplots() # color = 'tab:red' # ax1.set_xlabel('Round') # ax1.set_ylabel('Loss', color=color) # ax1.plot(range(1, len(self.losses) + 1), self.losses, color=color) # ax1.tick_params(axis='y', labelcolor=color) # ax2 = ax1.twinx() # instantiate a second axes that shares the same x-axis # color = 'tab:blue' # ax2.set_ylabel('Accuracy', color=color) # ax2.plot(range(1, len(self.accuracies) + 1), self.accuracies, color=color) # ax2.tick_params(axis='y', labelcolor=color) # fig.tight_layout() # plot_placeholder.pyplot(fig) # def read_log_file(): # with open("log.txt", "r") as file: # return file.read() # def main(): # st.write("## Federated Learning with Dynamic Models and Datasets for Mobile Devices") # dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"]) # model_name = st.selectbox("Model", ["bert-base-uncased", "facebook/hubert-base-ls960", "distilbert-base-uncased"]) # NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) # NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) # train_datasets, test_datasets, data_collator = load_data(dataset_name, num_clients=NUM_CLIENTS) # trainloaders = [] # testloaders = [] # clients = [] # for i in range(NUM_CLIENTS): # st.write(f"### Client {i+1} Datasets") # train_df = pd.DataFrame(train_datasets[i]) # test_df = pd.DataFrame(test_datasets[i]) # st.write("#### Train Dataset") # edited_train_df = st.data_editor(train_df, key=f"train_{i}") # st.write("#### Test Dataset") # edited_test_df = st.data_editor(test_df, key=f"test_{i}") # edited_train_dataset = Dataset.from_pandas(edited_train_df) # edited_test_dataset = Dataset.from_pandas(edited_test_df) # trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator) # testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator) # trainloaders.append(trainloader) # testloaders.append(testloader) # net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # client = CustomClient(net, trainloader, testloader, client_id=i+1) # clients.append(client) # if st.button("Start Training"): # def client_fn(cid): # return clients[int(cid)] # def weighted_average(metrics): # accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] # losses = [num_examples * m["loss"] for num_examples, m in metrics] # examples = [num_examples for num_examples, _ in metrics] # return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)} # strategy = fl.server.strategy.FedAvg( # fraction_fit=1.0, # fraction_evaluate=1.0, # evaluate_metrics_aggregation_fn=weighted_average, # ) # for round_num in range(NUM_ROUNDS): # st.write(f"### Round {round_num + 1}") # plot_placeholders = [st.empty() for _ in range(NUM_CLIENTS)] # fl.simulation.start_simulation( # client_fn=client_fn, # num_clients=NUM_CLIENTS, # config=fl.server.ServerConfig(num_rounds=1), # strategy=strategy, # client_resources={"num_cpus": 1, "num_gpus": 0}, # ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0} # ) # for i, client in enumerate(clients): # client.plot_metrics(round_num + 1, plot_placeholders[i]) # st.write(" ") # st.success("Training completed successfully!") # # Display final metrics # st.write("## Final Client Metrics") # for client in clients: # st.write(f"### Client {client.client_id}") # st.write(f"Final Loss: {client.losses[-1]:.4f}") # st.write(f"Final Accuracy: {client.accuracies[-1]:.4f}") # client.plot_metrics(NUM_ROUNDS, st.empty()) # st.write(" ") # # Display log.txt content # st.write("## Training Log") # st.text(read_log_file()) # else: # st.write("Click the 'Start Training' button to start the training process.") # if __name__ == "__main__": # main() ############# # # %%writefile app.py # import streamlit as st # import matplotlib.pyplot as plt # import torch # from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW # from datasets import load_dataset, Dataset # from evaluate import load as load_metric # from torch.utils.data import DataLoader # import pandas as pd # import random # import warnings # from collections import OrderedDict # import flwr as fl # DEVICE = torch.device("cpu") # def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): # raw_datasets = load_dataset(dataset_name) # raw_datasets = raw_datasets.shuffle(seed=42) # del raw_datasets["unsupervised"] # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # def tokenize_function(examples): # return tokenizer(examples["text"], truncation=True) # tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # tokenized_datasets = tokenized_datasets.remove_columns("text") # tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # train_datasets = [] # test_datasets = [] # for _ in range(num_clients): # train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) # test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) # train_datasets.append(train_dataset) # test_datasets.append(test_dataset) # data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # return train_datasets, test_datasets, data_collator # def train(net, trainloader, epochs): # optimizer = AdamW(net.parameters(), lr=5e-5) # net.train() # for _ in range(epochs): # for batch in trainloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # outputs = net(**batch) # loss = outputs.loss # loss.backward() # optimizer.step() # optimizer.zero_grad() # def test(net, testloader): # metric = load_metric("accuracy") # net.eval() # loss = 0 # for batch in testloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # with torch.no_grad(): # outputs = net(**batch) # logits = outputs.logits # loss += outputs.loss.item() # predictions = torch.argmax(logits, dim=-1) # metric.add_batch(predictions=predictions, references=batch["labels"]) # loss /= len(testloader) # accuracy = metric.compute()["accuracy"] # return loss, accuracy # class CustomClient(fl.client.NumPyClient): # def __init__(self, net, trainloader, testloader, client_id): # self.net = net # self.trainloader = trainloader # self.testloader = testloader # self.client_id = client_id # self.losses = [] # self.accuracies = [] # def get_parameters(self, config): # return [val.cpu().numpy() for _, val in self.net.state_dict().items()] # def set_parameters(self, parameters): # params_dict = zip(self.net.state_dict().keys(), parameters) # state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict}) # self.net.load_state_dict(state_dict, strict=True) # def fit(self, parameters, config): # self.set_parameters(parameters) # train(self.net, self.trainloader, epochs=1) # loss, accuracy = test(self.net, self.testloader) # self.losses.append(loss) # self.accuracies.append(accuracy) # return self.get_parameters(config={}), len(self.trainloader.dataset), {} # def evaluate(self, parameters, config): # self.set_parameters(parameters) # loss, accuracy = test(self.net, self.testloader) # return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy)} # def plot_metrics(self, round_num): # if self.losses and self.accuracies: # st.write(f"#### Client {self.client_id} Metrics for Round {round_num}") # st.write(f"Loss: {self.losses[-1]:.4f}") # st.write(f"Accuracy: {self.accuracies[-1]:.4f}") # fig, ax1 = plt.subplots() # ax2 = ax1.twinx() # ax1.plot(range(1, len(self.losses) + 1), self.losses, 'g-') # ax2.plot(range(1, len(self.accuracies) + 1), self.accuracies, 'b-') # ax1.set_xlabel('Round') # ax1.set_ylabel('Loss', color='g') # ax2.set_ylabel('Accuracy', color='b') # plt.title(f'Client {self.client_id} Metrics') # st.pyplot(fig) # def main(): # st.write("## Federated Learning with Dynamic Models and Datasets for Mobile Devices") # dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"]) # model_name = st.selectbox("Model", ["bert-base-uncased", "distilbert-base-uncased"]) # NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) # NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) # train_datasets, test_datasets, data_collator = load_data(dataset_name, num_clients=NUM_CLIENTS) # trainloaders = [] # testloaders = [] # clients = [] # for i in range(NUM_CLIENTS): # st.write(f"### Client {i+1} Datasets") # train_df = pd.DataFrame(train_datasets[i]) # test_df = pd.DataFrame(test_datasets[i]) # st.write("#### Train Dataset") # edited_train_df = st.experimental_data_editor(train_df, key=f"train_{i}") # st.write("#### Test Dataset") # edited_test_df = st.experimental_data_editor(test_df, key=f"test_{i}") # edited_train_dataset = Dataset.from_pandas(edited_train_df) # edited_test_dataset = Dataset.from_pandas(edited_test_df) # trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator) # testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator) # trainloaders.append(trainloader) # testloaders.append(testloader) # net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # client = CustomClient(net, trainloader, testloader, client_id=i+1) # clients.append(client) # if st.button("Start Training"): # def client_fn(cid): # return clients[int(cid)] # def weighted_average(metrics): # accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] # losses = [num_examples * m["loss"] for num_examples, m in metrics] # examples = [num_examples for num_examples, _ in metrics] # return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)} # strategy = fl.server.strategy.FedAvg( # fraction_fit=1.0, # fraction_evaluate=1.0, # evaluate_metrics_aggregation_fn=weighted_average, # ) # for round_num in range(NUM_ROUNDS): # st.write(f"### Round {round_num + 1}") # fl.simulation.start_simulation( # client_fn=client_fn, # num_clients=NUM_CLIENTS, # config=fl.server.ServerConfig(num_rounds=1), # strategy=strategy, # client_resources={"num_cpus": 1, "num_gpus": 0}, # ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0} # ) # for client in clients: # client.plot_metrics(round_num + 1) # st.write(" ") # st.success(f"Training completed successfully!") # # Display final metrics # st.write("## Final Client Metrics") # for client in clients: # st.write(f"### Client {client.client_id}") # st.write(f"Final Loss: {client.losses[-1]:.4f}") # st.write(f"Final Accuracy: {client.accuracies[-1]:.4f}") # client.plot_metrics(NUM_ROUNDS) # st.write(" ") # else: # st.write("Click the 'Start Training' button to start the training process.") # if __name__ == "__main__": # main() # # %%writefile app.py # import streamlit as st # import matplotlib.pyplot as plt # import torch # from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW # from datasets import load_dataset, Dataset # from evaluate import load as load_metric # from torch.utils.data import DataLoader # import pandas as pd # import random # import warnings # from collections import OrderedDict # import flwr as fl # DEVICE = torch.device("cpu") # def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): # raw_datasets = load_dataset(dataset_name) # raw_datasets = raw_datasets.shuffle(seed=42) # del raw_datasets["unsupervised"] # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # def tokenize_function(examples): # return tokenizer(examples["text"], truncation=True) # tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # tokenized_datasets = tokenized_datasets.remove_columns("text") # tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # train_datasets = [] # test_datasets = [] # for _ in range(num_clients): # train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) # test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) # train_datasets.append(train_dataset) # test_datasets.append(test_dataset) # data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # return train_datasets, test_datasets, data_collator # def train(net, trainloader, epochs): # optimizer = AdamW(net.parameters(), lr=5e-5) # net.train() # for _ in range(epochs): # for batch in trainloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # outputs = net(**batch) # loss = outputs.loss # loss.backward() # optimizer.step() # optimizer.zero_grad() # def test(net, testloader): # metric = load_metric("accuracy") # net.eval() # loss = 0 # for batch in testloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # with torch.no_grad(): # outputs = net(**batch) # logits = outputs.logits # loss += outputs.loss.item() # predictions = torch.argmax(logits, dim=-1) # metric.add_batch(predictions=predictions, references=batch["labels"]) # loss /= len(testloader) # accuracy = metric.compute()["accuracy"] # return loss, accuracy # class CustomClient(fl.client.NumPyClient): # def __init__(self, net, trainloader, testloader, client_id): # self.net = net # self.trainloader = trainloader # self.testloader = testloader # self.client_id = client_id # self.losses = [] # self.accuracies = [] # def get_parameters(self, config): # return [val.cpu().numpy() for _, val in self.net.state_dict().items()] # def set_parameters(self, parameters): # params_dict = zip(self.net.state_dict().keys(), parameters) # state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict}) # self.net.load_state_dict(state_dict, strict=True) # def fit(self, parameters, config): # self.set_parameters(parameters) # train(self.net, self.trainloader, epochs=1) # loss, accuracy = test(self.net, self.testloader) # self.losses.append(loss) # self.accuracies.append(accuracy) # return self.get_parameters(config={}), len(self.trainloader.dataset), {} # def evaluate(self, parameters, config): # self.set_parameters(parameters) # loss, accuracy = test(self.net, self.testloader) # return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy)} # def plot_metrics(self): # fig, ax1 = plt.subplots() # ax2 = ax1.twinx() # ax1.plot(range(1, len(self.losses) + 1), self.losses, 'g-') # ax2.plot(range(1, len(self.accuracies) + 1), self.accuracies, 'b-') # ax1.set_xlabel('Round') # ax1.set_ylabel('Loss', color='g') # ax2.set_ylabel('Accuracy', color='b') # plt.title(f'Client {self.client_id} Metrics') # st.pyplot(fig) # def main(): # st.write("## Federated Learning with Dynamic Models and Datasets for Mobile Devices") # dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"]) # model_name = st.selectbox("Model", ["bert-base-uncased", "distilbert-base-uncased"]) # NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) # NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) # train_datasets, test_datasets, data_collator = load_data(dataset_name, num_clients=NUM_CLIENTS) # trainloaders = [] # testloaders = [] # clients = [] # for i in range(NUM_CLIENTS): # st.write(f"### Client {i+1} Datasets") # train_df = pd.DataFrame(train_datasets[i]) # test_df = pd.DataFrame(test_datasets[i]) # st.write("#### Train Dataset") # edited_train_df = st.experimental_data_editor(train_df, key=f"train_{i}") # st.write("#### Test Dataset") # edited_test_df = st.experimental_data_editor(test_df, key=f"test_{i}") # edited_train_dataset = Dataset.from_pandas(edited_train_df) # edited_test_dataset = Dataset.from_pandas(edited_test_df) # trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator) # testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator) # trainloaders.append(trainloader) # testloaders.append(testloader) # net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # client = CustomClient(net, trainloader, testloader, client_id=i+1) # clients.append(client) # if st.button("Start Training"): # def client_fn(cid): # return clients[int(cid)] # def weighted_average(metrics): # accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] # losses = [num_examples * m["loss"] for num_examples, m in metrics] # examples = [num_examples for num_examples, _ in metrics] # return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)} # strategy = fl.server.strategy.FedAvg( # fraction_fit=1.0, # fraction_evaluate=1.0, # evaluate_metrics_aggregation_fn=weighted_average, # ) # fl.simulation.start_simulation( # client_fn=client_fn, # num_clients=NUM_CLIENTS, # config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), # strategy=strategy, # client_resources={"num_cpus": 1, "num_gpus": 0}, # ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0} # ) # st.success(f"Training completed successfully!") # for client in clients: # st.write(f"### Client {client.client_id} Model Metrics") # client.plot_metrics() # else: # st.write("Click the 'Start Training' button to start the training process.") # if __name__ == "__main__": # main() # 05/2024 # %%writefile app.py # import streamlit as st # import matplotlib.pyplot as plt # import torch # from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW # from datasets import load_dataset, Dataset # from evaluate import load as load_metric # from torch.utils.data import DataLoader # import pandas as pd # import random # import warnings # from collections import OrderedDict # import flwr as fl # DEVICE = torch.device("cpu") # def load_data(dataset_name, train_size=20, test_size=20, num_clients=2): # raw_datasets = load_dataset(dataset_name) # raw_datasets = raw_datasets.shuffle(seed=42) # del raw_datasets["unsupervised"] # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # def tokenize_function(examples): # return tokenizer(examples["text"], truncation=True) # tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # tokenized_datasets = tokenized_datasets.remove_columns("text") # tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # train_datasets = [] # test_datasets = [] # for _ in range(num_clients): # train_dataset = tokenized_datasets["train"].select(random.sample(range(len(tokenized_datasets["train"])), train_size)) # test_dataset = tokenized_datasets["test"].select(random.sample(range(len(tokenized_datasets["test"])), test_size)) # train_datasets.append(train_dataset) # test_datasets.append(test_dataset) # data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # return train_datasets, test_datasets, data_collator # def train(net, trainloader, epochs): # optimizer = AdamW(net.parameters(), lr=5e-5) # net.train() # for _ in range(epochs): # for batch in trainloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # outputs = net(**batch) # loss = outputs.loss # loss.backward() # optimizer.step() # optimizer.zero_grad() # def test(net, testloader): # metric = load_metric("accuracy") # net.eval() # loss = 0 # for batch in testloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # with torch.no_grad(): # outputs = net(**batch) # logits = outputs.logits # loss += outputs.loss.item() # predictions = torch.argmax(logits, dim=-1) # metric.add_batch(predictions=predictions, references=batch["labels"]) # loss /= len(testloader) # accuracy = metric.compute()["accuracy"] # return loss, accuracy # class CustomClient(fl.client.NumPyClient): # def __init__(self, net, trainloader, testloader): # self.net = net # self.trainloader = trainloader # self.testloader = testloader # def get_parameters(self, config): # return [val.cpu().numpy() for _, val in self.net.state_dict().items()] # def set_parameters(self, parameters): # params_dict = zip(self.net.state_dict().keys(), parameters) # state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict}) # self.net.load_state_dict(state_dict, strict=True) # def fit(self, parameters, config): # self.set_parameters(parameters) # train(self.net, self.trainloader, epochs=1) # return self.get_parameters(config={}), len(self.trainloader.dataset), {} # def evaluate(self, parameters, config): # self.set_parameters(parameters) # loss, accuracy = test(self.net, self.testloader) # return float(loss), len(self.testloader.dataset), {"accuracy": float(accuracy)} # def main(): # st.write("## Federated Learning with Flower and Dynamic Models and Datasets for Mobile Devices") # dataset_name = st.selectbox("Dataset", ["imdb", "amazon_polarity", "ag_news"]) # model_name = st.selectbox("Model", ["bert-base-uncased", "distilbert-base-uncased"]) # net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) # NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) # train_datasets, test_datasets, data_collator = load_data(dataset_name, num_clients=NUM_CLIENTS) # trainloaders = [] # testloaders = [] # for i in range(NUM_CLIENTS): # st.write(f"### Client {i+1} Datasets") # train_df = pd.DataFrame(train_datasets[i]) # test_df = pd.DataFrame(test_datasets[i]) # st.write("#### Train Dataset") # edited_train_df = st.experimental_data_editor(train_df, key=f"train_{i}") # st.write("#### Test Dataset") # edited_test_df = st.experimental_data_editor(test_df, key=f"test_{i}") # edited_train_dataset = Dataset.from_pandas(edited_train_df) # edited_test_dataset = Dataset.from_pandas(edited_test_df) # trainloader = DataLoader(edited_train_dataset, shuffle=True, batch_size=32, collate_fn=data_collator) # testloader = DataLoader(edited_test_dataset, batch_size=32, collate_fn=data_collator) # trainloaders.append(trainloader) # testloaders.append(testloader) # if st.button("Start Training"): # round_losses = [] # round_accuracies = [] # clients = [CustomClient(net, trainloaders[i], testloaders[i]) for i in range(NUM_CLIENTS)] # def client_fn(cid): # return clients[int(cid)] # def weighted_average(metrics): # accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] # losses = [num_examples * m["loss"] for num_examples, m in metrics] # examples = [num_examples for num_examples, _ in metrics] # return {"accuracy": sum(accuracies) / sum(examples), "loss": sum(losses) / sum(examples)} # strategy = fl.server.strategy.FedAvg( # fraction_fit=1.0, # fraction_evaluate=1.0, # evaluate_metrics_aggregation_fn=weighted_average, # ) # fl.simulation.start_simulation( # client_fn=client_fn, # num_clients=NUM_CLIENTS, # config=fl.server.ServerConfig(num_rounds=NUM_ROUNDS), # strategy=strategy, # client_resources={"num_cpus": 1, "num_gpus": 0}, # ray_init_args={"log_to_driver": False, "num_cpus": 1, "num_gpus": 0} # ) # st.success(f"Training completed successfully!") # else: # st.write("Click the 'Start Training' button to start the training process.") # if __name__ == "__main__": # main() ##ORIGINAL### # # %%writefile app.py # import streamlit as st # import matplotlib.pyplot as plt # import torch # from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW # from datasets import load_dataset # from evaluate import load as load_metric # from torch.utils.data import DataLoader # import random # DEVICE = torch.device("cpu") # NUM_ROUNDS = 3 # def load_data(dataset_name): # raw_datasets = load_dataset(dataset_name) # raw_datasets = raw_datasets.shuffle(seed=42) # del raw_datasets["unsupervised"] # tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") # def tokenize_function(examples): # return tokenizer(examples["text"], truncation=True) # train_population = random.sample(range(len(raw_datasets["train"])), 20) # test_population = random.sample(range(len(raw_datasets["test"])), 20) # tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) # tokenized_datasets["train"] = tokenized_datasets["train"].select(train_population) # tokenized_datasets["test"] = tokenized_datasets["test"].select(test_population) # tokenized_datasets = tokenized_datasets.remove_columns("text") # tokenized_datasets = tokenized_datasets.rename_column("label", "labels") # data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # trainloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=32, collate_fn=data_collator) # testloader = DataLoader(tokenized_datasets["test"], batch_size=32, collate_fn=data_collator) # return trainloader, testloader # def train(net, trainloader, epochs): # optimizer = AdamW(net.parameters(), lr=5e-5) # net.train() # for _ in range(epochs): # for batch in trainloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # outputs = net(**batch) # loss = outputs.loss # loss.backward() # optimizer.step() # optimizer.zero_grad() # def test(net, testloader): # metric = load_metric("accuracy") # loss = 0 # net.eval() # for batch in testloader: # batch = {k: v.to(DEVICE) for k, v in batch.items()} # with torch.no_grad(): # outputs = net(**batch) # logits = outputs.logits # loss += outputs.loss.item() # predictions = torch.argmax(logits, dim=-1) # metric.add_batch(predictions=predictions, references=batch["labels"]) # loss /= len(testloader.dataset) # accuracy = metric.compute()["accuracy"] # return loss, accuracy # from transformers import Wav2Vec2Processor, HubertForSequenceClassification # import torch # def main(): # st.write("## Federated Learning with dynamic models and datasets for mobile devices") # dataset_name = st.selectbox("Dataset", ["imdb","audio_instruction_task", "amazon_polarity", "ag_news"]) # model_name = st.selectbox("Model", ["bert-base-uncased","facebook/hubert-base-ls960", "distilbert-base-uncased"]) # net = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # # processor = Wav2Vec2Processor.from_pretrained(model_name) # # net = HubertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # # feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name) # # net = HubertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # NUM_CLIENTS = st.slider("Number of Clients", min_value=1, max_value=10, value=2) # NUM_ROUNDS = st.slider("Number of Rounds", min_value=1, max_value=10, value=3) # trainloader, testloader = load_data(dataset_name) # if st.button("Start Training"): # round_losses = [] # round_accuracies = [] # Store accuracy values for each round # for round_num in range(1, NUM_ROUNDS + 1): # st.write(f"## Round {round_num}") # st.write("### Training Metrics for Each Client") # for client in range(1, NUM_CLIENTS + 1): # client_loss, client_accuracy = test(net, testloader) # Placeholder for actual client metrics # st.write(f"Client {client}: Loss: {client_loss}, Accuracy: {client_accuracy}") # st.write("### Accuracy Over Rounds") # round_accuracies.append(client_accuracy) # Append the accuracy for this round # plt.plot(range(1, round_num + 1), round_accuracies, marker='o') # Plot accuracy over rounds # plt.xlabel("Round") # plt.ylabel("Accuracy") # plt.title("Accuracy Over Rounds") # st.pyplot() # st.write("### Loss Over Rounds") # loss_value = random.random() # Placeholder for loss values # round_losses.append(loss_value) # rounds = list(range(1, round_num + 1)) # plt.plot(rounds, round_losses) # plt.xlabel("Round") # plt.ylabel("Loss") # plt.title("Loss Over Rounds") # st.pyplot() # st.success(f"Round {round_num} completed successfully!") # else: # st.write("Click the 'Start Training' button to start the training process.") # if __name__ == "__main__": # main() ###ORIGINAL## # ########################TinyLLM#################################### # import torch # import torch.nn as nn # from torch.nn import functional as F # # hyperparameters # batch_size = 64 # how many independent sequences will we process in parallel? # block_size = 256 # what is the maximum context length for predictions? # max_iters = 5000 # eval_interval = 500 # learning_rate = 3e-4 # device = 'cuda' if torch.cuda.is_available() else 'cpu' # eval_iters = 200 # n_embd = 384 # n_head = 6 # n_layer = 6 # dropout = 0.2 # # ------------ # torch.manual_seed(1337) # # wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt # with open('input.txt', 'r', encoding='utf-8') as f: # text = f.read() # # here are all the unique characters that occur in this text # chars = sorted(list(set(text))) # vocab_size = len(chars) # # create a mapping from characters to integers # stoi = { ch:i for i,ch in enumerate(chars) } # itos = { i:ch for i,ch in enumerate(chars) } # encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers # decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string # # Train and test splits # data = torch.tensor(encode(text), dtype=torch.long) # n = int(0.9*len(data)) # first 90% will be train, rest val # train_data = data[:n] # val_data = data[n:] # # data loading # def get_batch(split): # # generate a small batch of data of inputs x and targets y # data = train_data if split == 'train' else val_data # ix = torch.randint(len(data) - block_size, (batch_size,)) # x = torch.stack([data[i:i+block_size] for i in ix]) # y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # x, y = x.to(device), y.to(device) # return x, y # @torch.no_grad() # def estimate_loss(): # out = {} # model.eval() # for split in ['train', 'val']: # losses = torch.zeros(eval_iters) # for k in range(eval_iters): # X, Y = get_batch(split) # logits, loss = model(X, Y) # losses[k] = loss.item() # out[split] = losses.mean() # model.train() # return out # class Head(nn.Module): # """ one head of self-attention """ # def __init__(self, head_size): # super().__init__() # self.key = nn.Linear(n_embd, head_size, bias=False) # self.query = nn.Linear(n_embd, head_size, bias=False) # self.value = nn.Linear(n_embd, head_size, bias=False) # self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) # self.dropout = nn.Dropout(dropout) # def forward(self, x): # # input of size (batch, time-step, channels) # # output of size (batch, time-step, head size) # B,T,C = x.shape # k = self.key(x) # (B,T,hs) # q = self.query(x) # (B,T,hs) # # compute attention scores ("affinities") # wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T) # wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) # wei = F.softmax(wei, dim=-1) # (B, T, T) # wei = self.dropout(wei) # # perform the weighted aggregation of the values # v = self.value(x) # (B,T,hs) # out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs) # return out # class MultiHeadAttention(nn.Module): # """ multiple heads of self-attention in parallel """ # def __init__(self, num_heads, head_size): # super().__init__() # self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) # self.proj = nn.Linear(head_size * num_heads, n_embd) # self.dropout = nn.Dropout(dropout) # def forward(self, x): # out = torch.cat([h(x) for h in self.heads], dim=-1) # out = self.dropout(self.proj(out)) # return out # class FeedFoward(nn.Module): # """ a simple linear layer followed by a non-linearity """ # def __init__(self, n_embd): # super().__init__() # self.net = nn.Sequential( # nn.Linear(n_embd, 4 * n_embd), # nn.ReLU(), # nn.Linear(4 * n_embd, n_embd), # nn.Dropout(dropout), # ) # def forward(self, x): # return self.net(x) # class Block(nn.Module): # """ Transformer block: communication followed by computation """ # def __init__(self, n_embd, n_head): # # n_embd: embedding dimension, n_head: the number of heads we'd like # super().__init__() # head_size = n_embd // n_head # self.sa = MultiHeadAttention(n_head, head_size) # self.ffwd = FeedFoward(n_embd) # self.ln1 = nn.LayerNorm(n_embd) # self.ln2 = nn.LayerNorm(n_embd) # def forward(self, x): # x = x + self.sa(self.ln1(x)) # x = x + self.ffwd(self.ln2(x)) # return x # class GPTLanguageModel(nn.Module): # def __init__(self): # super().__init__() # # each token directly reads off the logits for the next token from a lookup table # self.token_embedding_table = nn.Embedding(vocab_size, n_embd) # self.position_embedding_table = nn.Embedding(block_size, n_embd) # self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]) # self.ln_f = nn.LayerNorm(n_embd) # final layer norm # self.lm_head = nn.Linear(n_embd, vocab_size) # # better init, not covered in the original GPT video, but important, will cover in followup video # self.apply(self._init_weights) # def _init_weights(self, module): # if isinstance(module, nn.Linear): # torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) # if module.bias is not None: # torch.nn.init.zeros_(module.bias) # elif isinstance(module, nn.Embedding): # torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) # def forward(self, idx, targets=None): # B, T = idx.shape # # idx and targets are both (B,T) tensor of integers # tok_emb = self.token_embedding_table(idx) # (B,T,C) # pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C) # x = tok_emb + pos_emb # (B,T,C) # x = self.blocks(x) # (B,T,C) # x = self.ln_f(x) # (B,T,C) # logits = self.lm_head(x) # (B,T,vocab_size) # if targets is None: # loss = None # else: # B, T, C = logits.shape # logits = logits.view(B*T, C) # targets = targets.view(B*T) # loss = F.cross_entropy(logits, targets) # return logits, loss # def generate(self, idx, max_new_tokens): # # idx is (B, T) array of indices in the current context # for _ in range(max_new_tokens): # # crop idx to the last block_size tokens # idx_cond = idx[:, -block_size:] # # get the predictions # logits, loss = self(idx_cond) # # focus only on the last time step # logits = logits[:, -1, :] # becomes (B, C) # # apply softmax to get probabilities # probs = F.softmax(logits, dim=-1) # (B, C) # # sample from the distribution # idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) # # append sampled index to the running sequence # idx = torch.cat((idx, idx_next), dim=1) # (B, T+1) # return idx # model = GPTLanguageModel() # m = model.to(device) # # print the number of parameters in the model # print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters') # # create a PyTorch optimizer # optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) # for iter in range(max_iters): # # every once in a while evaluate the loss on train and val sets # if iter % eval_interval == 0 or iter == max_iters - 1: # losses = estimate_loss() # print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}") # # sample a batch of data # xb, yb = get_batch('train') # # evaluate the loss # logits, loss = model(xb, yb) # optimizer.zero_grad(set_to_none=True) # loss.backward() # optimizer.step() # # generate from the model # context = torch.zeros((1, 1), dtype=torch.long, device=device) # print(decode(m.generate(context, max_new_tokens=500)[0].tolist())) # #open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist())) # ########################TinyLLM################################## # def main(): # st.write("## Audio Classification with HuBERT") # dataset_name = st.selectbox("Dataset", ["librispeech", "your_audio_dataset"]) # model_name = "facebook/hubert-base-ls960" # processor = Wav2Vec2Processor.from_pretrained(model_name) # net = HubertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(DEVICE) # train_dataset, test_dataset = load_data(dataset_name) # # Further implementation needed for actual data preparation and training loops # st.write("Details of further steps would be filled in based on specific requirements and dataset structure.") # if __name__ == "__main__": # main() # from transformers import Wav2Vec2FeatureExtractor, HubertForSequenceClassification # import torch # import soundfile as sf # def load_audio(file_path): # # Load an audio file, return waveform and sampling rate # waveform, sample_rate = sf.read(file_path) # return waveform, sample_rate # def prepare_dataset(data_paths): # # Dummy function to simulate loading and processing a dataset # # Replace this with actual data loading and processing logic # features = [] # labels = [] # for path, label in data_paths: # waveform, sr = load_audio(path) # input_values = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt").input_values # features.append(input_values) # labels.append(label) # return torch.cat(features, dim=0), torch.tensor(labels)