Spaces:

CarolXia
/

kd-finetune

Sleeping

App Files Files Community

CarolXia committed on Nov 26, 2024

Commit

13491a5

1 Parent(s): 5418ef7

Add KD

Browse files

Files changed (1) hide show

app.py +170 -134

app.py CHANGED Viewed

@@ -1,83 +1,67 @@
 import streamlit as st
-# from gliner import GLiNER
 from datasets import load_dataset
-import evaluate
 import numpy as np
-import threading
-import time
-from peft import prepare_model_for_kbit_training
-from peft import LoraConfig, get_peft_model, TaskType
 import torch
-from torch.profiler import profile, record_function, ProfilerActivity
-from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments
-seqeval = evaluate.load("seqeval")
-# id2label = {0: "O"}
-# label2id = {"O": 0}
-# def build_id2label(examples):
-#     for i, label in enumerate(examples["mbert_token_classes"]):
-#         if label.startswith("I-") and label not in label2id:
-#             current_len = len(id2label)
-#             id2label[current_len] = label
-#             label2id[label] = current_len
 print(f"Is CUDA available: {torch.cuda.is_available()}")
 # True
 if torch.cuda.is_available():
     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
-# Load the fine-tuned GLiNER model
 st.write('Loading the pretrained model ...')
-model_name = "iiiorg/piiranha-v1-detect-personal-information"
-model = AutoModelForTokenClassification.from_pretrained(model_name)
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print(model)
-# Prepare model for LoRA training
-model.train() # model in evaluation mode (dropout modules are activated)
-# enable gradient check pointing
-model.gradient_checkpointing_enable()
-# enable quantized training
-model = prepare_model_for_kbit_training(model)
-# LoRA config
-config = LoraConfig(
-    r=8,
-    lora_alpha=32,
-    target_modules=["query_proj"],
-    lora_dropout=0.05,
-    bias="none",
-    task_type=TaskType.TOKEN_CLS
-)
-# LoRA trainable version of model
-model = get_peft_model(model, config)
-print(model)
-# trainable parameter count
-model.print_trainable_parameters()
-# # print weights
-# pytorch_total_params = sum(p.numel() for p in model.parameters())
-# torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-# print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
 if torch.cuda.is_available():
-    model = model.to("cuda")
 # Load data.
-raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
-# raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
 raw_dataset = raw_dataset.train_test_split(test_size=0.2)
 print(raw_dataset)
 print(raw_dataset.column_names)
-# raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
-# raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
-# raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")
 # inputs = tokenizer(
 #     raw_dataset['train'][0]['mbert_tokens'],
@@ -87,17 +71,6 @@ print(raw_dataset.column_names)
 # print(inputs.tokens())
 # print(inputs.word_ids())
-# Build label2id and id2label
-st.write("Building label mappings")
-label2id = model.config.label2id
-id2label = model.config.id2label
-# raw_dataset.map(
-#     build_id2label,
-#     batched=False)
-st.write("id2label: ", model.config.id2label)
-st.write("label2id: ", model.config.label2id)
 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
@@ -132,81 +105,144 @@ def tokenize_function(examples):
 tokenized_data = raw_dataset.map(
     tokenize_function,
     batched=True)
 # data collator
 data_collator = DataCollatorForTokenClassification(tokenizer)
 st.write(tokenized_data["train"][:2]["labels"])
-import os
-# Print all CUDA environment variables
-for key, value in os.environ.items():
-    if "CUDA" in key.upper():
-        print(f"{key}={value}")
-def compute_metrics(eval_preds):
-    logits, labels = eval_preds
-    predictions = np.argmax(logits, axis=-1)
-    # Remove ignored index (special tokens) and convert to labels
-    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
-    true_predictions = [
-        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
-        for prediction, label in zip(predictions, labels)
-    ]
-    all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
-    return {
-        "precision": all_metrics["overall_precision"],
-        "recall": all_metrics["overall_recall"],
-        "f1": all_metrics["overall_f1"],
-        "accuracy": all_metrics["overall_accuracy"],
-    }
 # hyperparameters
-lr = 2e-4
-batch_size = 4
-num_epochs = 4
-output_dir = "xia-lora-deberta-v2"
-# define training arguments
-training_args = TrainingArguments(
-    output_dir= output_dir,
-    learning_rate=lr,
-    per_device_train_batch_size=batch_size,
-    per_device_eval_batch_size=batch_size,
-    num_train_epochs=num_epochs,
-    weight_decay=0.01,
-    logging_strategy="epoch",
-    evaluation_strategy="epoch",
-    save_strategy="epoch",
-    load_best_model_at_end=True,
-    gradient_accumulation_steps=4,
-    warmup_steps=2,
-    fp16=True,
-    optim="paged_adamw_8bit",
-)
-# configure trainer
-trainer = Trainer(
-    model=model,
-    train_dataset=tokenized_data["train"],
-    eval_dataset=tokenized_data["test"],
-    args=training_args,
-    data_collator=data_collator,
-    compute_metrics=compute_metrics
-)
-# train model
-model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
-trainer.train()
-# renable warnings
-model.config.use_cache = True
 st.write('Pushing model to huggingface')
 # Push model to huggingface
 hf_name = 'CarolXia' # your hf username or org name
-model_id = hf_name + "/" + output_dir
-model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
-trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])

 import streamlit as st
 from datasets import load_dataset
 import numpy as np
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
 import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader
+from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
+from transformers import DebertaV2Config, DebertaV2ForTokenClassification
+# print weights
+def print_trainable_parameters(model):
+    pytorch_total_params = sum(p.numel() for p in model.parameters())
+    torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
+device = torch.device('cpu')
 print(f"Is CUDA available: {torch.cuda.is_available()}")
 # True
 if torch.cuda.is_available():
     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+    device = torch.device('cuda')
+# Load models
 st.write('Loading the pretrained model ...')
+teacher_model_name = "iiiorg/piiranha-v1-detect-personal-information"
+teacher_model = AutoModelForTokenClassification.from_pretrained(teacher_model_name)
+tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
+print(teacher_model)
+print_trainable_parameters(teacher_model)
+label2id = teacher_model.config.label2id
+id2label = teacher_model.config.id2label
+st.write("id2label: ", id2label)
+st.write("label2id: ", label2id)
+dimension = len(id2label)
+st.write("dimension", dimension)
+student_model_config = teacher_model.config
+student_model_config.num_attention_heads = 6
+student_model_config.num_hidden_layers = 4
+student_model = DebertaV2ForTokenClassification.from_pretrained(
+    "microsoft/deberta-v3-base",
+    config=student_model_config)
+print(student_model)
+print_trainable_parameters(student_model)
 if torch.cuda.is_available():
+    teacher_model = teacher_model.to(device)
+    student_model = student_model.to(device)
 # Load data.
+raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train')
+raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
+raw_dataset = raw_dataset.select(range(2000, 4000))
 raw_dataset = raw_dataset.train_test_split(test_size=0.2)
 print(raw_dataset)
 print(raw_dataset.column_names)
 # inputs = tokenizer(
 #     raw_dataset['train'][0]['mbert_tokens'],
 # print(inputs.tokens())
 # print(inputs.word_ids())
 # function to align labels with tokens
 # --> special tokens: -100 label id (ignored by cross entropy),
 # --> if tokens are inside a word, replace 'B-' with 'I-'
 tokenized_data = raw_dataset.map(
     tokenize_function,
     batched=True)
+tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
 # data collator
 data_collator = DataCollatorForTokenClassification(tokenizer)
 st.write(tokenized_data["train"][:2]["labels"])
+# Function to evaluate model performance
+def evaluate_model(model, dataloader, device):
+    model.eval()  # Set model to evaluation mode
+    all_preds = []
+    all_labels = []
+    # Disable gradient calculations
+    with torch.no_grad():
+        for batch in dataloader:
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            labels = batch['labels'].to(device)
+            # Forward pass to get logits
+            outputs = model(input_ids, attention_mask=attention_mask)
+            logits = outputs.logits
+            # Get predictions
+            preds = torch.argmax(logits, dim=-1).cpu().numpy()
+            all_preds.extend(preds)
+            all_labels.extend(labels.cpu().numpy())
+    # Calculate evaluation metrics
+    print("evaluate_model sizes")
+    print(len(all_preds[0]))
+    print(len(all_labels[0]))
+    all_preds = np.asarray(all_preds, dtype=np.float32)
+    all_labels = np.asarray(all_labels, dtype=np.float32)
+    print("Flattened sizes")
+    print(all_preds.size)
+    print(all_labels.size)
+    all_preds = all_preds.flatten()
+    all_labels = all_labels.flatten()
+    accuracy = accuracy_score(all_labels, all_preds)
+    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
+    return accuracy, precision, recall, f1
+# Function to compute distillation and hard-label loss
+def distillation_loss(student_logits, teacher_logits, true_labels, temperature, alpha):
+    # print("Distillation loss sizes")
+    # print(teacher_logits.size())
+    # print(student_logits.size())
+    # print(true_labels.size())
+    # Compute soft targets from teacher logits
+    soft_targets = nn.functional.softmax(teacher_logits / temperature, dim=-1)
+    student_soft = nn.functional.log_softmax(student_logits / temperature, dim=-1)
+    # KL Divergence loss for distillation
+    distill_loss = nn.functional.kl_div(student_soft, soft_targets, reduction='batchmean') * (temperature ** 2)
+    # Cross-entropy loss for hard labels
+    student_logit_reshape = torch.transpose(student_logits, 1, 2) # transpose to match the labels dimension
+    hard_loss = nn.CrossEntropyLoss()(student_logit_reshape, true_labels)
+    # Combine losses
+    loss = alpha * distill_loss + (1.0 - alpha) * hard_loss
+    return loss
 # hyperparameters
+batch_size = 32
+lr = 1e-4
+num_epochs = 10
+temperature = 2.0
+alpha = 0.5
+# define optimizer
+optimizer = optim.Adam(student_model.parameters(), lr=lr)
+# create training data loader
+dataloader = DataLoader(tokenized_data['train'], batch_size=batch_size, collate_fn=data_collator)
+# create testing data loader
+test_dataloader = DataLoader(tokenized_data['test'], batch_size=batch_size, collate_fn=data_collator)
+# put student model in train mode
+student_model.train()
+# train model
+for epoch in range(num_epochs):
+    for batch in dataloader:
+        # Prepare inputs
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        labels = batch['labels'].to(device)
+        # Disable gradient calculation for teacher model
+        with torch.no_grad():
+            teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask)
+            teacher_logits = teacher_outputs.logits
+        # Forward pass through the student model
+        student_outputs = student_model(input_ids, attention_mask=attention_mask)
+        student_logits = student_outputs.logits
+        # Compute the distillation loss
+        loss = distillation_loss(student_logits, teacher_logits, labels, temperature, alpha)
+        # Backpropagation
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")
+    # Evaluate the teacher model
+    teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = evaluate_model(teacher_model, test_dataloader, device)
+    print(f"Teacher (test) - Accuracy: {teacher_accuracy:.4f}, Precision: {teacher_precision:.4f}, Recall: {teacher_recall:.4f}, F1 Score: {teacher_f1:.4f}")
+    # Evaluate the student model
+    student_accuracy, student_precision, student_recall, student_f1 = evaluate_model(student_model, test_dataloader, device)
+    print(f"Student (test) - Accuracy: {student_accuracy:.4f}, Precision: {student_precision:.4f}, Recall: {student_recall:.4f}, F1 Score: {student_f1:.4f}")
+    print("\n")
+    # put student model back into train mode
+    student_model.train()
+#Compare the models
+# create testing data loader
+validation_dataloader = DataLoader(tokenized_data['test'], batch_size=8, collate_fn=data_collator)
+# Evaluate the teacher model
+teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = evaluate_model(teacher_model, validation_dataloader, device)
+print(f"Teacher (validation) - Accuracy: {teacher_accuracy:.4f}, Precision: {teacher_precision:.4f}, Recall: {teacher_recall:.4f}, F1 Score: {teacher_f1:.4f}")
+# Evaluate the student model
+student_accuracy, student_precision, student_recall, student_f1 = evaluate_model(student_model, validation_dataloader, device)
+print(f"Student (validation) - Accuracy: {student_accuracy:.4f}, Precision: {student_precision:.4f}, Recall: {student_recall:.4f}, F1 Score: {student_f1:.4f}")
 st.write('Pushing model to huggingface')
 # Push model to huggingface
 hf_name = 'CarolXia' # your hf username or org name
+mode_name = "pii-kd-deberta-v2"
+model_id = hf_name + "/" + mode_name
+student_model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])