CarolXia committed on
Commit
13491a5
·
1 Parent(s): 5418ef7
Files changed (1)
  1. app.py +170 -134
app.py CHANGED
@@ -1,83 +1,67 @@
  import streamlit as st
- # from gliner import GLiNER
  from datasets import load_dataset
- import evaluate
  import numpy as np
- import threading
- import time
- from peft import prepare_model_for_kbit_training
- from peft import LoraConfig, get_peft_model, TaskType
  import torch
- from torch.profiler import profile, record_function, ProfilerActivity
- from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments

- seqeval = evaluate.load("seqeval")

- # id2label = {0: "O"}
- # label2id = {"O": 0}
- # def build_id2label(examples):
- #     for i, label in enumerate(examples["mbert_token_classes"]):
- #         if label.startswith("I-") and label not in label2id:
- #             current_len = len(id2label)
- #             id2label[current_len] = label
- #             label2id[label] = current_len

  print(f"Is CUDA available: {torch.cuda.is_available()}")
27
  # True
28
  if torch.cuda.is_available():
29
  print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 
30
 
31
- # Load the fine-tuned GLiNER model
32
  st.write('Loading the pretrained model ...')
33
- model_name = "iiiorg/piiranha-v1-detect-personal-information"
34
- model = AutoModelForTokenClassification.from_pretrained(model_name)
35
- tokenizer = AutoTokenizer.from_pretrained(model_name)
36
-
37
- print(model)
38
-
39
- # Prepare model for LoRA training
40
- model.train() # model in evaluation mode (dropout modules are activated)
41
- # enable gradient check pointing
42
- model.gradient_checkpointing_enable()
43
-
44
- # enable quantized training
45
- model = prepare_model_for_kbit_training(model)
46
-
47
- # LoRA config
48
- config = LoraConfig(
49
- r=8,
50
- lora_alpha=32,
51
- target_modules=["query_proj"],
52
- lora_dropout=0.05,
53
- bias="none",
54
- task_type=TaskType.TOKEN_CLS
55
- )
56
-
57
- # LoRA trainable version of model
58
- model = get_peft_model(model, config)
59
-
60
- print(model)
61
- # trainable parameter count
62
- model.print_trainable_parameters()
63
-
64
- # # print weights
65
- # pytorch_total_params = sum(p.numel() for p in model.parameters())
66
- # torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
67
- # print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')
68
 
69
  if torch.cuda.is_available():
-     model = model.to("cuda")

  # Load data.
- raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train[1:1000]')
- # raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
  raw_dataset = raw_dataset.train_test_split(test_size=0.2)
  print(raw_dataset)
  print(raw_dataset.column_names)
- # raw_dataset = raw_dataset.select_columns(["mbert_tokens"])
- # raw_dataset = raw_dataset.rename_column("mbert_tokens", "tokens")
- # raw_dataset = raw_dataset.rename_column("mbert_token_classes", "labels")

  # inputs = tokenizer(
  #     raw_dataset['train'][0]['mbert_tokens'],
@@ -87,17 +71,6 @@ print(raw_dataset.column_names)
  # print(inputs.tokens())
  # print(inputs.word_ids())

- # Build label2id and id2label
- st.write("Building label mappings")
- label2id = model.config.label2id
- id2label = model.config.id2label
- # raw_dataset.map(
- #     build_id2label,
- #     batched=False)
-
- st.write("id2label: ", model.config.id2label)
- st.write("label2id: ", model.config.label2id)
-
  # function to align labels with tokens
  # --> special tokens: -100 label id (ignored by cross entropy),
  # --> if tokens are inside a word, replace 'B-' with 'I-'
@@ -132,81 +105,144 @@ def tokenize_function(examples):
  tokenized_data = raw_dataset.map(
      tokenize_function,
      batched=True)
  # data collator
  data_collator = DataCollatorForTokenClassification(tokenizer)

  st.write(tokenized_data["train"][:2]["labels"])

- import os
-
- # Print all CUDA environment variables
- for key, value in os.environ.items():
-     if "CUDA" in key.upper():
-         print(f"{key}={value}")
-
- def compute_metrics(eval_preds):
-     logits, labels = eval_preds
-     predictions = np.argmax(logits, axis=-1)
-
-     # Remove ignored index (special tokens) and convert to labels
-     true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
-     true_predictions = [
-         [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
-         for prediction, label in zip(predictions, labels)
-     ]
-     all_metrics = seqeval.compute(predictions=true_predictions, references=true_labels)
-     return {
-         "precision": all_metrics["overall_precision"],
-         "recall": all_metrics["overall_recall"],
-         "f1": all_metrics["overall_f1"],
-         "accuracy": all_metrics["overall_accuracy"],
-     }

  # hyperparameters
- lr = 2e-4
- batch_size = 4
- num_epochs = 4
- output_dir = "xia-lora-deberta-v2"
-
- # define training arguments
- training_args = TrainingArguments(
-     output_dir=output_dir,
-     learning_rate=lr,
-     per_device_train_batch_size=batch_size,
-     per_device_eval_batch_size=batch_size,
-     num_train_epochs=num_epochs,
-     weight_decay=0.01,
-     logging_strategy="epoch",
-     evaluation_strategy="epoch",
-     save_strategy="epoch",
-     load_best_model_at_end=True,
-     gradient_accumulation_steps=4,
-     warmup_steps=2,
-     fp16=True,
-     optim="paged_adamw_8bit",
- )
-
- # configure trainer
- trainer = Trainer(
-     model=model,
-     train_dataset=tokenized_data["train"],
-     eval_dataset=tokenized_data["test"],
-     args=training_args,
-     data_collator=data_collator,
-     compute_metrics=compute_metrics
- )

- # train model
- model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
- trainer.train()

- # re-enable warnings
- model.config.use_cache = True

  st.write('Pushing model to huggingface')

  # Push model to huggingface
  hf_name = 'CarolXia' # your hf username or org name
- model_id = hf_name + "/" + output_dir
- model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
- trainer.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
 
  import streamlit as st
+
  from datasets import load_dataset
+
  import numpy as np
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
  import torch
+ import torch.nn as nn
+ import torch.optim as optim
+ from torch.utils.data import DataLoader

+ from transformers import AutoModelForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification
+ from transformers import DebertaV2Config, DebertaV2ForTokenClassification

+ # print weights
+ def print_trainable_parameters(model):
+     pytorch_total_params = sum(p.numel() for p in model.parameters())
+     torch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     print(f'total params: {pytorch_total_params}. tunable params: {torch_total_params}')

+ device = torch.device('cpu')
  print(f"Is CUDA available: {torch.cuda.is_available()}")
  # True
  if torch.cuda.is_available():
      print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
+     device = torch.device('cuda')

+ # Load models
  st.write('Loading the pretrained model ...')
+ teacher_model_name = "iiiorg/piiranha-v1-detect-personal-information"
+ teacher_model = AutoModelForTokenClassification.from_pretrained(teacher_model_name)
+ tokenizer = AutoTokenizer.from_pretrained(teacher_model_name)
+ print(teacher_model)
+ print_trainable_parameters(teacher_model)
+ label2id = teacher_model.config.label2id
+ id2label = teacher_model.config.id2label
+
+ st.write("id2label: ", id2label)
+ st.write("label2id: ", label2id)
+ dimension = len(id2label)
+ st.write("dimension", dimension)
+
+ student_model_config = teacher_model.config
+ student_model_config.num_attention_heads = 6
+ student_model_config.num_hidden_layers = 4
+ student_model = DebertaV2ForTokenClassification.from_pretrained(
+     "microsoft/deberta-v3-base",
+     config=student_model_config)
+ print(student_model)
+ print_trainable_parameters(student_model)

  if torch.cuda.is_available():
+     teacher_model = teacher_model.to(device)
+     student_model = student_model.to(device)

  # Load data.
+ raw_dataset = load_dataset("ai4privacy/pii-masking-400k", split='train')
+ raw_dataset = raw_dataset.filter(lambda example: example["language"].startswith("en"))
+ raw_dataset = raw_dataset.select(range(2000, 4000))
  raw_dataset = raw_dataset.train_test_split(test_size=0.2)
  print(raw_dataset)
  print(raw_dataset.column_names)

  # inputs = tokenizer(
  #     raw_dataset['train'][0]['mbert_tokens'],
  # print(inputs.tokens())
  # print(inputs.word_ids())

  # function to align labels with tokens
  # --> special tokens: -100 label id (ignored by cross entropy),
  # --> if tokens are inside a word, replace 'B-' with 'I-'

  tokenized_data = raw_dataset.map(
      tokenize_function,
      batched=True)
+ tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
  # data collator
  data_collator = DataCollatorForTokenClassification(tokenizer)

  st.write(tokenized_data["train"][:2]["labels"])

+ # Function to evaluate model performance
+ def evaluate_model(model, dataloader, device):
+     model.eval()  # Set model to evaluation mode
+     all_preds = []
+     all_labels = []
+
+     # Disable gradient calculations
+     with torch.no_grad():
+         for batch in dataloader:
+             input_ids = batch['input_ids'].to(device)
+             attention_mask = batch['attention_mask'].to(device)
+             labels = batch['labels'].to(device)
+
+             # Forward pass to get logits
+             outputs = model(input_ids, attention_mask=attention_mask)
+             logits = outputs.logits
+
+             # Get predictions
+             preds = torch.argmax(logits, dim=-1).cpu().numpy()
+             all_preds.extend(preds)
+             all_labels.extend(labels.cpu().numpy())
+
+     # Calculate evaluation metrics
+     print("evaluate_model sizes")
+     print(len(all_preds[0]))
+     print(len(all_labels[0]))
+     all_preds = np.asarray(all_preds, dtype=np.float32)
+     all_labels = np.asarray(all_labels, dtype=np.float32)
+     print("Flattened sizes")
+     print(all_preds.size)
+     print(all_labels.size)
+     all_preds = all_preds.flatten()
+     all_labels = all_labels.flatten()
+     accuracy = accuracy_score(all_labels, all_preds)
+     precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='micro')
+
+     return accuracy, precision, recall, f1
+
+ # Function to compute distillation and hard-label loss
+ def distillation_loss(student_logits, teacher_logits, true_labels, temperature, alpha):
+     # print("Distillation loss sizes")
+     # print(teacher_logits.size())
+     # print(student_logits.size())
+     # print(true_labels.size())
+     # Compute soft targets from teacher logits
+     soft_targets = nn.functional.softmax(teacher_logits / temperature, dim=-1)
+     student_soft = nn.functional.log_softmax(student_logits / temperature, dim=-1)
+
+     # KL Divergence loss for distillation
+     distill_loss = nn.functional.kl_div(student_soft, soft_targets, reduction='batchmean') * (temperature ** 2)
+
+     # Cross-entropy loss for hard labels
+     student_logit_reshape = torch.transpose(student_logits, 1, 2)  # transpose to match the labels dimension
+     hard_loss = nn.CrossEntropyLoss()(student_logit_reshape, true_labels)
+
+     # Combine losses
+     loss = alpha * distill_loss + (1.0 - alpha) * hard_loss
+
+     return loss

  # hyperparameters
+ batch_size = 32
+ lr = 1e-4
+ num_epochs = 10
+ temperature = 2.0
+ alpha = 0.5

+ # define optimizer
+ optimizer = optim.Adam(student_model.parameters(), lr=lr)
+
+ # create training data loader
+ dataloader = DataLoader(tokenized_data['train'], batch_size=batch_size, collate_fn=data_collator)
+ # create testing data loader
+ test_dataloader = DataLoader(tokenized_data['test'], batch_size=batch_size, collate_fn=data_collator)

+ # put student model in train mode
+ student_model.train()
+
+ # train model
+ for epoch in range(num_epochs):
+     for batch in dataloader:
+         # Prepare inputs
+         input_ids = batch['input_ids'].to(device)
+         attention_mask = batch['attention_mask'].to(device)
+         labels = batch['labels'].to(device)
+
+         # Disable gradient calculation for teacher model
+         with torch.no_grad():
+             teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask)
+             teacher_logits = teacher_outputs.logits
+
+         # Forward pass through the student model
+         student_outputs = student_model(input_ids, attention_mask=attention_mask)
+         student_logits = student_outputs.logits
+
+         # Compute the distillation loss
+         loss = distillation_loss(student_logits, teacher_logits, labels, temperature, alpha)
+
+         # Backpropagation
+         optimizer.zero_grad()
+         loss.backward()
+         optimizer.step()
+
+     print(f"Epoch {epoch + 1} completed with loss: {loss.item()}")
+
+     # Evaluate the teacher model
+     teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = evaluate_model(teacher_model, test_dataloader, device)
+     print(f"Teacher (test) - Accuracy: {teacher_accuracy:.4f}, Precision: {teacher_precision:.4f}, Recall: {teacher_recall:.4f}, F1 Score: {teacher_f1:.4f}")
+
+     # Evaluate the student model
+     student_accuracy, student_precision, student_recall, student_f1 = evaluate_model(student_model, test_dataloader, device)
+     print(f"Student (test) - Accuracy: {student_accuracy:.4f}, Precision: {student_precision:.4f}, Recall: {student_recall:.4f}, F1 Score: {student_f1:.4f}")
+     print("\n")
+
+     # put student model back into train mode
+     student_model.train()
+
+ # Compare the models
+ # create validation data loader from the test split
+ validation_dataloader = DataLoader(tokenized_data['test'], batch_size=8, collate_fn=data_collator)
+ # Evaluate the teacher model
+ teacher_accuracy, teacher_precision, teacher_recall, teacher_f1 = evaluate_model(teacher_model, validation_dataloader, device)
+ print(f"Teacher (validation) - Accuracy: {teacher_accuracy:.4f}, Precision: {teacher_precision:.4f}, Recall: {teacher_recall:.4f}, F1 Score: {teacher_f1:.4f}")
+ # Evaluate the student model
+ student_accuracy, student_precision, student_recall, student_f1 = evaluate_model(student_model, validation_dataloader, device)
+ print(f"Student (validation) - Accuracy: {student_accuracy:.4f}, Precision: {student_precision:.4f}, Recall: {student_recall:.4f}, F1 Score: {student_f1:.4f}")
+

  st.write('Pushing model to huggingface')

  # Push model to huggingface
  hf_name = 'CarolXia' # your hf username or org name
+ mode_name = "pii-kd-deberta-v2"
+ model_id = hf_name + "/" + mode_name
+ student_model.push_to_hub(model_id, token=st.secrets["HUGGINGFACE_TOKEN"])
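
A minimal standalone sketch of the combined objective implemented in distillation_loss above, run on random tensors; the shapes (2 sequences, 8 tokens, 5 labels) and the temperature/alpha values are made up here for illustration:

import torch
import torch.nn as nn

# Made-up shapes and hyperparameters for illustration only
batch, seq_len, num_labels = 2, 8, 5
temperature, alpha = 2.0, 0.5

student_logits = torch.randn(batch, seq_len, num_labels)
teacher_logits = torch.randn(batch, seq_len, num_labels)
true_labels = torch.randint(0, num_labels, (batch, seq_len))

# KL divergence between temperature-softened teacher and student distributions
soft_targets = nn.functional.softmax(teacher_logits / temperature, dim=-1)
student_soft = nn.functional.log_softmax(student_logits / temperature, dim=-1)
distill_loss = nn.functional.kl_div(student_soft, soft_targets, reduction='batchmean') * (temperature ** 2)

# Hard-label cross entropy; CrossEntropyLoss expects logits shaped (batch, num_labels, seq_len)
hard_loss = nn.CrossEntropyLoss()(student_logits.transpose(1, 2), true_labels)

# Weighted mix of the two terms, as in distillation_loss
print((alpha * distill_loss + (1.0 - alpha) * hard_loss).item())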