Commit
·
6c2e610
1
Parent(s):
80df7c4
DataLoader on inference.
Browse files- Finetune BERT.ipynb +7 -105
- tasks/text.py +37 -19
Finetune BERT.ipynb
CHANGED
@@ -10,15 +10,15 @@
|
|
10 |
},
|
11 |
{
|
12 |
"cell_type": "code",
|
13 |
-
"execution_count":
|
14 |
"id": "73e72549-69f2-46b5-b0f5-655777139972",
|
15 |
"metadata": {
|
16 |
"execution": {
|
17 |
-
"iopub.execute_input": "2025-01-
|
18 |
-
"iopub.status.busy": "2025-01-
|
19 |
-
"iopub.status.idle": "2025-01-
|
20 |
-
"shell.execute_reply": "2025-01-
|
21 |
-
"shell.execute_reply.started": "2025-01-
|
22 |
}
|
23 |
},
|
24 |
"outputs": [],
|
@@ -727,105 +727,7 @@
|
|
727 |
},
|
728 |
"widgets": {
|
729 |
"application/vnd.jupyter.widget-state+json": {
|
730 |
-
"state": {
|
731 |
-
"2d2b267cd60649cdb6fcce93640ba8d6": {
|
732 |
-
"model_module": "@jupyter-widgets/controls",
|
733 |
-
"model_module_version": "2.0.0",
|
734 |
-
"model_name": "HTMLModel",
|
735 |
-
"state": {
|
736 |
-
"layout": "IPY_MODEL_b3c2c88f904a424c96704cc4b9514f98",
|
737 |
-
"style": "IPY_MODEL_337bc700fce14480a640a1ae545db5f5",
|
738 |
-
"value": "model.safetensors: 100%"
|
739 |
-
}
|
740 |
-
},
|
741 |
-
"337bc700fce14480a640a1ae545db5f5": {
|
742 |
-
"model_module": "@jupyter-widgets/controls",
|
743 |
-
"model_module_version": "2.0.0",
|
744 |
-
"model_name": "HTMLStyleModel",
|
745 |
-
"state": {
|
746 |
-
"description_width": "",
|
747 |
-
"font_size": null,
|
748 |
-
"text_color": null
|
749 |
-
}
|
750 |
-
},
|
751 |
-
"40666b0d750d4caf8fbaeeef11eb58c1": {
|
752 |
-
"model_module": "@jupyter-widgets/controls",
|
753 |
-
"model_module_version": "2.0.0",
|
754 |
-
"model_name": "ProgressStyleModel",
|
755 |
-
"state": {
|
756 |
-
"description_width": ""
|
757 |
-
}
|
758 |
-
},
|
759 |
-
"4d9ae3c7a72a4f4aa5974fb0649cb42c": {
|
760 |
-
"model_module": "@jupyter-widgets/controls",
|
761 |
-
"model_module_version": "2.0.0",
|
762 |
-
"model_name": "HTMLStyleModel",
|
763 |
-
"state": {
|
764 |
-
"description_width": "",
|
765 |
-
"font_size": null,
|
766 |
-
"text_color": null
|
767 |
-
}
|
768 |
-
},
|
769 |
-
"54e4f39d398f45ceb760107e5b57744a": {
|
770 |
-
"model_module": "@jupyter-widgets/controls",
|
771 |
-
"model_module_version": "2.0.0",
|
772 |
-
"model_name": "HBoxModel",
|
773 |
-
"state": {
|
774 |
-
"children": [
|
775 |
-
"IPY_MODEL_2d2b267cd60649cdb6fcce93640ba8d6",
|
776 |
-
"IPY_MODEL_575f3681680a4cbeb1f95547a40bdc93",
|
777 |
-
"IPY_MODEL_91cbef62c3b84632949a24dbad475b10"
|
778 |
-
],
|
779 |
-
"layout": "IPY_MODEL_f2feb8c3b4cc4ee29091b9aab78ff4aa"
|
780 |
-
}
|
781 |
-
},
|
782 |
-
"575f3681680a4cbeb1f95547a40bdc93": {
|
783 |
-
"model_module": "@jupyter-widgets/controls",
|
784 |
-
"model_module_version": "2.0.0",
|
785 |
-
"model_name": "FloatProgressModel",
|
786 |
-
"state": {
|
787 |
-
"bar_style": "success",
|
788 |
-
"layout": "IPY_MODEL_dcc805dd65774cd2b863c2c4bb8f3f1c",
|
789 |
-
"max": 437977072,
|
790 |
-
"style": "IPY_MODEL_40666b0d750d4caf8fbaeeef11eb58c1",
|
791 |
-
"value": 437977072
|
792 |
-
}
|
793 |
-
},
|
794 |
-
"91cbef62c3b84632949a24dbad475b10": {
|
795 |
-
"model_module": "@jupyter-widgets/controls",
|
796 |
-
"model_module_version": "2.0.0",
|
797 |
-
"model_name": "HTMLModel",
|
798 |
-
"state": {
|
799 |
-
"layout": "IPY_MODEL_fe68949bcf9b42508368dd03f6506d57",
|
800 |
-
"style": "IPY_MODEL_4d9ae3c7a72a4f4aa5974fb0649cb42c",
|
801 |
-
"value": " 438M/438M [00:36<00:00, 12.1MB/s]"
|
802 |
-
}
|
803 |
-
},
|
804 |
-
"b3c2c88f904a424c96704cc4b9514f98": {
|
805 |
-
"model_module": "@jupyter-widgets/base",
|
806 |
-
"model_module_version": "2.0.0",
|
807 |
-
"model_name": "LayoutModel",
|
808 |
-
"state": {}
|
809 |
-
},
|
810 |
-
"dcc805dd65774cd2b863c2c4bb8f3f1c": {
|
811 |
-
"model_module": "@jupyter-widgets/base",
|
812 |
-
"model_module_version": "2.0.0",
|
813 |
-
"model_name": "LayoutModel",
|
814 |
-
"state": {}
|
815 |
-
},
|
816 |
-
"f2feb8c3b4cc4ee29091b9aab78ff4aa": {
|
817 |
-
"model_module": "@jupyter-widgets/base",
|
818 |
-
"model_module_version": "2.0.0",
|
819 |
-
"model_name": "LayoutModel",
|
820 |
-
"state": {}
|
821 |
-
},
|
822 |
-
"fe68949bcf9b42508368dd03f6506d57": {
|
823 |
-
"model_module": "@jupyter-widgets/base",
|
824 |
-
"model_module_version": "2.0.0",
|
825 |
-
"model_name": "LayoutModel",
|
826 |
-
"state": {}
|
827 |
-
}
|
828 |
-
},
|
829 |
"version_major": 2,
|
830 |
"version_minor": 0
|
831 |
}
|
|
|
10 |
},
|
11 |
{
|
12 |
"cell_type": "code",
|
13 |
+
"execution_count": 2,
|
14 |
"id": "73e72549-69f2-46b5-b0f5-655777139972",
|
15 |
"metadata": {
|
16 |
"execution": {
|
17 |
+
"iopub.execute_input": "2025-01-24T18:02:16.124498Z",
|
18 |
+
"iopub.status.busy": "2025-01-24T18:02:16.123394Z",
|
19 |
+
"iopub.status.idle": "2025-01-24T18:02:19.646958Z",
|
20 |
+
"shell.execute_reply": "2025-01-24T18:02:19.646675Z",
|
21 |
+
"shell.execute_reply.started": "2025-01-24T18:02:16.124448Z"
|
22 |
}
|
23 |
},
|
24 |
"outputs": [],
|
|
|
727 |
},
|
728 |
"widgets": {
|
729 |
"application/vnd.jupyter.widget-state+json": {
|
730 |
+
"state": {},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
731 |
"version_major": 2,
|
732 |
"version_minor": 0
|
733 |
}
|
tasks/text.py
CHANGED
@@ -5,6 +5,7 @@ from sklearn.metrics import accuracy_score
|
|
5 |
import numpy as np
|
6 |
import random
|
7 |
import torch
|
|
|
8 |
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
|
9 |
|
10 |
from .utils.evaluation import TextEvaluationRequest
|
@@ -14,21 +15,39 @@ router = APIRouter()
|
|
14 |
|
15 |
DESCRIPTIONS = {
|
16 |
"baseline": "baseline most common class",
|
17 |
-
"bert-base": "bert base
|
18 |
-
"bert-medium": "
|
19 |
-
"bert-small": "
|
20 |
-
"bert-mini": "
|
21 |
-
"bert-tiny": "bert tiny
|
22 |
}
|
23 |
|
24 |
ROUTE = "/text"
|
25 |
|
26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def baseline_model(dataset_length: int):
|
28 |
# Make random predictions (placeholder for actual model inference)
|
29 |
# predictions = [random.randint(0, 7) for _ in range(dataset_length)]
|
30 |
|
31 |
-
# My
|
32 |
predictions = [0] * dataset_length
|
33 |
|
34 |
return predictions
|
@@ -39,6 +58,7 @@ def bert_model(test_dataset: dict, model_type: str):
|
|
39 |
texts = test_dataset["quote"]
|
40 |
|
41 |
model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
|
|
|
42 |
config = AutoConfig.from_pretrained(model_repo)
|
43 |
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
|
44 |
tokenizer = AutoTokenizer.from_pretrained(model_repo)
|
@@ -47,24 +67,22 @@ def bert_model(test_dataset: dict, model_type: str):
|
|
47 |
device = torch.device("cuda")
|
48 |
else:
|
49 |
device = torch.device("cpu")
|
50 |
-
print("device:", device)
|
51 |
model = model.to(device)
|
52 |
-
|
53 |
-
|
54 |
-
truncation=True,
|
55 |
-
padding=True,
|
56 |
-
return_tensors="pt",
|
57 |
-
)
|
58 |
-
|
59 |
model.eval()
|
60 |
with torch.no_grad():
|
61 |
-
test_input_ids = test_encoding["input_ids"].to(device)
|
62 |
-
test_attention_mask = test_encoding["attention_mask"].to(device)
|
63 |
print("Starting model run.")
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
print("End of model run.")
|
66 |
-
predictions = torch.argmax(outputs.logits, dim=1)
|
67 |
-
predictions = predictions.cpu().numpy()
|
68 |
|
69 |
print("End of my code block.")
|
70 |
return predictions
|
|
|
5 |
import numpy as np
|
6 |
import random
|
7 |
import torch
|
8 |
+
from torch.utils.data import Dataset, DataLoader
|
9 |
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
|
10 |
|
11 |
from .utils.evaluation import TextEvaluationRequest
|
|
|
15 |
|
16 |
# Description strings keyed by model name.
DESCRIPTIONS = {
    "baseline": "baseline most common class",
    "bert-base": "bert base fine tuned on just training data, Nvidia T4 small",
    "bert-medium": "bert medium fine tuned on just training data, Nvidia T4 small",
    "bert-small": "bert small fine tuned on just training data, Nvidia T4 small",
    # Was "bert min": typo in the model name, inconsistent with the other entries.
    "bert-mini": "bert mini fine tuned on just training data, Nvidia T4 small",
    "bert-tiny": "bert tiny fine tuned on just training data, Nvidia T4 small",
}

ROUTE = "/text"
|
26 |
|
27 |
|
28 |
+
class TextDataset(Dataset):
    """Map-style dataset that tokenizes all texts once, at construction.

    The tokenizer is called a single time on the full collection with
    truncation and padding enabled; each item is then the ``idx``-th row of
    every field in the resulting encoding.

    Args:
        texts: The raw texts to tokenize.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=max_length, return_tensors="pt")``. Its
            result must expose ``.items()`` yielding ``(field, values)``
            pairs where ``values`` can be indexed by row.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        # Keep the inputs: __len__ reports the number of texts. The original
        # never stored them, so len(dataset) raised AttributeError.
        self.texts = texts
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return a dict mapping each encoding field to its ``idx``-th row."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self) -> int:
        """Return the number of texts in the dataset."""
        return len(self.texts)
|
44 |
+
|
45 |
+
|
46 |
def baseline_model(dataset_length: int):
    """Return the baseline predictions: class 0 for every example.

    Args:
        dataset_length: Number of predictions to produce.

    Returns:
        A list containing ``dataset_length`` zeros.
    """
    # My favorite baseline is the most common class.
    most_common_class = 0
    return [most_common_class] * dataset_length
|
|
|
58 |
texts = test_dataset["quote"]
|
59 |
|
60 |
model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
|
61 |
+
print(f"Loading from model_repo: {model_repo}")
|
62 |
config = AutoConfig.from_pretrained(model_repo)
|
63 |
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
|
64 |
tokenizer = AutoTokenizer.from_pretrained(model_repo)
|
|
|
67 |
device = torch.device("cuda")
|
68 |
else:
|
69 |
device = torch.device("cpu")
|
70 |
+
print("Using device:", device)
|
71 |
model = model.to(device)
|
72 |
+
dataset = TextDataset(texts, tokenizer=tokenizer)
|
73 |
+
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
|
|
|
|
|
|
|
|
|
|
|
74 |
model.eval()
|
75 |
with torch.no_grad():
|
|
|
|
|
76 |
print("Starting model run.")
|
77 |
+
predictions = np.array([])
|
78 |
+
for batch in dataloader:
|
79 |
+
print(" Running a batch.")
|
80 |
+
test_input_ids = batch["input_ids"].to(device)
|
81 |
+
test_attention_mask = batch["attention_mask"].to(device)
|
82 |
+
outputs = model(test_input_ids, test_attention_mask)
|
83 |
+
p = torch.argmax(outputs.logits, dim=1)
|
84 |
+
predictions = np.append(predictions, p.cpu().numpy())
|
85 |
print("End of model run.")
|
|
|
|
|
86 |
|
87 |
print("End of my code block.")
|
88 |
return predictions
|