Nonnormalizable committed on
Commit
6c2e610
·
1 Parent(s): 80df7c4

DataLoader on inference.

Browse files
Files changed (2) hide show
  1. Finetune BERT.ipynb +7 -105
  2. tasks/text.py +37 -19
Finetune BERT.ipynb CHANGED
@@ -10,15 +10,15 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": 1,
14
  "id": "73e72549-69f2-46b5-b0f5-655777139972",
15
  "metadata": {
16
  "execution": {
17
- "iopub.execute_input": "2025-01-22T18:16:12.117877Z",
18
- "iopub.status.busy": "2025-01-22T18:16:12.117575Z",
19
- "iopub.status.idle": "2025-01-22T18:16:15.083870Z",
20
- "shell.execute_reply": "2025-01-22T18:16:15.083640Z",
21
- "shell.execute_reply.started": "2025-01-22T18:16:12.117851Z"
22
  }
23
  },
24
  "outputs": [],
@@ -727,105 +727,7 @@
727
  },
728
  "widgets": {
729
  "application/vnd.jupyter.widget-state+json": {
730
- "state": {
731
- "2d2b267cd60649cdb6fcce93640ba8d6": {
732
- "model_module": "@jupyter-widgets/controls",
733
- "model_module_version": "2.0.0",
734
- "model_name": "HTMLModel",
735
- "state": {
736
- "layout": "IPY_MODEL_b3c2c88f904a424c96704cc4b9514f98",
737
- "style": "IPY_MODEL_337bc700fce14480a640a1ae545db5f5",
738
- "value": "model.safetensors: 100%"
739
- }
740
- },
741
- "337bc700fce14480a640a1ae545db5f5": {
742
- "model_module": "@jupyter-widgets/controls",
743
- "model_module_version": "2.0.0",
744
- "model_name": "HTMLStyleModel",
745
- "state": {
746
- "description_width": "",
747
- "font_size": null,
748
- "text_color": null
749
- }
750
- },
751
- "40666b0d750d4caf8fbaeeef11eb58c1": {
752
- "model_module": "@jupyter-widgets/controls",
753
- "model_module_version": "2.0.0",
754
- "model_name": "ProgressStyleModel",
755
- "state": {
756
- "description_width": ""
757
- }
758
- },
759
- "4d9ae3c7a72a4f4aa5974fb0649cb42c": {
760
- "model_module": "@jupyter-widgets/controls",
761
- "model_module_version": "2.0.0",
762
- "model_name": "HTMLStyleModel",
763
- "state": {
764
- "description_width": "",
765
- "font_size": null,
766
- "text_color": null
767
- }
768
- },
769
- "54e4f39d398f45ceb760107e5b57744a": {
770
- "model_module": "@jupyter-widgets/controls",
771
- "model_module_version": "2.0.0",
772
- "model_name": "HBoxModel",
773
- "state": {
774
- "children": [
775
- "IPY_MODEL_2d2b267cd60649cdb6fcce93640ba8d6",
776
- "IPY_MODEL_575f3681680a4cbeb1f95547a40bdc93",
777
- "IPY_MODEL_91cbef62c3b84632949a24dbad475b10"
778
- ],
779
- "layout": "IPY_MODEL_f2feb8c3b4cc4ee29091b9aab78ff4aa"
780
- }
781
- },
782
- "575f3681680a4cbeb1f95547a40bdc93": {
783
- "model_module": "@jupyter-widgets/controls",
784
- "model_module_version": "2.0.0",
785
- "model_name": "FloatProgressModel",
786
- "state": {
787
- "bar_style": "success",
788
- "layout": "IPY_MODEL_dcc805dd65774cd2b863c2c4bb8f3f1c",
789
- "max": 437977072,
790
- "style": "IPY_MODEL_40666b0d750d4caf8fbaeeef11eb58c1",
791
- "value": 437977072
792
- }
793
- },
794
- "91cbef62c3b84632949a24dbad475b10": {
795
- "model_module": "@jupyter-widgets/controls",
796
- "model_module_version": "2.0.0",
797
- "model_name": "HTMLModel",
798
- "state": {
799
- "layout": "IPY_MODEL_fe68949bcf9b42508368dd03f6506d57",
800
- "style": "IPY_MODEL_4d9ae3c7a72a4f4aa5974fb0649cb42c",
801
- "value": " 438M/438M [00:36<00:00, 12.1MB/s]"
802
- }
803
- },
804
- "b3c2c88f904a424c96704cc4b9514f98": {
805
- "model_module": "@jupyter-widgets/base",
806
- "model_module_version": "2.0.0",
807
- "model_name": "LayoutModel",
808
- "state": {}
809
- },
810
- "dcc805dd65774cd2b863c2c4bb8f3f1c": {
811
- "model_module": "@jupyter-widgets/base",
812
- "model_module_version": "2.0.0",
813
- "model_name": "LayoutModel",
814
- "state": {}
815
- },
816
- "f2feb8c3b4cc4ee29091b9aab78ff4aa": {
817
- "model_module": "@jupyter-widgets/base",
818
- "model_module_version": "2.0.0",
819
- "model_name": "LayoutModel",
820
- "state": {}
821
- },
822
- "fe68949bcf9b42508368dd03f6506d57": {
823
- "model_module": "@jupyter-widgets/base",
824
- "model_module_version": "2.0.0",
825
- "model_name": "LayoutModel",
826
- "state": {}
827
- }
828
- },
829
  "version_major": 2,
830
  "version_minor": 0
831
  }
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 2,
14
  "id": "73e72549-69f2-46b5-b0f5-655777139972",
15
  "metadata": {
16
  "execution": {
17
+ "iopub.execute_input": "2025-01-24T18:02:16.124498Z",
18
+ "iopub.status.busy": "2025-01-24T18:02:16.123394Z",
19
+ "iopub.status.idle": "2025-01-24T18:02:19.646958Z",
20
+ "shell.execute_reply": "2025-01-24T18:02:19.646675Z",
21
+ "shell.execute_reply.started": "2025-01-24T18:02:16.124448Z"
22
  }
23
  },
24
  "outputs": [],
 
727
  },
728
  "widgets": {
729
  "application/vnd.jupyter.widget-state+json": {
730
+ "state": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  "version_major": 2,
732
  "version_minor": 0
733
  }
tasks/text.py CHANGED
@@ -5,6 +5,7 @@ from sklearn.metrics import accuracy_score
5
  import numpy as np
6
  import random
7
  import torch
 
8
  from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
9
 
10
  from .utils.evaluation import TextEvaluationRequest
@@ -14,21 +15,39 @@ router = APIRouter()
14
 
15
  DESCRIPTIONS = {
16
  "baseline": "baseline most common class",
17
- "bert-base": "bert base finetuned",
18
- "bert-medium": "to be implemented",
19
- "bert-small": "to be implemented",
20
- "bert-mini": "to be implemented",
21
- "bert-tiny": "bert tiny finetuned",
22
  }
23
 
24
  ROUTE = "/text"
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def baseline_model(dataset_length: int):
28
  # Make random predictions (placeholder for actual model inference)
29
  # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
30
 
31
- # My favorate baseline is the most common class.
32
  predictions = [0] * dataset_length
33
 
34
  return predictions
@@ -39,6 +58,7 @@ def bert_model(test_dataset: dict, model_type: str):
39
  texts = test_dataset["quote"]
40
 
41
  model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
 
42
  config = AutoConfig.from_pretrained(model_repo)
43
  model = AutoModelForSequenceClassification.from_pretrained(model_repo)
44
  tokenizer = AutoTokenizer.from_pretrained(model_repo)
@@ -47,24 +67,22 @@ def bert_model(test_dataset: dict, model_type: str):
47
  device = torch.device("cuda")
48
  else:
49
  device = torch.device("cpu")
50
- print("device:", device)
51
  model = model.to(device)
52
- test_encoding = tokenizer(
53
- texts,
54
- truncation=True,
55
- padding=True,
56
- return_tensors="pt",
57
- )
58
-
59
  model.eval()
60
  with torch.no_grad():
61
- test_input_ids = test_encoding["input_ids"].to(device)
62
- test_attention_mask = test_encoding["attention_mask"].to(device)
63
  print("Starting model run.")
64
- outputs = model(test_input_ids, test_attention_mask)
 
 
 
 
 
 
 
65
  print("End of model run.")
66
- predictions = torch.argmax(outputs.logits, dim=1)
67
- predictions = predictions.cpu().numpy()
68
 
69
  print("End of my code block.")
70
  return predictions
 
5
  import numpy as np
6
  import random
7
  import torch
8
+ from torch.utils.data import Dataset, DataLoader
9
  from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
10
 
11
  from .utils.evaluation import TextEvaluationRequest
 
15
 
16
  DESCRIPTIONS = {
17
  "baseline": "baseline most common class",
18
+ "bert-base": "bert base fine tuned on just training data, Nvidia T4 small",
19
+ "bert-medium": "bert medium fine tuned on just training data, Nvidia T4 small",
20
+ "bert-small": "bert small fine tuned on just training data, Nvidia T4 small",
21
+ "bert-mini": "bert min fine tuned on just training data, Nvidia T4 small",
22
+ "bert-tiny": "bert tiny fine tuned on just training data, Nvidia T4 small",
23
  }
24
 
25
  ROUTE = "/text"
26
 
27
 
28
class TextDataset(Dataset):
    """Map-style dataset that tokenizes all texts once, at construction.

    The tokenizer is called a single time on the whole collection with
    truncation, padding and ``return_tensors="pt"``. Indexing then picks
    row ``idx`` out of every value of the returned encodings.
    """

    def __init__(self, texts, tokenizer, max_length=256):
        """Tokenize ``texts`` in one call and keep the result.

        Args:
            texts: Sized collection of texts, passed to ``tokenizer`` as-is.
            tokenizer: Callable accepting ``truncation``, ``padding``,
                ``max_length`` and ``return_tensors`` keywords and returning
                a mapping whose values can be indexed by row.
            max_length: Truncation limit forwarded to the tokenizer.
        """
        # Keep a reference to the inputs: __len__ reports their count.
        # Without this attribute len(dataset) raises AttributeError, which
        # breaks DataLoader, since its sampler needs the dataset length.
        self.texts = texts
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return a dict with row ``idx`` of every encoding field."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self) -> int:
        """Return the number of texts given to the constructor."""
        return len(self.texts)
44
+
45
+
46
  def baseline_model(dataset_length: int):
47
  # Make random predictions (placeholder for actual model inference)
48
  # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
49
 
50
+ # My favorite baseline is the most common class.
51
  predictions = [0] * dataset_length
52
 
53
  return predictions
 
58
  texts = test_dataset["quote"]
59
 
60
  model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
61
+ print(f"Loading from model_repo: {model_repo}")
62
  config = AutoConfig.from_pretrained(model_repo)
63
  model = AutoModelForSequenceClassification.from_pretrained(model_repo)
64
  tokenizer = AutoTokenizer.from_pretrained(model_repo)
 
67
  device = torch.device("cuda")
68
  else:
69
  device = torch.device("cpu")
70
+ print("Using device:", device)
71
  model = model.to(device)
72
+ dataset = TextDataset(texts, tokenizer=tokenizer)
73
+ dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
 
 
 
 
 
74
  model.eval()
75
  with torch.no_grad():
 
 
76
  print("Starting model run.")
77
+ predictions = np.array([])
78
+ for batch in dataloader:
79
+ print(" Running a batch.")
80
+ test_input_ids = batch["input_ids"].to(device)
81
+ test_attention_mask = batch["attention_mask"].to(device)
82
+ outputs = model(test_input_ids, test_attention_mask)
83
+ p = torch.argmax(outputs.logits, dim=1)
84
+ predictions = np.append(predictions, p.cpu().numpy())
85
  print("End of model run.")
 
 
86
 
87
  print("End of my code block.")
88
  return predictions