frugal-ai-submission

Paused

App Files Files Community

Nonnormalizable committed on Jan 24

Commit

6c2e610

1 Parent(s): 80df7c4

DataLoader on inference.

Browse files

Files changed (2) hide show

Finetune BERT.ipynb +7 -105
tasks/text.py +37 -19

Finetune BERT.ipynb CHANGED Viewed

@@ -10,15 +10,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "id": "73e72549-69f2-46b5-b0f5-655777139972",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:12.117877Z",
-     "iopub.status.busy": "2025-01-22T18:16:12.117575Z",
-     "iopub.status.idle": "2025-01-22T18:16:15.083870Z",
-     "shell.execute_reply": "2025-01-22T18:16:15.083640Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:12.117851Z"
     }
    },
    "outputs": [],
@@ -727,105 +727,7 @@
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
-    "state": {
-     "2d2b267cd60649cdb6fcce93640ba8d6": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLModel",
-      "state": {
-       "layout": "IPY_MODEL_b3c2c88f904a424c96704cc4b9514f98",
-       "style": "IPY_MODEL_337bc700fce14480a640a1ae545db5f5",
-       "value": "model.safetensors: 100%"
-      }
-     },
-     "337bc700fce14480a640a1ae545db5f5": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLStyleModel",
-      "state": {
-       "description_width": "",
-       "font_size": null,
-       "text_color": null
-      }
-     },
-     "40666b0d750d4caf8fbaeeef11eb58c1": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "ProgressStyleModel",
-      "state": {
-       "description_width": ""
-      }
-     },
-     "4d9ae3c7a72a4f4aa5974fb0649cb42c": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLStyleModel",
-      "state": {
-       "description_width": "",
-       "font_size": null,
-       "text_color": null
-      }
-     },
-     "54e4f39d398f45ceb760107e5b57744a": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HBoxModel",
-      "state": {
-       "children": [
-        "IPY_MODEL_2d2b267cd60649cdb6fcce93640ba8d6",
-        "IPY_MODEL_575f3681680a4cbeb1f95547a40bdc93",
-        "IPY_MODEL_91cbef62c3b84632949a24dbad475b10"
-       ],
-       "layout": "IPY_MODEL_f2feb8c3b4cc4ee29091b9aab78ff4aa"
-      }
-     },
-     "575f3681680a4cbeb1f95547a40bdc93": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "FloatProgressModel",
-      "state": {
-       "bar_style": "success",
-       "layout": "IPY_MODEL_dcc805dd65774cd2b863c2c4bb8f3f1c",
-       "max": 437977072,
-       "style": "IPY_MODEL_40666b0d750d4caf8fbaeeef11eb58c1",
-       "value": 437977072
-      }
-     },
-     "91cbef62c3b84632949a24dbad475b10": {
-      "model_module": "@jupyter-widgets/controls",
-      "model_module_version": "2.0.0",
-      "model_name": "HTMLModel",
-      "state": {
-       "layout": "IPY_MODEL_fe68949bcf9b42508368dd03f6506d57",
-       "style": "IPY_MODEL_4d9ae3c7a72a4f4aa5974fb0649cb42c",
-       "value": " 438M/438M [00:36&lt;00:00, 12.1MB/s]"
-      }
-     },
-     "b3c2c88f904a424c96704cc4b9514f98": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "dcc805dd65774cd2b863c2c4bb8f3f1c": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "f2feb8c3b4cc4ee29091b9aab78ff4aa": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     },
-     "fe68949bcf9b42508368dd03f6506d57": {
-      "model_module": "@jupyter-widgets/base",
-      "model_module_version": "2.0.0",
-      "model_name": "LayoutModel",
-      "state": {}
-     }
-    },
     "version_major": 2,
     "version_minor": 0
    }

   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "73e72549-69f2-46b5-b0f5-655777139972",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:02:16.124498Z",
+     "iopub.status.busy": "2025-01-24T18:02:16.123394Z",
+     "iopub.status.idle": "2025-01-24T18:02:19.646958Z",
+     "shell.execute_reply": "2025-01-24T18:02:19.646675Z",
+     "shell.execute_reply.started": "2025-01-24T18:02:16.124448Z"
     }
    },
    "outputs": [],
   },
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
+    "state": {},
     "version_major": 2,
     "version_minor": 0
    }

tasks/text.py CHANGED Viewed

@@ -5,6 +5,7 @@ from sklearn.metrics import accuracy_score
 import numpy as np
 import random
 import torch
 from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
 from .utils.evaluation import TextEvaluationRequest
@@ -14,21 +15,39 @@ router = APIRouter()
 DESCRIPTIONS = {
     "baseline": "baseline most common class",
-    "bert-base": "bert base finetuned",
-    "bert-medium": "to be implemented",
-    "bert-small": "to be implemented",
-    "bert-mini": "to be implemented",
-    "bert-tiny": "bert tiny finetuned",
 }
 ROUTE = "/text"
 def baseline_model(dataset_length: int):
     # Make random predictions (placeholder for actual model inference)
     # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
-    # My favorate baseline is the most common class.
     predictions = [0] * dataset_length
     return predictions
@@ -39,6 +58,7 @@ def bert_model(test_dataset: dict, model_type: str):
     texts = test_dataset["quote"]
     model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
     config = AutoConfig.from_pretrained(model_repo)
     model = AutoModelForSequenceClassification.from_pretrained(model_repo)
     tokenizer = AutoTokenizer.from_pretrained(model_repo)
@@ -47,24 +67,22 @@ def bert_model(test_dataset: dict, model_type: str):
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
-    print("device:", device)
     model = model.to(device)
-    test_encoding = tokenizer(
-        texts,
-        truncation=True,
-        padding=True,
-        return_tensors="pt",
-    )
     model.eval()
     with torch.no_grad():
-        test_input_ids = test_encoding["input_ids"].to(device)
-        test_attention_mask = test_encoding["attention_mask"].to(device)
         print("Starting model run.")
-        outputs = model(test_input_ids, test_attention_mask)
         print("End of model run.")
-        predictions = torch.argmax(outputs.logits, dim=1)
-        predictions = predictions.cpu().numpy()
     print("End of my code block.")
     return predictions

 import numpy as np
 import random
 import torch
+from torch.utils.data import Dataset, DataLoader
 from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
 from .utils.evaluation import TextEvaluationRequest
 DESCRIPTIONS = {
     "baseline": "baseline most common class",
+    "bert-base": "bert base fine tuned on just training data, Nvidia T4 small",
+    "bert-medium": "bert medium fine tuned on just training data, Nvidia T4 small",
+    "bert-small": "bert small fine tuned on just training data, Nvidia T4 small",
+    "bert-mini": "bert min fine tuned on just training data, Nvidia T4 small",
+    "bert-tiny": "bert tiny fine tuned on just training data, Nvidia T4 small",
 }
 ROUTE = "/text"
class TextDataset(Dataset):
    """Map-style dataset that tokenizes every text once, up front.

    All texts are encoded in a single tokenizer call at construction time;
    indexing then slices row ``idx`` out of each encoded field so a
    ``DataLoader`` can batch the rows.

    Args:
        texts: The input strings to encode (passed straight to ``tokenizer``).
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=..., return_tensors="pt")`` and returning
            a mapping of field name to a per-sample indexable value. It must
            include an ``"input_ids"`` entry.
        max_length: Truncation limit forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length: int = 256):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        # Take the sample count from the encoded batch itself so that
        # __len__ always agrees with what __getitem__ can index. The
        # previous implementation read an attribute that was never set.
        self._num_samples = len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return a dict holding row ``idx`` of every encoded field."""
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self) -> int:
        """Return the number of encoded samples."""
        return self._num_samples
 def baseline_model(dataset_length: int):
     # Make random predictions (placeholder for actual model inference)
     # predictions = [random.randint(0, 7) for _ in range(dataset_length)]
+    # My favorite baseline is the most common class.
     predictions = [0] * dataset_length
     return predictions
     texts = test_dataset["quote"]
     model_repo = f"Nonnormalizable/frugal-ai-text-{model_type}"
+    print(f"Loading from model_repo: {model_repo}")
     config = AutoConfig.from_pretrained(model_repo)
     model = AutoModelForSequenceClassification.from_pretrained(model_repo)
     tokenizer = AutoTokenizer.from_pretrained(model_repo)
         device = torch.device("cuda")
     else:
         device = torch.device("cpu")
+    print("Using device:", device)
     model = model.to(device)
+    dataset = TextDataset(texts, tokenizer=tokenizer)
+    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
     model.eval()
     with torch.no_grad():
         print("Starting model run.")
+        predictions = np.array([])
+        for batch in dataloader:
+            print("    Running a batch.")
+            test_input_ids = batch["input_ids"].to(device)
+            test_attention_mask = batch["attention_mask"].to(device)
+            outputs = model(test_input_ids, test_attention_mask)
+            p = torch.argmax(outputs.logits, dim=1)
+            predictions = np.append(predictions, p.cpu().numpy())
         print("End of model run.")
     print("End of my code block.")
     return predictions