frugal-ai-submission

Paused

App Files Files Community

Nonnormalizable committed on Jan 24

Commit

f718d63

1 Parent(s): 6c2e610

TextDataset bug

Browse files

Files changed (2) hide show

Finetune BERT.ipynb +73 -60
tasks/text.py +1 -0

Finetune BERT.ipynb CHANGED Viewed

@@ -10,15 +10,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "73e72549-69f2-46b5-b0f5-655777139972",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-24T18:02:16.124498Z",
-     "iopub.status.busy": "2025-01-24T18:02:16.123394Z",
-     "iopub.status.idle": "2025-01-24T18:02:19.646958Z",
-     "shell.execute_reply": "2025-01-24T18:02:19.646675Z",
-     "shell.execute_reply.started": "2025-01-24T18:02:16.124448Z"
     }
    },
    "outputs": [],
@@ -45,11 +45,11 @@
    "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:15.084435Z",
-     "iopub.status.busy": "2025-01-22T18:16:15.084268Z",
-     "iopub.status.idle": "2025-01-22T18:16:15.086255Z",
-     "shell.execute_reply": "2025-01-22T18:16:15.086031Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:15.084427Z"
     }
    },
    "outputs": [],
@@ -67,15 +67,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:15.086764Z",
-     "iopub.status.busy": "2025-01-22T18:16:15.086669Z",
-     "iopub.status.idle": "2025-01-22T18:16:15.091701Z",
-     "shell.execute_reply": "2025-01-22T18:16:15.091514Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:15.086757Z"
     }
    },
    "outputs": [],
@@ -146,6 +146,7 @@
     "\n",
     "class TextDataset(Dataset):\n",
     "    def __init__(self, texts, labels, tokenizer, max_length=256):\n",
     "        self.encodings = tokenizer(\n",
     "            texts,\n",
     "            truncation=True,\n",
@@ -195,15 +196,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:15.092028Z",
-     "iopub.status.busy": "2025-01-22T18:16:15.091969Z",
-     "iopub.status.idle": "2025-01-22T18:16:15.108312Z",
-     "shell.execute_reply": "2025-01-22T18:16:15.108075Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:15.092021Z"
     }
    },
    "outputs": [],
@@ -219,15 +220,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "id": "695bc080-bbd7-4937-af5b-50db1c936500",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:15.108777Z",
-     "iopub.status.busy": "2025-01-22T18:16:15.108669Z",
-     "iopub.status.idle": "2025-01-22T18:16:15.111839Z",
-     "shell.execute_reply": "2025-01-22T18:16:15.111545Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:15.108767Z"
     }
    },
    "outputs": [],
@@ -307,15 +308,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "id": "11890d3b-8bcb-4a9b-b421-5431081cca39",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:15.113676Z",
-     "iopub.status.busy": "2025-01-22T18:16:15.113576Z",
-     "iopub.status.idle": "2025-01-22T18:16:15.115080Z",
-     "shell.execute_reply": "2025-01-22T18:16:15.114867Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:15.113668Z"
     }
    },
    "outputs": [],
@@ -342,15 +343,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "id": "34a7c310-c486-4db1-b94d-4363c3d3df5b",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:16:15.115472Z",
-     "iopub.status.busy": "2025-01-22T18:16:15.115400Z",
-     "iopub.status.idle": "2025-01-22T18:19:33.994125Z",
-     "shell.execute_reply": "2025-01-22T18:19:33.993854Z",
-     "shell.execute_reply.started": "2025-01-22T18:16:15.115464Z"
     }
    },
    "outputs": [
@@ -358,10 +359,23 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2025-01-22 13:16:38 Epoch  0/3 done. Loss: Train 2.066, Test 2.091; and Acc: Train 0.185, Test 0.157\n",
-      "2025-01-22 13:17:36 Epoch  1/3 done. Loss: Train 1.089, Test 1.279; and Acc: Train 0.627, Test 0.555\n",
-      "2025-01-22 13:18:35 Epoch  2/3 done. Loss: Train 0.624, Test 1.044; and Acc: Train 0.839, Test 0.642\n",
-      "2025-01-22 13:19:33 Epoch  3/3 done. Loss: Train 0.294, Test 1.047; and Acc: Train 0.928, Test 0.648\n"
      ]
     }
    ],
@@ -377,31 +391,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
    "metadata": {
     "editable": true,
     "execution": {
-     "iopub.execute_input": "2025-01-22T18:19:33.994637Z",
-     "iopub.status.busy": "2025-01-22T18:19:33.994547Z",
-     "iopub.status.idle": "2025-01-22T18:19:34.064925Z",
-     "shell.execute_reply": "2025-01-22T18:19:34.064678Z",
-     "shell.execute_reply.started": "2025-01-22T18:19:33.994628Z"
     },
     "slideshow": {
      "slide_type": ""
     },
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2025-01-22 13:19:34 Predictions: tensor([0, 0, 3, 6, 2, 4, 6], device='mps:0')\n"
-     ]
-    }
-   ],
    "source": [
     "model.eval()\n",
     "test_text = [\n",
@@ -429,6 +434,14 @@
     "    my_print(f\"Predictions: {predictions}\")"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "0c3ea938-dd87-4673-b1d6-f06c70b19455",

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "73e72549-69f2-46b5-b0f5-655777139972",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:21:58.280871Z",
+     "iopub.status.busy": "2025-01-24T18:21:58.280785Z",
+     "iopub.status.idle": "2025-01-24T18:22:01.627392Z",
+     "shell.execute_reply": "2025-01-24T18:22:01.627134Z",
+     "shell.execute_reply.started": "2025-01-24T18:21:58.280861Z"
     }
    },
    "outputs": [],
    "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:22:01.628023Z",
+     "iopub.status.busy": "2025-01-24T18:22:01.627838Z",
+     "iopub.status.idle": "2025-01-24T18:22:01.629825Z",
+     "shell.execute_reply": "2025-01-24T18:22:01.629635Z",
+     "shell.execute_reply.started": "2025-01-24T18:22:01.628013Z"
     }
    },
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:23:58.768682Z",
+     "iopub.status.busy": "2025-01-24T18:23:58.768083Z",
+     "iopub.status.idle": "2025-01-24T18:23:58.787548Z",
+     "shell.execute_reply": "2025-01-24T18:23:58.786993Z",
+     "shell.execute_reply.started": "2025-01-24T18:23:58.768631Z"
     }
    },
    "outputs": [],
     "\n",
     "class TextDataset(Dataset):\n",
     "    def __init__(self, texts, labels, tokenizer, max_length=256):\n",
+    "        self.texts = texts\n",
     "        self.encodings = tokenizer(\n",
     "            texts,\n",
     "            truncation=True,\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:23:59.127835Z",
+     "iopub.status.busy": "2025-01-24T18:23:59.126787Z",
+     "iopub.status.idle": "2025-01-24T18:23:59.136440Z",
+     "shell.execute_reply": "2025-01-24T18:23:59.135267Z",
+     "shell.execute_reply.started": "2025-01-24T18:23:59.127791Z"
     }
    },
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "id": "695bc080-bbd7-4937-af5b-50db1c936500",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:23:59.442432Z",
+     "iopub.status.busy": "2025-01-24T18:23:59.441786Z",
+     "iopub.status.idle": "2025-01-24T18:23:59.453218Z",
+     "shell.execute_reply": "2025-01-24T18:23:59.452473Z",
+     "shell.execute_reply.started": "2025-01-24T18:23:59.442367Z"
     }
    },
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "id": "11890d3b-8bcb-4a9b-b421-5431081cca39",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:24:00.153856Z",
+     "iopub.status.busy": "2025-01-24T18:24:00.153044Z",
+     "iopub.status.idle": "2025-01-24T18:24:00.158876Z",
+     "shell.execute_reply": "2025-01-24T18:24:00.157762Z",
+     "shell.execute_reply.started": "2025-01-24T18:24:00.153804Z"
     }
    },
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "id": "34a7c310-c486-4db1-b94d-4363c3d3df5b",
    "metadata": {
     "execution": {
+     "iopub.execute_input": "2025-01-24T18:24:00.721937Z",
+     "iopub.status.busy": "2025-01-24T18:24:00.721190Z",
+     "iopub.status.idle": "2025-01-24T18:24:06.157768Z",
+     "shell.execute_reply": "2025-01-24T18:24:06.157299Z",
+     "shell.execute_reply.started": "2025-01-24T18:24:00.721894Z"
     }
    },
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "4872 1219\n",
+      "8 8\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[16], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m model, tokenizer, regime, metrics \u001b[38;5;241m=\u001b[39m \u001b[43mrun_training\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_dataset_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m16\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbert_variety\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbase_model_repo\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m      4\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m128\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m      5\u001b[0m \u001b[43m    \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m      6\u001b[0m \u001b[43m    \u001b[49m\u001b[43mbatch_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m16\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m      7\u001b[0m \u001b[43m)\u001b[49m\n",
+      "Cell \u001b[0;32mIn[14], line 62\u001b[0m, in \u001b[0;36mrun_training\u001b[0;34m(max_dataset_size, bert_variety, max_length, num_epochs, batch_size)\u001b[0m\n\u001b[1;32m     55\u001b[0m dataloader_train \u001b[38;5;241m=\u001b[39m DataLoader(\n\u001b[1;32m     56\u001b[0m     text_dataset_train, batch_size\u001b[38;5;241m=\u001b[39mbatch_size, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m     57\u001b[0m )\n\u001b[1;32m     58\u001b[0m dataloader_test \u001b[38;5;241m=\u001b[39m DataLoader(\n\u001b[1;32m     59\u001b[0m     text_dataset_test, batch_size\u001b[38;5;241m=\u001b[39mbatch_size, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m     60\u001b[0m )\n\u001b[0;32m---> 62\u001b[0m metrics \u001b[38;5;241m=\u001b[39m \u001b[43mtrain_model\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     63\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataloader_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataloader_test\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnum_epochs\u001b[49m\n\u001b[1;32m     64\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m model, tokenizer, training_regime, metrics\n",
+      "Cell \u001b[0;32mIn[12], line 91\u001b[0m, in \u001b[0;36mtrain_model\u001b[0;34m(model, train_dataloader, test_dataloader, device, num_epochs)\u001b[0m\n\u001b[1;32m     88\u001b[0m criterion \u001b[38;5;241m=\u001b[39m nn\u001b[38;5;241m.\u001b[39mCrossEntropyLoss()\n\u001b[1;32m     89\u001b[0m model\u001b[38;5;241m.\u001b[39mtrain()\n\u001b[0;32m---> 91\u001b[0m _ \u001b[38;5;241m=\u001b[39m \u001b[43mprint_model_status\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_epochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_dataloader\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     92\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m epoch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(num_epochs):\n\u001b[1;32m     93\u001b[0m     total_loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
+      "Cell \u001b[0;32mIn[12], line 34\u001b[0m, in \u001b[0;36mprint_model_status\u001b[0;34m(epoch, num_epochs, model, train_dataloader, test_dataloader)\u001b[0m\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprint_model_status\u001b[39m(epoch, num_epochs, model, train_dataloader, test_dataloader):\n\u001b[0;32m---> 34\u001b[0m     train_loss, train_acc \u001b[38;5;241m=\u001b[39m \u001b[43mmodel_metrics\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrain_dataloader\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     35\u001b[0m     test_loss, test_acc \u001b[38;5;241m=\u001b[39m model_metrics(model, test_dataloader)\n\u001b[1;32m     36\u001b[0m     loss_str \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLoss: Train \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtrain_loss\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m0.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, Test \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtest_loss\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m0.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n",
+      "Cell \u001b[0;32mIn[12], line 20\u001b[0m, in \u001b[0;36mmodel_metrics\u001b[0;34m(model, dataloader)\u001b[0m\n\u001b[1;32m     18\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(input_ids, attention_mask)\n\u001b[1;32m     19\u001b[0m loss \u001b[38;5;241m=\u001b[39m criterion(outputs, labels)\n\u001b[0;32m---> 20\u001b[0m predictions_cpu \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margmax\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdim\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcpu\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m     21\u001b[0m labels_cpu \u001b[38;5;241m=\u001b[39m labels\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m     22\u001b[0m correct_count \u001b[38;5;241m=\u001b[39m (predictions_cpu \u001b[38;5;241m==\u001b[39m labels_cpu)\u001b[38;5;241m.\u001b[39msum()\n",
+      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
    "metadata": {
     "editable": true,
     "execution": {
+     "iopub.status.busy": "2025-01-24T18:24:06.157956Z",
+     "iopub.status.idle": "2025-01-24T18:24:06.158060Z",
+     "shell.execute_reply": "2025-01-24T18:24:06.158008Z",
+     "shell.execute_reply.started": "2025-01-24T18:24:06.158002Z"
     },
     "slideshow": {
      "slide_type": ""
     },
     "tags": []
    },
+   "outputs": [],
    "source": [
     "model.eval()\n",
     "test_text = [\n",
     "    my_print(f\"Predictions: {predictions}\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1201bf29-5040-4317-be30-77bec0bfe5b4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "id": "0c3ea938-dd87-4673-b1d6-f06c70b19455",

tasks/text.py CHANGED Viewed

@@ -27,6 +27,7 @@ ROUTE = "/text"
 class TextDataset(Dataset):
     def __init__(self, texts, tokenizer, max_length=256):
         self.encodings = tokenizer(
             texts,
             truncation=True,

 class TextDataset(Dataset):
     def __init__(self, texts, tokenizer, max_length=256):
+        self.texts = texts
         self.encodings = tokenizer(
             texts,
             truncation=True,