Nonnormalizable committed on
Commit 7bc734f · 1 Parent(s): 250d2de

Training bert-tiny. More integration with model card data.

Files changed (1)
  1. Finetune BERT.ipynb +163 -462
Finetune BERT.ipynb CHANGED
@@ -14,11 +14,11 @@
14
  "id": "73e72549-69f2-46b5-b0f5-655777139972",
15
  "metadata": {
16
  "execution": {
17
- "iopub.execute_input": "2025-01-20T20:17:03.803583Z",
18
- "iopub.status.busy": "2025-01-20T20:17:03.803051Z",
19
- "iopub.status.idle": "2025-01-20T20:17:06.786959Z",
20
- "shell.execute_reply": "2025-01-20T20:17:06.786718Z",
21
- "shell.execute_reply.started": "2025-01-20T20:17:03.803542Z"
22
  }
23
  },
24
  "outputs": [],
@@ -45,11 +45,11 @@
45
  "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
46
  "metadata": {
47
  "execution": {
48
- "iopub.execute_input": "2025-01-20T20:17:06.787691Z",
49
- "iopub.status.busy": "2025-01-20T20:17:06.787547Z",
50
- "iopub.status.idle": "2025-01-20T20:17:06.789420Z",
51
- "shell.execute_reply": "2025-01-20T20:17:06.789211Z",
52
- "shell.execute_reply.started": "2025-01-20T20:17:06.787682Z"
53
  }
54
  },
55
  "outputs": [],
@@ -71,11 +71,11 @@
71
  "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
72
  "metadata": {
73
  "execution": {
74
- "iopub.execute_input": "2025-01-20T20:17:06.789829Z",
75
- "iopub.status.busy": "2025-01-20T20:17:06.789761Z",
76
- "iopub.status.idle": "2025-01-20T20:17:06.794443Z",
77
- "shell.execute_reply": "2025-01-20T20:17:06.794260Z",
78
- "shell.execute_reply.started": "2025-01-20T20:17:06.789822Z"
79
  }
80
  },
81
  "outputs": [],
@@ -109,7 +109,7 @@
109
  " avg_loss = total_loss / len(dataloader)\n",
110
  " avg_acc = total_correct / total_length\n",
111
  " model.train()\n",
112
- " return avg_loss, avg_acc\n",
113
  "\n",
114
  "\n",
115
  "def print_model_status(epoch, num_epochs, model, train_dataloader, test_dataloader):\n",
@@ -117,7 +117,14 @@
117
  " test_loss, test_acc = model_metrics(model, test_dataloader)\n",
118
  " loss_str = f\"Loss: Train {train_loss:0.3f}, Test {test_loss:0.3f}\"\n",
119
  " acc_str = f\"Acc: Train {train_acc:0.3f}, Test {test_acc:0.3f}\"\n",
120
- " my_print(f\"Epoch {epoch+1}/{num_epochs} done. {loss_str}; and {acc_str}\")\n",
121
  "\n",
122
  "\n",
123
  "class BertClassifier(nn.Module, PyTorchModelHubMixin):\n",
@@ -136,7 +143,7 @@
136
  "\n",
137
  "\n",
138
  "class TextDataset(Dataset):\n",
139
- " def __init__(self, texts, labels, tokenizer, max_length=512):\n",
140
  " self.encodings = tokenizer(\n",
141
  " texts,\n",
142
  " truncation=True,\n",
@@ -160,7 +167,7 @@
160
  " criterion = nn.CrossEntropyLoss()\n",
161
  " model.train()\n",
162
  "\n",
163
- " print_model_status(-1, num_epochs, model, train_dataloader, test_dataloader)\n",
164
  " for epoch in range(num_epochs):\n",
165
  " total_loss = 0\n",
166
  " for batch in train_dataloader:\n",
@@ -178,7 +185,10 @@
178
  "\n",
179
  " total_loss += loss.item()\n",
180
  " avg_loss = total_loss / len(train_dataloader)\n",
181
- " print_model_status(epoch, num_epochs, model, train_dataloader, test_dataloader)"
182
  ]
183
  },
184
  {
@@ -187,11 +197,11 @@
187
  "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
188
  "metadata": {
189
  "execution": {
190
- "iopub.execute_input": "2025-01-20T20:17:06.795335Z",
191
- "iopub.status.busy": "2025-01-20T20:17:06.795239Z",
192
- "iopub.status.idle": "2025-01-20T20:17:06.821293Z",
193
- "shell.execute_reply": "2025-01-20T20:17:06.821061Z",
194
- "shell.execute_reply.started": "2025-01-20T20:17:06.795328Z"
195
  }
196
  },
197
  "outputs": [],
@@ -211,11 +221,11 @@
211
  "id": "695bc080-bbd7-4937-af5b-50db1c936500",
212
  "metadata": {
213
  "execution": {
214
- "iopub.execute_input": "2025-01-20T20:17:06.821637Z",
215
- "iopub.status.busy": "2025-01-20T20:17:06.821569Z",
216
- "iopub.status.idle": "2025-01-20T20:17:06.824265Z",
217
- "shell.execute_reply": "2025-01-20T20:17:06.824082Z",
218
- "shell.execute_reply.started": "2025-01-20T20:17:06.821630Z"
219
  }
220
  },
221
  "outputs": [],
@@ -223,10 +233,17 @@
223
  "def run_training(\n",
224
  " max_dataset_size=16 * 200,\n",
225
  " bert_variety=\"bert-base-uncased\",\n",
226
- " max_length=200,\n",
227
  " num_epochs=3,\n",
228
  " batch_size=32,\n",
229
  "):\n",
230
  " hf_dataset = load_dataset(\"quotaclimat/frugalaichallenge-text-train\")\n",
231
  " test_size = 0.2\n",
232
  " test_seed = 42\n",
@@ -272,8 +289,10 @@
272
  " text_dataset_test, batch_size=batch_size, shuffle=False\n",
273
  " )\n",
274
  "\n",
275
- " train_model(model, dataloader_train, dataloader_test, device, num_epochs=num_epochs)\n",
276
- " return model, tokenizer"
277
  ]
278
  },
279
  {
@@ -302,61 +321,57 @@
302
  },
303
  {
304
  "cell_type": "code",
305
- "execution_count": 6,
306
- "id": "792fd13f-e7cc-4d90-832d-c0da15e193cd",
307
  "metadata": {
308
  "execution": {
309
- "iopub.execute_input": "2025-01-20T20:17:06.824513Z",
310
- "iopub.status.busy": "2025-01-20T20:17:06.824457Z",
311
- "iopub.status.idle": "2025-01-20T20:17:14.130284Z",
312
- "shell.execute_reply": "2025-01-20T20:17:14.129964Z",
313
- "shell.execute_reply.started": "2025-01-20T20:17:06.824506Z"
314
  }
315
  },
316
- "outputs": [
317
- {
318
- "name": "stdout",
319
- "output_type": "stream",
320
- "text": [
321
- "2025-01-20 12:17:10 Epoch 0/3 done. Loss: Train 2.111, Test 2.247; and Acc: Train 0.281, Test 0.156\n",
322
- "2025-01-20 12:17:11 Epoch 1/3 done. Loss: Train 2.026, Test 2.222; and Acc: Train 0.344, Test 0.156\n",
323
- "2025-01-20 12:17:12 Epoch 2/3 done. Loss: Train 1.943, Test 2.194; and Acc: Train 0.312, Test 0.156\n",
324
- "2025-01-20 12:17:14 Epoch 3/3 done. Loss: Train 1.859, Test 2.159; and Acc: Train 0.344, Test 0.156\n"
325
- ]
326
- }
327
- ],
328
  "source": [
329
- "model, tokenizer = run_training(\n",
330
- " max_dataset_size=16 * 2,\n",
331
- " bert_variety=\"bert-base-uncased\",\n",
332
  " max_length=128,\n",
333
- " num_epochs=3,\n",
334
  " batch_size=32,\n",
335
  ")"
336
  ]
337
  },
338
  {
339
  "cell_type": "code",
340
- "execution_count": 7,
341
  "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
342
  "metadata": {
343
- "execution": {
344
- "iopub.execute_input": "2025-01-20T20:17:14.130879Z",
345
- "iopub.status.busy": "2025-01-20T20:17:14.130792Z",
346
- "iopub.status.idle": "2025-01-20T20:17:14.193695Z",
347
- "shell.execute_reply": "2025-01-20T20:17:14.193466Z",
348
- "shell.execute_reply.started": "2025-01-20T20:17:14.130869Z"
349
- }
350
  },
351
- "outputs": [
352
- {
353
- "name": "stdout",
354
- "output_type": "stream",
355
- "text": [
356
- "2025-01-20 12:17:14 Predictions: tensor([4, 1, 1, 1, 3, 1, 1], device='mps:0')\n"
357
- ]
358
- }
359
- ],
360
  "source": [
361
  "model.eval()\n",
362
  "test_text = [\n",
@@ -373,6 +388,7 @@
373
  " truncation=True,\n",
374
  " padding=True,\n",
375
  " return_tensors=\"pt\",\n",
376
  ")\n",
377
  "\n",
378
  "with torch.no_grad():\n",
@@ -392,86 +408,66 @@
392
  ]
393
  },
394
  {
395
- "cell_type": "code",
396
- "execution_count": 8,
397
- "id": "1d29336e-7f88-4127-afdf-2fe043e310e1",
398
- "metadata": {
399
- "execution": {
400
- "iopub.execute_input": "2025-01-20T20:17:14.194160Z",
401
- "iopub.status.busy": "2025-01-20T20:17:14.194076Z",
402
- "iopub.status.idle": "2025-01-20T20:25:46.660251Z",
403
- "shell.execute_reply": "2025-01-20T20:25:46.659652Z",
404
- "shell.execute_reply.started": "2025-01-20T20:17:14.194152Z"
405
- }
406
- },
407
- "outputs": [
408
- {
409
- "name": "stdout",
410
- "output_type": "stream",
411
- "text": [
412
- "2025-01-20 12:18:02 Epoch 0/3 done. Loss: Train 2.106, Test 2.091; and Acc: Train 0.118, Test 0.135\n",
413
- "2025-01-20 12:20:37 Epoch 1/3 done. Loss: Train 0.989, Test 1.114; and Acc: Train 0.647, Test 0.603\n",
414
- "2025-01-20 12:23:12 Epoch 2/3 done. Loss: Train 0.584, Test 0.928; and Acc: Train 0.825, Test 0.669\n",
415
- "2025-01-20 12:25:46 Epoch 3/3 done. Loss: Train 0.313, Test 0.950; and Acc: Train 0.913, Test 0.683\n"
416
- ]
417
- }
418
- ],
419
  "source": [
420
- "model, tokenizer = run_training(\n",
421
- " max_dataset_size=\"full\",\n",
422
- " bert_variety=\"bert-base-uncased\",\n",
423
- " max_length=128,\n",
424
- " num_epochs=3,\n",
425
- " batch_size=32,\n",
426
- ")"
427
  ]
428
  },
429
  {
430
  "cell_type": "code",
431
- "execution_count": 9,
432
- "id": "461b8f57-0c52-403a-bb69-3bc192b323bf",
433
  "metadata": {
434
  "execution": {
435
- "iopub.execute_input": "2025-01-20T20:25:46.661264Z",
436
- "iopub.status.busy": "2025-01-20T20:25:46.661132Z",
437
- "iopub.status.idle": "2025-01-20T20:34:54.221239Z",
438
- "shell.execute_reply": "2025-01-20T20:34:54.220590Z",
439
- "shell.execute_reply.started": "2025-01-20T20:25:46.661249Z"
440
  }
441
  },
442
- "outputs": [
443
- {
444
- "name": "stdout",
445
- "output_type": "stream",
446
- "text": [
447
- "2025-01-20 12:26:34 Epoch 0/3 done. Loss: Train 2.174, Test 2.168; and Acc: Train 0.096, Test 0.094\n",
448
- "2025-01-20 12:29:21 Epoch 1/3 done. Loss: Train 0.878, Test 1.033; and Acc: Train 0.712, Test 0.653\n",
449
- "2025-01-20 12:32:07 Epoch 2/3 done. Loss: Train 0.458, Test 0.906; and Acc: Train 0.869, Test 0.678\n",
450
- "2025-01-20 12:34:54 Epoch 3/3 done. Loss: Train 0.218, Test 0.959; and Acc: Train 0.944, Test 0.695\n"
451
- ]
452
- }
453
- ],
454
  "source": [
455
- "model, tokenizer = run_training(\n",
456
  " max_dataset_size=\"full\",\n",
457
- " bert_variety=\"bert-base-uncased\",\n",
458
- " max_length=128,\n",
459
- " num_epochs=3,\n",
460
  " batch_size=16,\n",
461
  ")"
462
  ]
463
  },
464
  {
465
  "cell_type": "code",
466
- "execution_count": 10,
467
  "id": "28354e8c-886a-4523-8968-8c688c13f6a3",
468
  "metadata": {
469
  "execution": {
470
- "iopub.execute_input": "2025-01-20T20:34:54.224989Z",
471
- "iopub.status.busy": "2025-01-20T20:34:54.224772Z",
472
- "iopub.status.idle": "2025-01-20T20:54:07.531338Z",
473
- "shell.execute_reply": "2025-01-20T20:54:07.530559Z",
474
- "shell.execute_reply.started": "2025-01-20T20:34:54.224968Z"
475
  }
476
  },
477
  "outputs": [
@@ -479,20 +475,29 @@
479
  "name": "stdout",
480
  "output_type": "stream",
481
  "text": [
482
- "2025-01-20 12:36:37 Epoch 0/3 done. Loss: Train 2.122, Test 2.127; and Acc: Train 0.122, Test 0.118\n",
483
- "2025-01-20 12:42:26 Epoch 1/3 done. Loss: Train 0.779, Test 0.978; and Acc: Train 0.748, Test 0.652\n",
484
- "2025-01-20 12:48:16 Epoch 2/3 done. Loss: Train 0.391, Test 0.884; and Acc: Train 0.897, Test 0.696\n",
485
- "2025-01-20 12:54:07 Epoch 3/3 done. Loss: Train 0.154, Test 0.978; and Acc: Train 0.959, Test 0.705\n"
486
  ]
487
  }
488
  ],
489
  "source": [
490
- "model, tokenizer = run_training(\n",
491
- " max_dataset_size=\"full\",\n",
492
- " bert_variety=\"bert-base-uncased\",\n",
493
- " max_length=256,\n",
494
- " num_epochs=3,\n",
495
- " batch_size=16,\n",
496
  ")"
497
  ]
498
  },
@@ -506,240 +511,33 @@
506
  },
507
  {
508
  "cell_type": "code",
509
- "execution_count": 14,
510
  "id": "ec2516f9-79f2-4ae1-ab9a-9a51a7a50587",
511
  "metadata": {
512
  "execution": {
513
- "iopub.execute_input": "2025-01-20T22:10:34.055595Z",
514
- "iopub.status.busy": "2025-01-20T22:10:34.054690Z",
515
- "iopub.status.idle": "2025-01-20T22:10:34.083784Z",
516
- "shell.execute_reply": "2025-01-20T22:10:34.083448Z",
517
- "shell.execute_reply.started": "2025-01-20T22:10:34.055529Z"
518
  },
519
  "scrolled": true
520
  },
521
  "outputs": [
522
  {
523
- "name": "stdout",
524
- "output_type": "stream",
525
- "text": [
526
- "---\n",
527
- "base_model: google-bert/bert-base-uncased\n",
528
- "datasets:\n",
529
- "- QuotaClimat/frugalaichallenge-text-train\n",
530
- "language:\n",
531
- "- en\n",
532
- "license: apache-2.0\n",
533
- "model_name: frugal-ai-text-bert-base\n",
534
- "pipeline_tag: text-classification\n",
535
- "tags:\n",
536
- "- model_hub_mixin\n",
537
- "- pytorch_model_hub_mixin\n",
538
- "- climate\n",
539
- "---\n",
540
- "\n",
541
- "# Model Card for Model ID\n",
542
- "\n",
543
- "<!-- Provide a quick summary of what the model is/does. -->\n",
544
- "\n",
545
- "Classify text into 8 categories of climate misinformation.\n",
546
- "\n",
547
- "## Model Details\n",
548
- "\n",
549
- "### Model Description\n",
550
- "\n",
551
- "<!-- Provide a longer summary of what this model is. -->\n",
552
- "\n",
553
- "Fine trained BERT for classifying climate information as part of the Frugal AI Challenge, for submission to https://huggingface.co/frugal-ai-challenge and scoring on accuracy and efficiency. Trainied on only the non-evaluation 80% of the data, so it's (non-cheating) score will be lower.\n",
554
- "\n",
555
- "- **Developed by:** Andre Bach\n",
556
- "- **Funded by [optional]:** N/A\n",
557
- "- **Shared by [optional]:** Andre Bach\n",
558
- "- **Model type:** Text classification\n",
559
- "- **Language(s) (NLP):** ['en']\n",
560
- "- **License:** apache-2.0\n",
561
- "- **Finetuned from model [optional]:** google-bert/bert-base-uncased\n",
562
- "\n",
563
- "### Model Sources [optional]\n",
564
- "\n",
565
- "<!-- Provide the basic links for the model. -->\n",
566
- "\n",
567
- "- **Repository:** frugal-ai-text-bert-base\n",
568
- "- **Paper [optional]:** [More Information Needed]\n",
569
- "- **Demo [optional]:** [More Information Needed]\n",
570
- "\n",
571
- "## Uses\n",
572
- "\n",
573
- "<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->\n",
574
- "\n",
575
- "### Direct Use\n",
576
- "\n",
577
- "<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->\n",
578
- "\n",
579
- "[More Information Needed]\n",
580
- "\n",
581
- "### Downstream Use [optional]\n",
582
- "\n",
583
- "<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->\n",
584
- "\n",
585
- "[More Information Needed]\n",
586
- "\n",
587
- "### Out-of-Scope Use\n",
588
- "\n",
589
- "<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->\n",
590
- "\n",
591
- "[More Information Needed]\n",
592
- "\n",
593
- "## Bias, Risks, and Limitations\n",
594
- "\n",
595
- "<!-- This section is meant to convey both technical and sociotechnical limitations. -->\n",
596
- "\n",
597
- "[More Information Needed]\n",
598
- "\n",
599
- "### Recommendations\n",
600
- "\n",
601
- "<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->\n",
602
- "\n",
603
- "Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.\n",
604
- "\n",
605
- "## How to Get Started with the Model\n",
606
- "\n",
607
- "Use the code below to get started with the model.\n",
608
- "\n",
609
- "[More Information Needed]\n",
610
- "\n",
611
- "## Training Details\n",
612
- "\n",
613
- "### Training Data\n",
614
- "\n",
615
- "<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->\n",
616
- "\n",
617
- "[More Information Needed]\n",
618
- "\n",
619
- "### Training Procedure\n",
620
- "\n",
621
- "<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->\n",
622
- "\n",
623
- "#### Preprocessing [optional]\n",
624
- "\n",
625
- "[More Information Needed]\n",
626
- "\n",
627
- "\n",
628
- "#### Training Hyperparameters\n",
629
- "\n",
630
- "- **Training regime:** {'max_dataset_size': 'full', 'bert_variety': 'bert-base-uncased', 'max_length': 256, 'num_epochs': 3, 'batch_size': 16} <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->\n",
631
- "\n",
632
- "#### Speeds, Sizes, Times [optional]\n",
633
- "\n",
634
- "<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->\n",
635
- "\n",
636
- "[More Information Needed]\n",
637
- "\n",
638
- "## Evaluation\n",
639
- "\n",
640
- "<!-- This section describes the evaluation protocols and provides the results. -->\n",
641
- "\n",
642
- "### Testing Data, Factors & Metrics\n",
643
- "\n",
644
- "#### Testing Data\n",
645
- "\n",
646
- "<!-- This should link to a Dataset Card if possible. -->\n",
647
- "\n",
648
- "[More Information Needed]\n",
649
- "\n",
650
- "#### Factors\n",
651
- "\n",
652
- "<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->\n",
653
- "\n",
654
- "[More Information Needed]\n",
655
- "\n",
656
- "#### Metrics\n",
657
- "\n",
658
- "<!-- These are the evaluation metrics being used, ideally with a description of why. -->\n",
659
- "\n",
660
- "{'loss_train': 0.154, 'loss_test': 0.978, 'acc_train': 0.959, 'acc_test': 0.705}\n",
661
- "\n",
662
- "### Results\n",
663
- "\n",
664
- "[More Information Needed]\n",
665
- "\n",
666
- "#### Summary\n",
667
- "\n",
668
- "\n",
669
- "\n",
670
- "## Model Examination [optional]\n",
671
- "\n",
672
- "<!-- Relevant interpretability work for the model goes here -->\n",
673
- "\n",
674
- "[More Information Needed]\n",
675
- "\n",
676
- "## Environmental Impact\n",
677
- "\n",
678
- "<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->\n",
679
- "\n",
680
- "Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).\n",
681
- "\n",
682
- "- **Hardware Type:** [More Information Needed]\n",
683
- "- **Hours used:** [More Information Needed]\n",
684
- "- **Cloud Provider:** [More Information Needed]\n",
685
- "- **Compute Region:** [More Information Needed]\n",
686
- "- **Carbon Emitted:** [More Information Needed]\n",
687
- "\n",
688
- "## Technical Specifications [optional]\n",
689
- "\n",
690
- "### Model Architecture and Objective\n",
691
- "\n",
692
- "[More Information Needed]\n",
693
- "\n",
694
- "### Compute Infrastructure\n",
695
- "\n",
696
- "[More Information Needed]\n",
697
- "\n",
698
- "#### Hardware\n",
699
- "\n",
700
- "[More Information Needed]\n",
701
- "\n",
702
- "#### Software\n",
703
- "\n",
704
- "[More Information Needed]\n",
705
- "\n",
706
- "## Citation [optional]\n",
707
- "\n",
708
- "<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->\n",
709
- "\n",
710
- "**BibTeX:**\n",
711
- "\n",
712
- "[More Information Needed]\n",
713
- "\n",
714
- "**APA:**\n",
715
- "\n",
716
- "[More Information Needed]\n",
717
- "\n",
718
- "## Glossary [optional]\n",
719
- "\n",
720
- "<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->\n",
721
- "\n",
722
- "[More Information Needed]\n",
723
- "\n",
724
- "## More Information [optional]\n",
725
- "\n",
726
- "[More Information Needed]\n",
727
- "\n",
728
- "## Model Card Authors [optional]\n",
729
- "\n",
730
- "[More Information Needed]\n",
731
- "\n",
732
- "## Model Card Contact\n",
733
- "\n",
734
- "[More Information Needed]\n"
735
  ]
736
  }
737
  ],
738
  "source": [
739
- "model_and_repo_name = \"frugal-ai-text-bert-base\"\n",
740
  "card_data = ModelCardData(\n",
741
  " model_name=model_and_repo_name,\n",
742
- " base_model=\"google-bert/bert-base-uncased\",\n",
743
  " license=\"apache-2.0\",\n",
744
  " language=[\"en\"],\n",
745
  " datasets=[\"QuotaClimat/frugalaichallenge-text-train\"],\n",
@@ -827,6 +625,7 @@
827
  " truncation=True,\n",
828
  " padding=True,\n",
829
  " return_tensors=\"pt\",\n",
830
  ")\n",
831
  "\n",
832
  "with torch.no_grad():\n",
@@ -967,105 +766,7 @@
967
  },
968
  "widgets": {
969
  "application/vnd.jupyter.widget-state+json": {
970
- "state": {
971
- "47fba054bcbc4563934b6d25ea787e43": {
972
- "model_module": "@jupyter-widgets/base",
973
- "model_module_version": "2.0.0",
974
- "model_name": "LayoutModel",
975
- "state": {}
976
- },
977
- "5cdf8fe39a634d048f2140b3af85165f": {
978
- "model_module": "@jupyter-widgets/base",
979
- "model_module_version": "2.0.0",
980
- "model_name": "LayoutModel",
981
- "state": {}
982
- },
983
- "6a6b93c568744ed48ba6c58f84c3d59a": {
984
- "model_module": "@jupyter-widgets/base",
985
- "model_module_version": "2.0.0",
986
- "model_name": "LayoutModel",
987
- "state": {}
988
- },
989
- "802b81b278a34a1a9ed480ca2ae299a0": {
990
- "model_module": "@jupyter-widgets/controls",
991
- "model_module_version": "2.0.0",
992
- "model_name": "HTMLModel",
993
- "state": {
994
- "layout": "IPY_MODEL_47fba054bcbc4563934b6d25ea787e43",
995
- "style": "IPY_MODEL_cab10a06b0064a4f876d47bbd5dda288",
996
- "value": "model.safetensors: 100%"
997
- }
998
- },
999
- "80984aaf16ce41ce839cc4bd5c0ea202": {
1000
- "model_module": "@jupyter-widgets/base",
1001
- "model_module_version": "2.0.0",
1002
- "model_name": "LayoutModel",
1003
- "state": {}
1004
- },
1005
- "87a62c5c11cc43649d6ce177ab39f244": {
1006
- "model_module": "@jupyter-widgets/controls",
1007
- "model_module_version": "2.0.0",
1008
- "model_name": "HTMLStyleModel",
1009
- "state": {
1010
- "description_width": "",
1011
- "font_size": null,
1012
- "text_color": null
1013
- }
1014
- },
1015
- "8b033d0c246145a082c43e73d1377035": {
1016
- "model_module": "@jupyter-widgets/controls",
1017
- "model_module_version": "2.0.0",
1018
- "model_name": "HTMLModel",
1019
- "state": {
1020
- "layout": "IPY_MODEL_5cdf8fe39a634d048f2140b3af85165f",
1021
- "style": "IPY_MODEL_87a62c5c11cc43649d6ce177ab39f244",
1022
- "value": " 438M/438M [00:15&lt;00:00, 22.9MB/s]"
1023
- }
1024
- },
1025
- "c5eebb3e916e4c59864d29582ab336bf": {
1026
- "model_module": "@jupyter-widgets/controls",
1027
- "model_module_version": "2.0.0",
1028
- "model_name": "ProgressStyleModel",
1029
- "state": {
1030
- "description_width": ""
1031
- }
1032
- },
1033
- "cab10a06b0064a4f876d47bbd5dda288": {
1034
- "model_module": "@jupyter-widgets/controls",
1035
- "model_module_version": "2.0.0",
1036
- "model_name": "HTMLStyleModel",
1037
- "state": {
1038
- "description_width": "",
1039
- "font_size": null,
1040
- "text_color": null
1041
- }
1042
- },
1043
- "d83e79effc3542f49c38928463bb41ec": {
1044
- "model_module": "@jupyter-widgets/controls",
1045
- "model_module_version": "2.0.0",
1046
- "model_name": "FloatProgressModel",
1047
- "state": {
1048
- "bar_style": "success",
1049
- "layout": "IPY_MODEL_6a6b93c568744ed48ba6c58f84c3d59a",
1050
- "max": 437977072,
1051
- "style": "IPY_MODEL_c5eebb3e916e4c59864d29582ab336bf",
1052
- "value": 437977072
1053
- }
1054
- },
1055
- "fbc09ae2c5614831a2fb02fa48a44fd1": {
1056
- "model_module": "@jupyter-widgets/controls",
1057
- "model_module_version": "2.0.0",
1058
- "model_name": "HBoxModel",
1059
- "state": {
1060
- "children": [
1061
- "IPY_MODEL_802b81b278a34a1a9ed480ca2ae299a0",
1062
- "IPY_MODEL_d83e79effc3542f49c38928463bb41ec",
1063
- "IPY_MODEL_8b033d0c246145a082c43e73d1377035"
1064
- ],
1065
- "layout": "IPY_MODEL_80984aaf16ce41ce839cc4bd5c0ea202"
1066
- }
1067
- }
1068
- },
1069
  "version_major": 2,
1070
  "version_minor": 0
1071
  }
 
14
  "id": "73e72549-69f2-46b5-b0f5-655777139972",
15
  "metadata": {
16
  "execution": {
17
+ "iopub.execute_input": "2025-01-21T19:25:48.302003Z",
18
+ "iopub.status.busy": "2025-01-21T19:25:48.301808Z",
19
+ "iopub.status.idle": "2025-01-21T19:25:50.698806Z",
20
+ "shell.execute_reply": "2025-01-21T19:25:50.698535Z",
21
+ "shell.execute_reply.started": "2025-01-21T19:25:48.301982Z"
22
  }
23
  },
24
  "outputs": [],
 
45
  "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
46
  "metadata": {
47
  "execution": {
48
+ "iopub.execute_input": "2025-01-21T19:25:50.699344Z",
49
+ "iopub.status.busy": "2025-01-21T19:25:50.699200Z",
50
+ "iopub.status.idle": "2025-01-21T19:25:50.701241Z",
51
+ "shell.execute_reply": "2025-01-21T19:25:50.700993Z",
52
+ "shell.execute_reply.started": "2025-01-21T19:25:50.699335Z"
53
  }
54
  },
55
  "outputs": [],
 
71
  "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
72
  "metadata": {
73
  "execution": {
74
+ "iopub.execute_input": "2025-01-21T19:25:50.701789Z",
75
+ "iopub.status.busy": "2025-01-21T19:25:50.701708Z",
76
+ "iopub.status.idle": "2025-01-21T19:25:50.707095Z",
77
+ "shell.execute_reply": "2025-01-21T19:25:50.706788Z",
78
+ "shell.execute_reply.started": "2025-01-21T19:25:50.701781Z"
79
  }
80
  },
81
  "outputs": [],
 
109
  " avg_loss = total_loss / len(dataloader)\n",
110
  " avg_acc = total_correct / total_length\n",
111
  " model.train()\n",
112
+ " return float(avg_loss), float(avg_acc)\n",
113
  "\n",
114
  "\n",
115
  "def print_model_status(epoch, num_epochs, model, train_dataloader, test_dataloader):\n",
 
117
  " test_loss, test_acc = model_metrics(model, test_dataloader)\n",
118
  " loss_str = f\"Loss: Train {train_loss:0.3f}, Test {test_loss:0.3f}\"\n",
119
  " acc_str = f\"Acc: Train {train_acc:0.3f}, Test {test_acc:0.3f}\"\n",
120
+ " my_print(f\"Epoch {epoch+1:2}/{num_epochs} done. {loss_str}; and {acc_str}\")\n",
121
+ " metrics = dict(\n",
122
+ " train_loss=train_loss,\n",
123
+ " train_acc=train_acc,\n",
124
+ " test_loss=test_loss,\n",
125
+ " test_acc=test_acc,\n",
126
+ " )\n",
127
+ " return metrics\n",
128
  "\n",
129
  "\n",
130
  "class BertClassifier(nn.Module, PyTorchModelHubMixin):\n",
 
143
  "\n",
144
  "\n",
145
  "class TextDataset(Dataset):\n",
146
+ " def __init__(self, texts, labels, tokenizer, max_length=256):\n",
147
  " self.encodings = tokenizer(\n",
148
  " texts,\n",
149
  " truncation=True,\n",
 
167
  " criterion = nn.CrossEntropyLoss()\n",
168
  " model.train()\n",
169
  "\n",
170
+ " _ = print_model_status(-1, num_epochs, model, train_dataloader, test_dataloader)\n",
171
  " for epoch in range(num_epochs):\n",
172
  " total_loss = 0\n",
173
  " for batch in train_dataloader:\n",
 
185
  "\n",
186
  " total_loss += loss.item()\n",
187
  " avg_loss = total_loss / len(train_dataloader)\n",
188
+ " metrics = print_model_status(\n",
189
+ " epoch, num_epochs, model, train_dataloader, test_dataloader\n",
190
+ " )\n",
191
+ " return metrics"
192
  ]
193
  },
194
  {
 
197
  "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
198
  "metadata": {
199
  "execution": {
200
+ "iopub.execute_input": "2025-01-21T19:25:50.707655Z",
201
+ "iopub.status.busy": "2025-01-21T19:25:50.707519Z",
202
+ "iopub.status.idle": "2025-01-21T19:25:50.718311Z",
203
+ "shell.execute_reply": "2025-01-21T19:25:50.718037Z",
204
+ "shell.execute_reply.started": "2025-01-21T19:25:50.707646Z"
205
  }
206
  },
207
  "outputs": [],
 
221
  "id": "695bc080-bbd7-4937-af5b-50db1c936500",
222
  "metadata": {
223
  "execution": {
224
+ "iopub.execute_input": "2025-01-21T19:25:50.718754Z",
225
+ "iopub.status.busy": "2025-01-21T19:25:50.718677Z",
226
+ "iopub.status.idle": "2025-01-21T19:25:50.721834Z",
227
+ "shell.execute_reply": "2025-01-21T19:25:50.721583Z",
228
+ "shell.execute_reply.started": "2025-01-21T19:25:50.718746Z"
229
  }
230
  },
231
  "outputs": [],
 
233
  "def run_training(\n",
234
  " max_dataset_size=16 * 200,\n",
235
  " bert_variety=\"bert-base-uncased\",\n",
236
+ " max_length=256,\n",
237
  " num_epochs=3,\n",
238
  " batch_size=32,\n",
239
  "):\n",
240
+ " training_regime = dict(\n",
241
+ " max_dataset_size=max_dataset_size,\n",
242
+ " bert_variety=bert_variety,\n",
243
+ " max_length=max_length,\n",
244
+ " num_epochs=num_epochs,\n",
245
+ " batch_size=batch_size,\n",
246
+ " )\n",
247
  " hf_dataset = load_dataset(\"quotaclimat/frugalaichallenge-text-train\")\n",
248
  " test_size = 0.2\n",
249
  " test_seed = 42\n",
 
289
  " text_dataset_test, batch_size=batch_size, shuffle=False\n",
290
  " )\n",
291
  "\n",
292
+ " metrics = train_model(\n",
293
+ " model, dataloader_train, dataloader_test, device, num_epochs=num_epochs\n",
294
+ " )\n",
295
+ " return model, tokenizer, training_regime, metrics"
296
  ]
297
  },
298
  {
 
321
  },
322
  {
323
  "cell_type": "code",
324
+ "execution_count": null,
325
+ "id": "34a7c310-c486-4db1-b94d-4363c3d3df5b",
326
  "metadata": {
327
  "execution": {
328
+ "iopub.execute_input": "2025-01-21T19:25:50.724036Z",
329
+ "iopub.status.busy": "2025-01-21T19:25:50.723968Z"
330
  }
331
  },
332
+ "outputs": [],
333
  "source": [
334
+ "model, tokenizer, regime, metrics = run_training(\n",
335
+ " max_dataset_size=16 * 10,\n",
336
+ " bert_variety=\"google/bert_uncased_L-2_H-128_A-2\",\n",
337
  " max_length=128,\n",
338
+ " num_epochs=4,\n",
339
  " batch_size=32,\n",
340
  ")"
341
  ]
342
  },
343
  {
344
  "cell_type": "code",
345
+ "execution_count": null,
346
+ "id": "32abaa1b-11f4-4793-97b8-36bb2dc29d56",
347
+ "metadata": {},
348
+ "outputs": [],
349
+ "source": [
350
+ "regime"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "id": "fe108690-bcc1-4667-9f8e-907a1a8ac2ec",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "metrics"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": null,
366
  "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
367
  "metadata": {
368
+ "editable": true,
369
+ "slideshow": {
370
+ "slide_type": ""
371
+ },
372
+ "tags": []
373
  },
374
+ "outputs": [],
375
  "source": [
376
  "model.eval()\n",
377
  "test_text = [\n",
 
388
  " truncation=True,\n",
389
  " padding=True,\n",
390
  " return_tensors=\"pt\",\n",
391
+ " max_length=256,\n",
392
  ")\n",
393
  "\n",
394
  "with torch.no_grad():\n",
 
408
  ]
409
  },
410
  {
411
+ "cell_type": "markdown",
412
+ "id": "6264418d-10ef-4eca-b188-2b6b7f487797",
413
+ "metadata": {},
414
  "source": [
415
+ "Overall top performance per model. Machine: bert-base is using an Nvidia 1xL40S, no inference time cleaverness attempted.\n",
416
+ "\n",
417
+ "[accidentally cheating bert-base by trainging on full dataset](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250117_220350.json):\\\n",
418
+ "acc 0.954, energy 0.736 Wh, emissions 0.272 gco2eq\n",
419
+ "\n",
420
+ "[bert-base some hp tuning](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250120_231350.json):\\\n",
421
+ "acc 0.707, energy 0.803 Wh, emissions 0.296 gco2eq\n"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "markdown",
426
+ "id": "df067c27-9d58-49fc-860d-ba79e5512013",
427
+ "metadata": {},
428
+ "source": [
429
+ "Looking at bert-tiny.\n",
430
+ "Scanning max_length and batch_size with num_epochs set to 3, looks like we want 256 and 16. That gets us\\\n",
431
+ "`2025-01-21 10:18:56 Epoch 3/3 done. Loss: Train 1.368, Test 1.432; and Acc: Train 0.499, Test 0.477`.\n",
432
+ "\n",
433
+ "Then looking at num_epochs, we saturate test set performance at 15 (~3 min), giving e.g.\\\n",
434
+ "`2025-01-21 10:38:30 Epoch 15/20 done. Loss: Train 0.553, Test 1.157; and Acc: Train 0.833, Test 0.595`"
435
  ]
436
  },
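For reference, a minimal sketch of that scan, assuming `run_training` as defined in this notebook; the grid values and the `results` bookkeeping are illustrative, not the exact runs reported above:

```python
# Hypothetical sketch of the max_length / batch_size scan (num_epochs=3).
# Assumes run_training() from this notebook; grid values are examples.
results = {}
for max_length in (128, 256):
    for batch_size in (16, 32):
        model, tokenizer, regime, metrics = run_training(
            max_dataset_size="full",
            bert_variety="google/bert_uncased_L-2_H-128_A-2",
            max_length=max_length,
            num_epochs=3,
            batch_size=batch_size,
        )
        # metrics carries train_loss, train_acc, test_loss, test_acc.
        results[(max_length, batch_size)] = metrics["test_acc"]

best_max_length, best_batch_size = max(results, key=results.get)
```

With 256 and 16 selected, a longer num_epochs run (as in the cells below) probes where test accuracy saturates.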
437
  {
438
  "cell_type": "code",
439
+ "execution_count": 32,
440
+ "id": "37794952-703c-466c-9d26-ee6cb2834246",
441
  "metadata": {
442
  "execution": {
443
+ "iopub.execute_input": "2025-01-21T18:35:29.897653Z",
444
+ "iopub.status.busy": "2025-01-21T18:35:29.897020Z",
445
+ "iopub.status.idle": "2025-01-21T18:35:29.901748Z",
446
+ "shell.execute_reply": "2025-01-21T18:35:29.901032Z",
447
+ "shell.execute_reply.started": "2025-01-21T18:35:29.897609Z"
448
  }
449
  },
450
+ "outputs": [],
451
  "source": [
452
+ "static_hyperparams = dict(\n",
453
  " max_dataset_size=\"full\",\n",
454
+ " bert_variety=\"google/bert_uncased_L-2_H-128_A-2\",\n",
455
+ " max_length=256,\n",
456
  " batch_size=16,\n",
457
  ")"
458
  ]
459
  },
460
  {
461
  "cell_type": "code",
462
+ "execution_count": 34,
463
  "id": "28354e8c-886a-4523-8968-8c688c13f6a3",
464
  "metadata": {
465
  "execution": {
466
+ "iopub.execute_input": "2025-01-21T18:42:35.614137Z",
467
+ "iopub.status.busy": "2025-01-21T18:42:35.613694Z",
468
+ "iopub.status.idle": "2025-01-21T18:45:35.341816Z",
469
+ "shell.execute_reply": "2025-01-21T18:45:35.341535Z",
470
+ "shell.execute_reply.started": "2025-01-21T18:42:35.614111Z"
471
  }
472
  },
473
  "outputs": [
 
475
  "name": "stdout",
476
  "output_type": "stream",
477
  "text": [
478
+ "2025-01-21 10:43:44 Epoch 0/15 done. Loss: Train 2.177, Test 2.172; and Acc: Train 0.063, Test 0.071\n",
479
+ "2025-01-21 10:43:52 Epoch 1/15 done. Loss: Train 1.786, Test 1.823; and Acc: Train 0.383, Test 0.354\n",
480
+ "2025-01-21 10:44:00 Epoch 2/15 done. Loss: Train 1.579, Test 1.628; and Acc: Train 0.465, Test 0.436\n",
481
+ "2025-01-21 10:44:07 Epoch 3/15 done. Loss: Train 1.431, Test 1.498; and Acc: Train 0.510, Test 0.484\n",
482
+ "2025-01-21 10:44:14 Epoch 4/15 done. Loss: Train 1.304, Test 1.402; and Acc: Train 0.555, Test 0.515\n",
483
+ "2025-01-21 10:44:22 Epoch 5/15 done. Loss: Train 1.212, Test 1.339; and Acc: Train 0.585, Test 0.535\n",
484
+ "2025-01-21 10:44:29 Epoch 6/15 done. Loss: Train 1.128, Test 1.288; and Acc: Train 0.611, Test 0.546\n",
485
+ "2025-01-21 10:44:36 Epoch 7/15 done. Loss: Train 1.039, Test 1.241; and Acc: Train 0.643, Test 0.559\n",
486
+ "2025-01-21 10:44:44 Epoch 8/15 done. Loss: Train 1.003, Test 1.236; and Acc: Train 0.665, Test 0.555\n",
487
+ "2025-01-21 10:44:51 Epoch 9/15 done. Loss: Train 0.897, Test 1.183; and Acc: Train 0.708, Test 0.568\n",
488
+ "2025-01-21 10:44:58 Epoch 10/15 done. Loss: Train 0.852, Test 1.187; and Acc: Train 0.724, Test 0.572\n",
489
+ "2025-01-21 10:45:06 Epoch 11/15 done. Loss: Train 0.769, Test 1.154; and Acc: Train 0.755, Test 0.581\n",
490
+ "2025-01-21 10:45:13 Epoch 12/15 done. Loss: Train 0.764, Test 1.197; and Acc: Train 0.752, Test 0.573\n",
491
+ "2025-01-21 10:45:20 Epoch 13/15 done. Loss: Train 0.660, Test 1.153; and Acc: Train 0.797, Test 0.590\n",
492
+ "2025-01-21 10:45:28 Epoch 14/15 done. Loss: Train 0.588, Test 1.143; and Acc: Train 0.820, Test 0.594\n",
493
+ "2025-01-21 10:45:35 Epoch 15/15 done. Loss: Train 0.579, Test 1.200; and Acc: Train 0.822, Test 0.575\n"
494
  ]
495
  }
496
  ],
497
  "source": [
498
+ "model, tokenizer, training_regime, testing_metrics = run_training(\n",
499
+ " **static_hyperparams,\n",
500
+ " num_epochs=15,\n",
501
  ")"
502
  ]
503
  },
 
511
  },
512
  {
513
  "cell_type": "code",
514
+ "execution_count": 35,
515
  "id": "ec2516f9-79f2-4ae1-ab9a-9a51a7a50587",
516
  "metadata": {
517
  "execution": {
518
+ "iopub.execute_input": "2025-01-21T18:57:29.278360Z",
519
+ "iopub.status.busy": "2025-01-21T18:57:29.276985Z",
520
+ "iopub.status.idle": "2025-01-21T18:57:29.289810Z",
521
+ "shell.execute_reply": "2025-01-21T18:57:29.288574Z",
522
+ "shell.execute_reply.started": "2025-01-21T18:57:29.278315Z"
523
  },
524
  "scrolled": true
525
  },
526
  "outputs": [
527
  {
528
+ "ename": "SyntaxError",
529
+ "evalue": "invalid syntax. Perhaps you forgot a comma? (3495586751.py, line 4)",
530
+ "output_type": "error",
531
+ "traceback": [
532
+ "\u001b[0;36m Cell \u001b[0;32mIn[35], line 4\u001b[0;36m\u001b[0m\n\u001b[0;31m base_model=static_hyperparams[],\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax. Perhaps you forgot a comma?\n"
533
  ]
534
  }
535
  ],
536
  "source": [
537
+ "model_and_repo_name = \"frugal-ai-text-bert-tiny\"\n",
538
  "card_data = ModelCardData(\n",
539
  " model_name=model_and_repo_name,\n",
540
+ " base_model=static_hyperparams[\"bert_variety\"],\n",
541
  " license=\"apache-2.0\",\n",
542
  " language=[\"en\"],\n",
543
  " datasets=[\"QuotaClimat/frugalaichallenge-text-train\"],\n",
 
625
  " truncation=True,\n",
626
  " padding=True,\n",
627
  " return_tensors=\"pt\",\n",
628
+ " max_length=256,\n",
629
  ")\n",
630
  "\n",
631
  "with torch.no_grad():\n",
 
766
  },
767
  "widgets": {
768
  "application/vnd.jupyter.widget-state+json": {
769
+ "state": {},
770
  "version_major": 2,
771
  "version_minor": 0
772
  }