Commit 80df7c4 · Parent(s): ec64986

Trained all 5 sizes of bert.

Finetune BERT.ipynb  +189 -164

Finetune BERT.ipynb  CHANGED
@@ -14,11 +14,11 @@
   "id": "73e72549-69f2-46b5-b0f5-655777139972",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:16:12.117877Z",
+    "iopub.status.busy": "2025-01-22T18:16:12.117575Z",
+    "iopub.status.idle": "2025-01-22T18:16:15.083870Z",
+    "shell.execute_reply": "2025-01-22T18:16:15.083640Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:12.117851Z"
    }
   },
   "outputs": [],
@@ -45,11 +45,11 @@
   "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:16:15.084435Z",
+    "iopub.status.busy": "2025-01-22T18:16:15.084268Z",
+    "iopub.status.idle": "2025-01-22T18:16:15.086255Z",
+    "shell.execute_reply": "2025-01-22T18:16:15.086031Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:15.084427Z"
    }
   },
   "outputs": [],
@@ -71,11 +71,11 @@
   "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:16:15.086764Z",
+    "iopub.status.busy": "2025-01-22T18:16:15.086669Z",
+    "iopub.status.idle": "2025-01-22T18:16:15.091701Z",
+    "shell.execute_reply": "2025-01-22T18:16:15.091514Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:15.086757Z"
    }
   },
   "outputs": [],
@@ -199,11 +199,11 @@
   "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:16:15.092028Z",
+    "iopub.status.busy": "2025-01-22T18:16:15.091969Z",
+    "iopub.status.idle": "2025-01-22T18:16:15.108312Z",
+    "shell.execute_reply": "2025-01-22T18:16:15.108075Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:15.092021Z"
    }
   },
   "outputs": [],
@@ -223,11 +223,11 @@
   "id": "695bc080-bbd7-4937-af5b-50db1c936500",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:16:15.108777Z",
+    "iopub.status.busy": "2025-01-22T18:16:15.108669Z",
+    "iopub.status.idle": "2025-01-22T18:16:15.111839Z",
+    "shell.execute_reply": "2025-01-22T18:16:15.111545Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:15.108767Z"
    }
   },
   "outputs": [],
@@ -305,6 +305,25 @@
    "# Exploration"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 6,
+  "id": "11890d3b-8bcb-4a9b-b421-5431081cca39",
+  "metadata": {
+   "execution": {
+    "iopub.execute_input": "2025-01-22T18:16:15.113676Z",
+    "iopub.status.busy": "2025-01-22T18:16:15.113576Z",
+    "iopub.status.idle": "2025-01-22T18:16:15.115080Z",
+    "shell.execute_reply": "2025-01-22T18:16:15.114867Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:15.113668Z"
+   }
+  },
+  "outputs": [],
+  "source": [
+   "base_model_repo = \"google/bert_uncased_L-12_H-768_A-12\"\n",
+   "model_and_repo_name = \"frugal-ai-text-bert-base\""
+  ]
+ },
  {
   "cell_type": "markdown",
   "id": "a847135f-ce86-46a1-9c61-3459a847cb29",
@@ -323,15 +342,15 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 7,
   "id": "34a7c310-c486-4db1-b94d-4363c3d3df5b",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:16:15.115472Z",
+    "iopub.status.busy": "2025-01-22T18:16:15.115400Z",
+    "iopub.status.idle": "2025-01-22T18:19:33.994125Z",
+    "shell.execute_reply": "2025-01-22T18:19:33.993854Z",
+    "shell.execute_reply.started": "2025-01-22T18:16:15.115464Z"
    }
   },
   "outputs": [
@@ -339,35 +358,35 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "2025-01-22
-     "2025-01-22
-     "2025-01-22
-     "2025-01-22
+     "2025-01-22 13:16:38 Epoch 0/3 done. Loss: Train 2.066, Test 2.091; and Acc: Train 0.185, Test 0.157\n",
+     "2025-01-22 13:17:36 Epoch 1/3 done. Loss: Train 1.089, Test 1.279; and Acc: Train 0.627, Test 0.555\n",
+     "2025-01-22 13:18:35 Epoch 2/3 done. Loss: Train 0.624, Test 1.044; and Acc: Train 0.839, Test 0.642\n",
+     "2025-01-22 13:19:33 Epoch 3/3 done. Loss: Train 0.294, Test 1.047; and Acc: Train 0.928, Test 0.648\n"
     ]
    }
   ],
   "source": [
    "model, tokenizer, regime, metrics = run_training(\n",
    " max_dataset_size=16 * 100,\n",
-   " bert_variety
+   " bert_variety=base_model_repo,\n",
    " max_length=128,\n",
    " num_epochs=3,\n",
-   " batch_size=
+   " batch_size=16,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 8,
   "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
   "metadata": {
    "editable": true,
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:19:33.994637Z",
+    "iopub.status.busy": "2025-01-22T18:19:33.994547Z",
+    "iopub.status.idle": "2025-01-22T18:19:34.064925Z",
+    "shell.execute_reply": "2025-01-22T18:19:34.064678Z",
+    "shell.execute_reply.started": "2025-01-22T18:19:33.994628Z"
    },
    "slideshow": {
     "slide_type": ""
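For readers without the rest of the notebook: `run_training` is defined outside this diff. Below is a minimal, hypothetical sketch of a helper with this signature, assuming a standard `transformers` sequence-classification fine-tune with 8 labels (inferred from the model card text); the placeholder data, learning rate, and return values are assumptions, not the notebook's actual implementation.

```python
# Hypothetical sketch only: the real run_training() is not part of this commit.
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def run_training(max_dataset_size, bert_variety, max_length, num_epochs, batch_size):
    # Placeholder data; the notebook trains on the Frugal AI Challenge text data instead.
    texts = ["placeholder climate claim"] * 64
    labels = [0] * 64
    if max_dataset_size != "full":
        texts, labels = texts[:max_dataset_size], labels[:max_dataset_size]

    tokenizer = AutoTokenizer.from_pretrained(bert_variety)
    model = AutoModelForSequenceClassification.from_pretrained(bert_variety, num_labels=8)
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    model.to(device)

    enc = tokenizer(texts, truncation=True, padding="max_length",
                    max_length=max_length, return_tensors="pt")
    loader = DataLoader(
        TensorDataset(enc["input_ids"], enc["attention_mask"], torch.tensor(labels)),
        batch_size=batch_size, shuffle=True,
    )
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    model.train()
    for _ in range(num_epochs):
        for input_ids, attention_mask, batch_labels in loader:
            optimizer.zero_grad()
            out = model(input_ids=input_ids.to(device),
                        attention_mask=attention_mask.to(device),
                        labels=batch_labels.to(device))
            out.loss.backward()
            optimizer.step()

    # The real helper also evaluates a held-out split and prints the per-epoch lines seen above.
    return model, tokenizer, None, None
```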
@@ -379,7 +398,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "2025-01-22
+     "2025-01-22 13:19:34 Predictions: tensor([0, 0, 3, 6, 2, 4, 6], device='mps:0')\n"
     ]
    }
   ],
@@ -426,10 +445,10 @@
    "Overall top performance per model. Machine: bert-base is using an Nvidia 1xL40S, no inference time cleaverness attempted.\n",
    "\n",
    "[accidentally cheating bert-base by trainging on full dataset](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250117_220350.json):\\\n",
-   "acc 0.954, energy 0.736 Wh
+   "acc 0.954, energy 0.736 Wh\n",
    "\n",
    "[bert-base some hp tuning](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250120_231350.json):\\\n",
-   "acc 0.707, energy 0.803 Wh
+   "acc 0.707, energy 0.803 Wh\n",
    "\n",
    "bert-tiny, Nvidia 1xL40S:\n",
    "\n",
@@ -445,8 +464,20 @@
    "Scanning max_length and batch_size with num_epochs set to 3, looks like we want 256 and 16. That gets us\\\n",
    "`2025-01-21 10:18:56 Epoch 3/3 done. Loss: Train 1.368, Test 1.432; and Acc: Train 0.499, Test 0.477`.\n",
    "\n",
-   "Then looking at num_epochs, we saturate test set performance at 15 (~3
-   "`2025-01-21 10:38:30 Epoch 15/20 done. Loss: Train 0.553, Test 1.157; and Acc: Train 0.833, Test 0.595
+   "Then looking at num_epochs, we saturate test set performance at 15 (~3 minutes), giving e.g.\\\n",
+   "`2025-01-21 10:38:30 Epoch 15/20 done. Loss: Train 0.553, Test 1.157; and Acc: Train 0.833, Test 0.595`\n",
+   "\n",
+   "For bert-mini, just looking at num_epochs, we choose 8\\\n",
+   "`2025-01-22 10:56:12 Epoch 8/20 done. Loss: Train 0.305, Test 1.090; and Acc: Train 0.920, Test 0.646`\n",
+   "\n",
+   "For bert-small, 4\\\n",
+   "`2025-01-22 11:39:41 Epoch 4/15 done. Loss: Train 0.301, Test 0.978; and Acc: Train 0.920, Test 0.664`\n",
+   "\n",
+   "For bert-medium, 4\\\n",
+   "`2025-01-22 12:09:51 Epoch 4/10 done. Loss: Train 0.294, Test 1.020; and Acc: Train 0.922, Test 0.660`\n",
+   "\n",
+   "For bert-base, 3 does happen to be correct, just checking for completeness\\\n",
+   "`2025-01-22 12:59:10 Epoch 3/7 done. Loss: Train 0.156, Test 0.930; and Acc: Train 0.964, Test 0.703`"
   ]
  },
  {
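The five sizes referenced in the commit message correspond to Google's uncased BERT miniatures. Only the bert-base repo ID appears in this commit, so the smaller repo IDs in the sketch below are assumed from the standard `google/bert_uncased_*` naming and should be double-checked; the loop simply mirrors the num_epochs scan described in the markdown above, reusing this notebook's own `run_training` call shape.

```python
# Assumed repo IDs for the five sizes (only bert-base is defined in this commit).
bert_sizes = {
    "bert-tiny": "google/bert_uncased_L-2_H-128_A-2",
    "bert-mini": "google/bert_uncased_L-4_H-256_A-4",
    "bert-small": "google/bert_uncased_L-4_H-512_A-8",
    "bert-medium": "google/bert_uncased_L-8_H-512_A-8",
    "bert-base": "google/bert_uncased_L-12_H-768_A-12",
}

# Train each size with a generous epoch budget, then read off from the per-epoch
# logs where test accuracy saturates (15, 8, 4, 4, and 3 epochs in the notes above).
for name, repo in bert_sizes.items():
    run_training(
        max_dataset_size="full",
        bert_variety=repo,
        max_length=256,
        batch_size=16,
        num_epochs=20,
    )
```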
@@ -455,18 +486,18 @@
   "id": "37794952-703c-466c-9d26-ee6cb2834246",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:19:34.065427Z",
+    "iopub.status.busy": "2025-01-22T18:19:34.065327Z",
+    "iopub.status.idle": "2025-01-22T18:19:34.066925Z",
+    "shell.execute_reply": "2025-01-22T18:19:34.066714Z",
+    "shell.execute_reply.started": "2025-01-22T18:19:34.065418Z"
    }
   },
   "outputs": [],
   "source": [
    "static_hyperparams = dict(\n",
    " max_dataset_size=\"full\",\n",
-   " bert_variety
+   " bert_variety=base_model_repo,\n",
    " max_length=256,\n",
    " batch_size=16,\n",
    ")"
@@ -478,11 +509,11 @@
   "id": "28354e8c-886a-4523-8968-8c688c13f6a3",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:19:34.067286Z",
+    "iopub.status.busy": "2025-01-22T18:19:34.067206Z",
+    "iopub.status.idle": "2025-01-22T18:38:14.108104Z",
+    "shell.execute_reply": "2025-01-22T18:38:14.107193Z",
+    "shell.execute_reply.started": "2025-01-22T18:19:34.067278Z"
    }
   },
   "outputs": [
@@ -490,29 +521,17 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "2025-01-22
-     "2025-01-22
-     "2025-01-22
-     "2025-01-22
-     "2025-01-22 09:30:00 Epoch 4/15 done. Loss: Train 1.274, Test 1.362; and Acc: Train 0.555, Test 0.523\n",
-     "2025-01-22 09:30:07 Epoch 5/15 done. Loss: Train 1.179, Test 1.300; and Acc: Train 0.588, Test 0.540\n",
-     "2025-01-22 09:30:15 Epoch 6/15 done. Loss: Train 1.097, Test 1.259; and Acc: Train 0.632, Test 0.550\n",
-     "2025-01-22 09:30:22 Epoch 7/15 done. Loss: Train 1.026, Test 1.225; and Acc: Train 0.659, Test 0.567\n",
-     "2025-01-22 09:30:30 Epoch 8/15 done. Loss: Train 0.947, Test 1.196; and Acc: Train 0.683, Test 0.580\n",
-     "2025-01-22 09:30:37 Epoch 9/15 done. Loss: Train 0.879, Test 1.176; and Acc: Train 0.717, Test 0.586\n",
-     "2025-01-22 09:30:44 Epoch 10/15 done. Loss: Train 0.817, Test 1.155; and Acc: Train 0.735, Test 0.600\n",
-     "2025-01-22 09:30:52 Epoch 11/15 done. Loss: Train 0.757, Test 1.148; and Acc: Train 0.763, Test 0.599\n",
-     "2025-01-22 09:30:59 Epoch 12/15 done. Loss: Train 0.700, Test 1.139; and Acc: Train 0.786, Test 0.603\n",
-     "2025-01-22 09:31:07 Epoch 13/15 done. Loss: Train 0.636, Test 1.137; and Acc: Train 0.806, Test 0.599\n",
-     "2025-01-22 09:31:14 Epoch 14/15 done. Loss: Train 0.582, Test 1.128; and Acc: Train 0.823, Test 0.604\n",
-     "2025-01-22 09:31:22 Epoch 15/15 done. Loss: Train 0.535, Test 1.134; and Acc: Train 0.837, Test 0.618\n"
+     "2025-01-22 13:21:10 Epoch 0/3 done. Loss: Train 2.088, Test 2.085; and Acc: Train 0.137, Test 0.135\n",
+     "2025-01-22 13:26:50 Epoch 1/3 done. Loss: Train 0.780, Test 1.012; and Acc: Train 0.747, Test 0.648\n",
+     "2025-01-22 13:32:30 Epoch 2/3 done. Loss: Train 0.346, Test 0.890; and Acc: Train 0.904, Test 0.689\n",
+     "2025-01-22 13:38:14 Epoch 3/3 done. Loss: Train 0.167, Test 0.968; and Acc: Train 0.959, Test 0.691\n"
     ]
    }
   ],
   "source": [
    "model, tokenizer, training_regime, testing_metrics = run_training(\n",
    " **static_hyperparams,\n",
-   " num_epochs=
+   " num_epochs=3,\n",
    ")"
   ]
  },
@@ -526,21 +545,20 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 11,
   "id": "ec2516f9-79f2-4ae1-ab9a-9a51a7a50587",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:38:14.109094Z",
+    "iopub.status.busy": "2025-01-22T18:38:14.108996Z",
+    "iopub.status.idle": "2025-01-22T18:38:14.124982Z",
+    "shell.execute_reply": "2025-01-22T18:38:14.124768Z",
+    "shell.execute_reply.started": "2025-01-22T18:38:14.109081Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
-   "model_and_repo_name = \"frugal-ai-text-bert-tiny\"\n",
    "card_data = ModelCardData(\n",
    " model_name=model_and_repo_name,\n",
    " base_model=static_hyperparams[\"bert_variety\"],\n",
@@ -552,7 +570,7 @@
    ")\n",
    "card = ModelCard.from_template(\n",
    " card_data,\n",
-   " model_summary
+   " model_summary=f\"Classify text into 8 categories of climate misinformation using {base_model_repo}.\",\n",
    " model_description=\"Fine trained BERT for classifying climate information as part of the Frugal AI Challenge, for submission to https://huggingface.co/frugal-ai-challenge and scoring on accuracy and efficiency. Trainied on only the non-evaluation 80% of the data, so it's (non-cheating) score will be lower.\",\n",
    " developers=\"Andre Bach\",\n",
    " funded_by=\"N/A\",\n",
|
|
568 |
},
|
569 |
{
|
570 |
"cell_type": "code",
|
571 |
-
"execution_count":
|
572 |
"id": "29d3bbf9-ab2a-48e2-a550-e16da5025720",
|
573 |
"metadata": {
|
574 |
"execution": {
|
575 |
-
"iopub.execute_input": "2025-01-
|
576 |
-
"iopub.status.busy": "2025-01-
|
577 |
-
"iopub.status.idle": "2025-01-
|
578 |
-
"shell.execute_reply": "2025-01-
|
579 |
-
"shell.execute_reply.started": "2025-01-
|
580 |
}
|
581 |
},
|
582 |
"outputs": [],
|
@@ -587,15 +605,15 @@
|
|
587 |
},
|
588 |
{
|
589 |
"cell_type": "code",
|
590 |
-
"execution_count":
|
591 |
"id": "e3b099c6-6b98-473b-8797-5032213b9fcb",
|
592 |
"metadata": {
|
593 |
"execution": {
|
594 |
-
"iopub.execute_input": "2025-01-
|
595 |
-
"iopub.status.busy": "2025-01-
|
596 |
-
"iopub.status.idle": "2025-01-
|
597 |
-
"shell.execute_reply": "2025-01-
|
598 |
-
"shell.execute_reply.started": "2025-01-
|
599 |
}
|
600 |
},
|
601 |
"outputs": [
|
@@ -603,7 +621,7 @@
|
|
603 |
"name": "stdout",
|
604 |
"output_type": "stream",
|
605 |
"text": [
|
606 |
-
"2025-01-22
|
607 |
]
|
608 |
}
|
609 |
],
|
@@ -636,32 +654,39 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 14,
   "id": "befb94b5-88bf-40fc-8b26-cf373d1256e0",
   "metadata": {
    "execution": {
-    "iopub.execute_input": "2025-01-
-    "iopub.status.busy": "2025-01-
-    "iopub.status.idle": "2025-01-
-    "shell.execute_reply": "2025-01-
-    "shell.execute_reply.started": "2025-01-
+    "iopub.execute_input": "2025-01-22T18:38:14.157429Z",
+    "iopub.status.busy": "2025-01-22T18:38:14.157356Z",
+    "iopub.status.idle": "2025-01-22T18:38:53.948196Z",
+    "shell.execute_reply": "2025-01-22T18:38:53.947738Z",
+    "shell.execute_reply.started": "2025-01-22T18:38:14.157421Z"
    }
   },
   "outputs": [
    {
-    "
-
-
-
-
+    "data": {
+     "application/vnd.jupyter.widget-view+json": {
+      "model_id": "54e4f39d398f45ceb760107e5b57744a",
+      "version_major": 2,
+      "version_minor": 0
+     },
+     "text/plain": [
+      "model.safetensors: 0%| | 0.00/438M [00:00<?, ?B/s]"
+     ]
+    },
+    "metadata": {},
+    "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
-      "CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-
+      "CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/46ba6471d612d348636c07c47f57d90dd14c9f74', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='46ba6471d612d348636c07c47f57d90dd14c9f74', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)"
      ]
     },
-    "execution_count":
+    "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
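The upload cell itself sits outside the hunks shown here; its outputs above (a model.safetensors progress bar followed by a CommitInfo for README.md) are consistent with the usual transformers/huggingface_hub push calls. A minimal sketch, assuming the repo lives under the Nonnormalizable namespace as in the CommitInfo URL and that `model`, `tokenizer`, and `card` are the objects built in the cells above:

```python
# Sketch of the push step implied by the cell outputs above; the actual cell is not in this diff.
repo_id = f"Nonnormalizable/{model_and_repo_name}"  # "Nonnormalizable/frugal-ai-text-bert-base"

model.push_to_hub(repo_id)      # uploads model.safetensors (~438M for bert-base)
tokenizer.push_to_hub(repo_id)  # uploads the tokenizer files
card.push_to_hub(repo_id)       # uploads README.md; the CommitInfo shown above is the result of such an upload
```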
@@ -703,7 +728,17 @@
   "widgets": {
    "application/vnd.jupyter.widget-state+json": {
     "state": {
-      "
+      "2d2b267cd60649cdb6fcce93640ba8d6": {
+       "model_module": "@jupyter-widgets/controls",
+       "model_module_version": "2.0.0",
+       "model_name": "HTMLModel",
+       "state": {
+        "layout": "IPY_MODEL_b3c2c88f904a424c96704cc4b9514f98",
+        "style": "IPY_MODEL_337bc700fce14480a640a1ae545db5f5",
+        "value": "model.safetensors: 100%"
+       }
+      },
+      "337bc700fce14480a640a1ae545db5f5": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "HTMLStyleModel",
@@ -713,29 +748,15 @@
       "text_color": null
      }
     },
-      "
-       "model_module": "@jupyter-widgets/base",
-       "model_module_version": "2.0.0",
-       "model_name": "LayoutModel",
-       "state": {}
-      },
-      "3058e249f3a24b89a0946db9d46692cd": {
+      "40666b0d750d4caf8fbaeeef11eb58c1": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
-      "model_name": "
+      "model_name": "ProgressStyleModel",
       "state": {
-       "
-       "style": "IPY_MODEL_04362bf5ea1540e69a8ed37243e960fe",
-       "value": " 17.6M/17.6M [00:00<00:00, 30.6MB/s]"
+       "description_width": ""
      }
     },
-      "
-       "model_module": "@jupyter-widgets/base",
-       "model_module_version": "2.0.0",
-       "model_name": "LayoutModel",
-       "state": {}
-      },
-      "572a4d1b74044da7a90c58c311a87eff": {
+      "4d9ae3c7a72a4f4aa5974fb0649cb42c": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "HTMLStyleModel",
@@ -745,60 +766,64 @@
       "text_color": null
      }
     },
-      "
-       "model_module": "@jupyter-widgets/base",
-       "model_module_version": "2.0.0",
-       "model_name": "LayoutModel",
-       "state": {}
-      },
-      "8f403fa494c246c9af5ee00397ac6cf5": {
-       "model_module": "@jupyter-widgets/base",
-       "model_module_version": "2.0.0",
-       "model_name": "LayoutModel",
-       "state": {}
-      },
-      "916778013b8d48d9acddd42e8b874c22": {
+      "54e4f39d398f45ceb760107e5b57744a": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
-      "model_name": "
+      "model_name": "HBoxModel",
       "state": {
-       "
+       "children": [
+        "IPY_MODEL_2d2b267cd60649cdb6fcce93640ba8d6",
+        "IPY_MODEL_575f3681680a4cbeb1f95547a40bdc93",
+        "IPY_MODEL_91cbef62c3b84632949a24dbad475b10"
+       ],
+       "layout": "IPY_MODEL_f2feb8c3b4cc4ee29091b9aab78ff4aa"
      }
     },
-      "
+      "575f3681680a4cbeb1f95547a40bdc93": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "FloatProgressModel",
       "state": {
        "bar_style": "success",
-       "layout": "
-       "max":
-       "style": "
-       "value":
+       "layout": "IPY_MODEL_dcc805dd65774cd2b863c2c4bb8f3f1c",
+       "max": 437977072,
+       "style": "IPY_MODEL_40666b0d750d4caf8fbaeeef11eb58c1",
+       "value": 437977072
      }
     },
-      "
+      "91cbef62c3b84632949a24dbad475b10": {
       "model_module": "@jupyter-widgets/controls",
       "model_module_version": "2.0.0",
       "model_name": "HTMLModel",
       "state": {
-       "layout": "
-       "style": "
-       "value": "
+       "layout": "IPY_MODEL_fe68949bcf9b42508368dd03f6506d57",
+       "style": "IPY_MODEL_4d9ae3c7a72a4f4aa5974fb0649cb42c",
+       "value": " 438M/438M [00:36<00:00, 12.1MB/s]"
      }
     },
-      "
-       "model_module": "@jupyter-widgets/
+      "b3c2c88f904a424c96704cc4b9514f98": {
+       "model_module": "@jupyter-widgets/base",
       "model_module_version": "2.0.0",
-      "model_name": "
-      "state": {
-
-
-
-
-
-
-
+      "model_name": "LayoutModel",
+      "state": {}
+      },
+      "dcc805dd65774cd2b863c2c4bb8f3f1c": {
+       "model_module": "@jupyter-widgets/base",
+       "model_module_version": "2.0.0",
+       "model_name": "LayoutModel",
+       "state": {}
+      },
+      "f2feb8c3b4cc4ee29091b9aab78ff4aa": {
+       "model_module": "@jupyter-widgets/base",
+       "model_module_version": "2.0.0",
+       "model_name": "LayoutModel",
+       "state": {}
+      },
+      "fe68949bcf9b42508368dd03f6506d57": {
+       "model_module": "@jupyter-widgets/base",
+       "model_module_version": "2.0.0",
+       "model_name": "LayoutModel",
+       "state": {}
      }
     },
    "version_major": 2,