Nonnormalizable committed on
Commit
80df7c4
·
1 Parent(s): ec64986

Trained all 5 sizes of bert.

Browse files
Files changed (1) hide show
  1. Finetune BERT.ipynb +189 -164
Finetune BERT.ipynb CHANGED
@@ -14,11 +14,11 @@
14
  "id": "73e72549-69f2-46b5-b0f5-655777139972",
15
  "metadata": {
16
  "execution": {
17
- "iopub.execute_input": "2025-01-22T14:28:40.399621Z",
18
- "iopub.status.busy": "2025-01-22T14:28:40.398151Z",
19
- "iopub.status.idle": "2025-01-22T14:28:43.463152Z",
20
- "shell.execute_reply": "2025-01-22T14:28:43.462919Z",
21
- "shell.execute_reply.started": "2025-01-22T14:28:40.399562Z"
22
  }
23
  },
24
  "outputs": [],
@@ -45,11 +45,11 @@
45
  "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
46
  "metadata": {
47
  "execution": {
48
- "iopub.execute_input": "2025-01-22T14:28:43.463941Z",
49
- "iopub.status.busy": "2025-01-22T14:28:43.463805Z",
50
- "iopub.status.idle": "2025-01-22T14:28:43.465644Z",
51
- "shell.execute_reply": "2025-01-22T14:28:43.465423Z",
52
- "shell.execute_reply.started": "2025-01-22T14:28:43.463933Z"
53
  }
54
  },
55
  "outputs": [],
@@ -71,11 +71,11 @@
71
  "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
72
  "metadata": {
73
  "execution": {
74
- "iopub.execute_input": "2025-01-22T14:28:44.578819Z",
75
- "iopub.status.busy": "2025-01-22T14:28:44.578158Z",
76
- "iopub.status.idle": "2025-01-22T14:28:44.594834Z",
77
- "shell.execute_reply": "2025-01-22T14:28:44.594043Z",
78
- "shell.execute_reply.started": "2025-01-22T14:28:44.578767Z"
79
  }
80
  },
81
  "outputs": [],
@@ -199,11 +199,11 @@
199
  "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
200
  "metadata": {
201
  "execution": {
202
- "iopub.execute_input": "2025-01-22T14:28:45.024103Z",
203
- "iopub.status.busy": "2025-01-22T14:28:45.023645Z",
204
- "iopub.status.idle": "2025-01-22T14:28:45.056500Z",
205
- "shell.execute_reply": "2025-01-22T14:28:45.056070Z",
206
- "shell.execute_reply.started": "2025-01-22T14:28:45.024069Z"
207
  }
208
  },
209
  "outputs": [],
@@ -223,11 +223,11 @@
223
  "id": "695bc080-bbd7-4937-af5b-50db1c936500",
224
  "metadata": {
225
  "execution": {
226
- "iopub.execute_input": "2025-01-22T14:28:45.268069Z",
227
- "iopub.status.busy": "2025-01-22T14:28:45.267170Z",
228
- "iopub.status.idle": "2025-01-22T14:28:45.279492Z",
229
- "shell.execute_reply": "2025-01-22T14:28:45.278723Z",
230
- "shell.execute_reply.started": "2025-01-22T14:28:45.268003Z"
231
  }
232
  },
233
  "outputs": [],
@@ -305,6 +305,25 @@
305
  "# Exploration"
306
  ]
307
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  {
309
  "cell_type": "markdown",
310
  "id": "a847135f-ce86-46a1-9c61-3459a847cb29",
@@ -323,15 +342,15 @@
323
  },
324
  {
325
  "cell_type": "code",
326
- "execution_count": 6,
327
  "id": "34a7c310-c486-4db1-b94d-4363c3d3df5b",
328
  "metadata": {
329
  "execution": {
330
- "iopub.execute_input": "2025-01-22T14:28:46.360995Z",
331
- "iopub.status.busy": "2025-01-22T14:28:46.360044Z",
332
- "iopub.status.idle": "2025-01-22T14:28:53.023176Z",
333
- "shell.execute_reply": "2025-01-22T14:28:53.022848Z",
334
- "shell.execute_reply.started": "2025-01-22T14:28:46.360953Z"
335
  }
336
  },
337
  "outputs": [
@@ -339,35 +358,35 @@
339
  "name": "stdout",
340
  "output_type": "stream",
341
  "text": [
342
- "2025-01-22 09:28:49 Epoch 0/3 done. Loss: Train 2.131, Test 2.135; and Acc: Train 0.118, Test 0.118\n",
343
- "2025-01-22 09:28:50 Epoch 1/3 done. Loss: Train 1.952, Test 1.978; and Acc: Train 0.281, Test 0.261\n",
344
- "2025-01-22 09:28:51 Epoch 2/3 done. Loss: Train 1.905, Test 1.943; and Acc: Train 0.304, Test 0.275\n",
345
- "2025-01-22 09:28:53 Epoch 3/3 done. Loss: Train 1.862, Test 1.904; and Acc: Train 0.321, Test 0.283\n"
346
  ]
347
  }
348
  ],
349
  "source": [
350
  "model, tokenizer, regime, metrics = run_training(\n",
351
  " max_dataset_size=16 * 100,\n",
352
- " bert_variety=\"google/bert_uncased_L-2_H-128_A-2\",\n",
353
  " max_length=128,\n",
354
  " num_epochs=3,\n",
355
- " batch_size=32,\n",
356
  ")"
357
  ]
358
  },
359
  {
360
  "cell_type": "code",
361
- "execution_count": 7,
362
  "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
363
  "metadata": {
364
  "editable": true,
365
  "execution": {
366
- "iopub.execute_input": "2025-01-22T14:28:55.671186Z",
367
- "iopub.status.busy": "2025-01-22T14:28:55.670403Z",
368
- "iopub.status.idle": "2025-01-22T14:28:55.789941Z",
369
- "shell.execute_reply": "2025-01-22T14:28:55.789679Z",
370
- "shell.execute_reply.started": "2025-01-22T14:28:55.671131Z"
371
  },
372
  "slideshow": {
373
  "slide_type": ""
@@ -379,7 +398,7 @@
379
  "name": "stdout",
380
  "output_type": "stream",
381
  "text": [
382
- "2025-01-22 09:28:55 Predictions: tensor([0, 0, 0, 0, 0, 0, 0], device='mps:0')\n"
383
  ]
384
  }
385
  ],
@@ -426,10 +445,10 @@
426
  "Overall top performance per model. Machine: bert-base is using an Nvidia 1xL40S, no inference time cleaverness attempted.\n",
427
  "\n",
428
  "[accidentally cheating bert-base by trainging on full dataset](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250117_220350.json):\\\n",
429
- "acc 0.954, energy 0.736 Wh, emissions 0.272 gco2eq\n",
430
  "\n",
431
  "[bert-base some hp tuning](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250120_231350.json):\\\n",
432
- "acc 0.707, energy 0.803 Wh, emissions 0.296 gco2eq\n",
433
  "\n",
434
  "bert-tiny, Nvidia 1xL40S:\n",
435
  "\n",
@@ -445,8 +464,20 @@
445
  "Scanning max_length and batch_size with num_epochs set to 3, looks like we want 256 and 16. That gets us\\\n",
446
  "`2025-01-21 10:18:56 Epoch 3/3 done. Loss: Train 1.368, Test 1.432; and Acc: Train 0.499, Test 0.477`.\n",
447
  "\n",
448
- "Then looking at num_epochs, we saturate test set performance at 15 (~3 min), giving e.g.\\\n",
449
- "`2025-01-21 10:38:30 Epoch 15/20 done. Loss: Train 0.553, Test 1.157; and Acc: Train 0.833, Test 0.595`"
 
 
 
 
 
 
 
 
 
 
 
 
450
  ]
451
  },
452
  {
@@ -455,18 +486,18 @@
455
  "id": "37794952-703c-466c-9d26-ee6cb2834246",
456
  "metadata": {
457
  "execution": {
458
- "iopub.execute_input": "2025-01-22T14:29:24.691783Z",
459
- "iopub.status.busy": "2025-01-22T14:29:24.691195Z",
460
- "iopub.status.idle": "2025-01-22T14:29:24.696800Z",
461
- "shell.execute_reply": "2025-01-22T14:29:24.695895Z",
462
- "shell.execute_reply.started": "2025-01-22T14:29:24.691745Z"
463
  }
464
  },
465
  "outputs": [],
466
  "source": [
467
  "static_hyperparams = dict(\n",
468
  " max_dataset_size=\"full\",\n",
469
- " bert_variety=\"google/bert_uncased_L-2_H-128_A-2\",\n",
470
  " max_length=256,\n",
471
  " batch_size=16,\n",
472
  ")"
@@ -478,11 +509,11 @@
478
  "id": "28354e8c-886a-4523-8968-8c688c13f6a3",
479
  "metadata": {
480
  "execution": {
481
- "iopub.execute_input": "2025-01-22T14:29:25.202258Z",
482
- "iopub.status.busy": "2025-01-22T14:29:25.201292Z",
483
- "iopub.status.idle": "2025-01-22T14:31:22.271954Z",
484
- "shell.execute_reply": "2025-01-22T14:31:22.271647Z",
485
- "shell.execute_reply.started": "2025-01-22T14:29:25.202215Z"
486
  }
487
  },
488
  "outputs": [
@@ -490,29 +521,17 @@
490
  "name": "stdout",
491
  "output_type": "stream",
492
  "text": [
493
- "2025-01-22 09:29:31 Epoch 0/15 done. Loss: Train 2.104, Test 2.111; and Acc: Train 0.114, Test 0.097\n",
494
- "2025-01-22 09:29:38 Epoch 1/15 done. Loss: Train 1.778, Test 1.814; and Acc: Train 0.353, Test 0.329\n",
495
- "2025-01-22 09:29:45 Epoch 2/15 done. Loss: Train 1.555, Test 1.605; and Acc: Train 0.443, Test 0.422\n",
496
- "2025-01-22 09:29:53 Epoch 3/15 done. Loss: Train 1.388, Test 1.451; and Acc: Train 0.519, Test 0.491\n",
497
- "2025-01-22 09:30:00 Epoch 4/15 done. Loss: Train 1.274, Test 1.362; and Acc: Train 0.555, Test 0.523\n",
498
- "2025-01-22 09:30:07 Epoch 5/15 done. Loss: Train 1.179, Test 1.300; and Acc: Train 0.588, Test 0.540\n",
499
- "2025-01-22 09:30:15 Epoch 6/15 done. Loss: Train 1.097, Test 1.259; and Acc: Train 0.632, Test 0.550\n",
500
- "2025-01-22 09:30:22 Epoch 7/15 done. Loss: Train 1.026, Test 1.225; and Acc: Train 0.659, Test 0.567\n",
501
- "2025-01-22 09:30:30 Epoch 8/15 done. Loss: Train 0.947, Test 1.196; and Acc: Train 0.683, Test 0.580\n",
502
- "2025-01-22 09:30:37 Epoch 9/15 done. Loss: Train 0.879, Test 1.176; and Acc: Train 0.717, Test 0.586\n",
503
- "2025-01-22 09:30:44 Epoch 10/15 done. Loss: Train 0.817, Test 1.155; and Acc: Train 0.735, Test 0.600\n",
504
- "2025-01-22 09:30:52 Epoch 11/15 done. Loss: Train 0.757, Test 1.148; and Acc: Train 0.763, Test 0.599\n",
505
- "2025-01-22 09:30:59 Epoch 12/15 done. Loss: Train 0.700, Test 1.139; and Acc: Train 0.786, Test 0.603\n",
506
- "2025-01-22 09:31:07 Epoch 13/15 done. Loss: Train 0.636, Test 1.137; and Acc: Train 0.806, Test 0.599\n",
507
- "2025-01-22 09:31:14 Epoch 14/15 done. Loss: Train 0.582, Test 1.128; and Acc: Train 0.823, Test 0.604\n",
508
- "2025-01-22 09:31:22 Epoch 15/15 done. Loss: Train 0.535, Test 1.134; and Acc: Train 0.837, Test 0.618\n"
509
  ]
510
  }
511
  ],
512
  "source": [
513
  "model, tokenizer, training_regime, testing_metrics = run_training(\n",
514
  " **static_hyperparams,\n",
515
- " num_epochs=15,\n",
516
  ")"
517
  ]
518
  },
@@ -526,21 +545,20 @@
526
  },
527
  {
528
  "cell_type": "code",
529
- "execution_count": 12,
530
  "id": "ec2516f9-79f2-4ae1-ab9a-9a51a7a50587",
531
  "metadata": {
532
  "execution": {
533
- "iopub.execute_input": "2025-01-22T14:31:42.946851Z",
534
- "iopub.status.busy": "2025-01-22T14:31:42.946191Z",
535
- "iopub.status.idle": "2025-01-22T14:31:42.970151Z",
536
- "shell.execute_reply": "2025-01-22T14:31:42.969731Z",
537
- "shell.execute_reply.started": "2025-01-22T14:31:42.946804Z"
538
  },
539
  "scrolled": true
540
  },
541
  "outputs": [],
542
  "source": [
543
- "model_and_repo_name = \"frugal-ai-text-bert-tiny\"\n",
544
  "card_data = ModelCardData(\n",
545
  " model_name=model_and_repo_name,\n",
546
  " base_model=static_hyperparams[\"bert_variety\"],\n",
@@ -552,7 +570,7 @@
552
  ")\n",
553
  "card = ModelCard.from_template(\n",
554
  " card_data,\n",
555
- " model_summary=\"Classify text into 8 categories of climate misinformation.\",\n",
556
  " model_description=\"Fine trained BERT for classifying climate information as part of the Frugal AI Challenge, for submission to https://huggingface.co/frugal-ai-challenge and scoring on accuracy and efficiency. Trainied on only the non-evaluation 80% of the data, so it's (non-cheating) score will be lower.\",\n",
557
  " developers=\"Andre Bach\",\n",
558
  " funded_by=\"N/A\",\n",
@@ -568,15 +586,15 @@
568
  },
569
  {
570
  "cell_type": "code",
571
- "execution_count": 13,
572
  "id": "29d3bbf9-ab2a-48e2-a550-e16da5025720",
573
  "metadata": {
574
  "execution": {
575
- "iopub.execute_input": "2025-01-22T14:31:44.266203Z",
576
- "iopub.status.busy": "2025-01-22T14:31:44.265638Z",
577
- "iopub.status.idle": "2025-01-22T14:31:44.271280Z",
578
- "shell.execute_reply": "2025-01-22T14:31:44.270441Z",
579
- "shell.execute_reply.started": "2025-01-22T14:31:44.266162Z"
580
  }
581
  },
582
  "outputs": [],
@@ -587,15 +605,15 @@
587
  },
588
  {
589
  "cell_type": "code",
590
- "execution_count": 14,
591
  "id": "e3b099c6-6b98-473b-8797-5032213b9fcb",
592
  "metadata": {
593
  "execution": {
594
- "iopub.execute_input": "2025-01-22T14:31:45.670794Z",
595
- "iopub.status.busy": "2025-01-22T14:31:45.670345Z",
596
- "iopub.status.idle": "2025-01-22T14:31:45.731173Z",
597
- "shell.execute_reply": "2025-01-22T14:31:45.730818Z",
598
- "shell.execute_reply.started": "2025-01-22T14:31:45.670769Z"
599
  }
600
  },
601
  "outputs": [
@@ -603,7 +621,7 @@
603
  "name": "stdout",
604
  "output_type": "stream",
605
  "text": [
606
- "2025-01-22 09:31:45 Predictions: tensor([0, 0, 3, 1, 2, 4, 6], device='mps:0')\n"
607
  ]
608
  }
609
  ],
@@ -636,32 +654,39 @@
636
  },
637
  {
638
  "cell_type": "code",
639
- "execution_count": 18,
640
  "id": "befb94b5-88bf-40fc-8b26-cf373d1256e0",
641
  "metadata": {
642
  "execution": {
643
- "iopub.execute_input": "2025-01-22T14:37:57.412327Z",
644
- "iopub.status.busy": "2025-01-22T14:37:57.411779Z",
645
- "iopub.status.idle": "2025-01-22T14:37:59.349630Z",
646
- "shell.execute_reply": "2025-01-22T14:37:59.348338Z",
647
- "shell.execute_reply.started": "2025-01-22T14:37:57.412288Z"
648
  }
649
  },
650
  "outputs": [
651
  {
652
- "name": "stderr",
653
- "output_type": "stream",
654
- "text": [
655
- "No files have been modified since last commit. Skipping to prevent empty commit.\n"
656
- ]
 
 
 
 
 
 
 
657
  },
658
  {
659
  "data": {
660
  "text/plain": [
661
- "CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-tiny/commit/69d445f90562fc738f12cfb37908fccef8925f5c', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='69d445f90562fc738f12cfb37908fccef8925f5c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-tiny', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-tiny'), pr_revision=None, pr_num=None)"
662
  ]
663
  },
664
- "execution_count": 18,
665
  "metadata": {},
666
  "output_type": "execute_result"
667
  }
@@ -703,7 +728,17 @@
703
  "widgets": {
704
  "application/vnd.jupyter.widget-state+json": {
705
  "state": {
706
- "04362bf5ea1540e69a8ed37243e960fe": {
 
 
 
 
 
 
 
 
 
 
707
  "model_module": "@jupyter-widgets/controls",
708
  "model_module_version": "2.0.0",
709
  "model_name": "HTMLStyleModel",
@@ -713,29 +748,15 @@
713
  "text_color": null
714
  }
715
  },
716
- "1a82e23ee0b44ec78b0bb2175f2e938a": {
717
- "model_module": "@jupyter-widgets/base",
718
- "model_module_version": "2.0.0",
719
- "model_name": "LayoutModel",
720
- "state": {}
721
- },
722
- "3058e249f3a24b89a0946db9d46692cd": {
723
  "model_module": "@jupyter-widgets/controls",
724
  "model_module_version": "2.0.0",
725
- "model_name": "HTMLModel",
726
  "state": {
727
- "layout": "IPY_MODEL_1a82e23ee0b44ec78b0bb2175f2e938a",
728
- "style": "IPY_MODEL_04362bf5ea1540e69a8ed37243e960fe",
729
- "value": " 17.6M/17.6M [00:00<00:00, 30.6MB/s]"
730
  }
731
  },
732
- "4bcb44aa9960417da7c3e374f5015413": {
733
- "model_module": "@jupyter-widgets/base",
734
- "model_module_version": "2.0.0",
735
- "model_name": "LayoutModel",
736
- "state": {}
737
- },
738
- "572a4d1b74044da7a90c58c311a87eff": {
739
  "model_module": "@jupyter-widgets/controls",
740
  "model_module_version": "2.0.0",
741
  "model_name": "HTMLStyleModel",
@@ -745,60 +766,64 @@
745
  "text_color": null
746
  }
747
  },
748
- "575da4c5a0b147989fc3444c95d5483b": {
749
- "model_module": "@jupyter-widgets/base",
750
- "model_module_version": "2.0.0",
751
- "model_name": "LayoutModel",
752
- "state": {}
753
- },
754
- "8f403fa494c246c9af5ee00397ac6cf5": {
755
- "model_module": "@jupyter-widgets/base",
756
- "model_module_version": "2.0.0",
757
- "model_name": "LayoutModel",
758
- "state": {}
759
- },
760
- "916778013b8d48d9acddd42e8b874c22": {
761
  "model_module": "@jupyter-widgets/controls",
762
  "model_module_version": "2.0.0",
763
- "model_name": "ProgressStyleModel",
764
  "state": {
765
- "description_width": ""
 
 
 
 
 
766
  }
767
  },
768
- "9c74511b86c240a9afb83e5dcd16b03b": {
769
  "model_module": "@jupyter-widgets/controls",
770
  "model_module_version": "2.0.0",
771
  "model_name": "FloatProgressModel",
772
  "state": {
773
  "bar_style": "success",
774
- "layout": "IPY_MODEL_575da4c5a0b147989fc3444c95d5483b",
775
- "max": 17552376,
776
- "style": "IPY_MODEL_916778013b8d48d9acddd42e8b874c22",
777
- "value": 17552376
778
  }
779
  },
780
- "9fb53962769d48e6a7ee640072ff1908": {
781
  "model_module": "@jupyter-widgets/controls",
782
  "model_module_version": "2.0.0",
783
  "model_name": "HTMLModel",
784
  "state": {
785
- "layout": "IPY_MODEL_4bcb44aa9960417da7c3e374f5015413",
786
- "style": "IPY_MODEL_572a4d1b74044da7a90c58c311a87eff",
787
- "value": "model.safetensors: 100%"
788
  }
789
  },
790
- "eb84b40edbab4e9d91fd6283b144492f": {
791
- "model_module": "@jupyter-widgets/controls",
792
  "model_module_version": "2.0.0",
793
- "model_name": "HBoxModel",
794
- "state": {
795
- "children": [
796
- "IPY_MODEL_9fb53962769d48e6a7ee640072ff1908",
797
- "IPY_MODEL_9c74511b86c240a9afb83e5dcd16b03b",
798
- "IPY_MODEL_3058e249f3a24b89a0946db9d46692cd"
799
- ],
800
- "layout": "IPY_MODEL_8f403fa494c246c9af5ee00397ac6cf5"
801
- }
 
 
 
 
 
 
 
 
 
 
 
802
  }
803
  },
804
  "version_major": 2,
 
14
  "id": "73e72549-69f2-46b5-b0f5-655777139972",
15
  "metadata": {
16
  "execution": {
17
+ "iopub.execute_input": "2025-01-22T18:16:12.117877Z",
18
+ "iopub.status.busy": "2025-01-22T18:16:12.117575Z",
19
+ "iopub.status.idle": "2025-01-22T18:16:15.083870Z",
20
+ "shell.execute_reply": "2025-01-22T18:16:15.083640Z",
21
+ "shell.execute_reply.started": "2025-01-22T18:16:12.117851Z"
22
  }
23
  },
24
  "outputs": [],
 
45
  "id": "07e0787e-c72b-41f3-baba-43cef3f8d6f8",
46
  "metadata": {
47
  "execution": {
48
+ "iopub.execute_input": "2025-01-22T18:16:15.084435Z",
49
+ "iopub.status.busy": "2025-01-22T18:16:15.084268Z",
50
+ "iopub.status.idle": "2025-01-22T18:16:15.086255Z",
51
+ "shell.execute_reply": "2025-01-22T18:16:15.086031Z",
52
+ "shell.execute_reply.started": "2025-01-22T18:16:15.084427Z"
53
  }
54
  },
55
  "outputs": [],
 
71
  "id": "d4b79fb9-5e70-4600-8885-94bc0a6e917c",
72
  "metadata": {
73
  "execution": {
74
+ "iopub.execute_input": "2025-01-22T18:16:15.086764Z",
75
+ "iopub.status.busy": "2025-01-22T18:16:15.086669Z",
76
+ "iopub.status.idle": "2025-01-22T18:16:15.091701Z",
77
+ "shell.execute_reply": "2025-01-22T18:16:15.091514Z",
78
+ "shell.execute_reply.started": "2025-01-22T18:16:15.086757Z"
79
  }
80
  },
81
  "outputs": [],
 
199
  "id": "07131bce-23ad-4787-8622-cce401f3e5ce",
200
  "metadata": {
201
  "execution": {
202
+ "iopub.execute_input": "2025-01-22T18:16:15.092028Z",
203
+ "iopub.status.busy": "2025-01-22T18:16:15.091969Z",
204
+ "iopub.status.idle": "2025-01-22T18:16:15.108312Z",
205
+ "shell.execute_reply": "2025-01-22T18:16:15.108075Z",
206
+ "shell.execute_reply.started": "2025-01-22T18:16:15.092021Z"
207
  }
208
  },
209
  "outputs": [],
 
223
  "id": "695bc080-bbd7-4937-af5b-50db1c936500",
224
  "metadata": {
225
  "execution": {
226
+ "iopub.execute_input": "2025-01-22T18:16:15.108777Z",
227
+ "iopub.status.busy": "2025-01-22T18:16:15.108669Z",
228
+ "iopub.status.idle": "2025-01-22T18:16:15.111839Z",
229
+ "shell.execute_reply": "2025-01-22T18:16:15.111545Z",
230
+ "shell.execute_reply.started": "2025-01-22T18:16:15.108767Z"
231
  }
232
  },
233
  "outputs": [],
 
305
  "# Exploration"
306
  ]
307
  },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": 6,
311
+ "id": "11890d3b-8bcb-4a9b-b421-5431081cca39",
312
+ "metadata": {
313
+ "execution": {
314
+ "iopub.execute_input": "2025-01-22T18:16:15.113676Z",
315
+ "iopub.status.busy": "2025-01-22T18:16:15.113576Z",
316
+ "iopub.status.idle": "2025-01-22T18:16:15.115080Z",
317
+ "shell.execute_reply": "2025-01-22T18:16:15.114867Z",
318
+ "shell.execute_reply.started": "2025-01-22T18:16:15.113668Z"
319
+ }
320
+ },
321
+ "outputs": [],
322
+ "source": [
323
+ "base_model_repo = \"google/bert_uncased_L-12_H-768_A-12\"\n",
324
+ "model_and_repo_name = \"frugal-ai-text-bert-base\""
325
+ ]
326
+ },
327
  {
328
  "cell_type": "markdown",
329
  "id": "a847135f-ce86-46a1-9c61-3459a847cb29",
 
342
  },
343
  {
344
  "cell_type": "code",
345
+ "execution_count": 7,
346
  "id": "34a7c310-c486-4db1-b94d-4363c3d3df5b",
347
  "metadata": {
348
  "execution": {
349
+ "iopub.execute_input": "2025-01-22T18:16:15.115472Z",
350
+ "iopub.status.busy": "2025-01-22T18:16:15.115400Z",
351
+ "iopub.status.idle": "2025-01-22T18:19:33.994125Z",
352
+ "shell.execute_reply": "2025-01-22T18:19:33.993854Z",
353
+ "shell.execute_reply.started": "2025-01-22T18:16:15.115464Z"
354
  }
355
  },
356
  "outputs": [
 
358
  "name": "stdout",
359
  "output_type": "stream",
360
  "text": [
361
+ "2025-01-22 13:16:38 Epoch 0/3 done. Loss: Train 2.066, Test 2.091; and Acc: Train 0.185, Test 0.157\n",
362
+ "2025-01-22 13:17:36 Epoch 1/3 done. Loss: Train 1.089, Test 1.279; and Acc: Train 0.627, Test 0.555\n",
363
+ "2025-01-22 13:18:35 Epoch 2/3 done. Loss: Train 0.624, Test 1.044; and Acc: Train 0.839, Test 0.642\n",
364
+ "2025-01-22 13:19:33 Epoch 3/3 done. Loss: Train 0.294, Test 1.047; and Acc: Train 0.928, Test 0.648\n"
365
  ]
366
  }
367
  ],
368
  "source": [
369
  "model, tokenizer, regime, metrics = run_training(\n",
370
  " max_dataset_size=16 * 100,\n",
371
+ " bert_variety=base_model_repo,\n",
372
  " max_length=128,\n",
373
  " num_epochs=3,\n",
374
+ " batch_size=16,\n",
375
  ")"
376
  ]
377
  },
378
  {
379
  "cell_type": "code",
380
+ "execution_count": 8,
381
  "id": "0aedfcca-843e-4f4c-8062-3e4625161bcc",
382
  "metadata": {
383
  "editable": true,
384
  "execution": {
385
+ "iopub.execute_input": "2025-01-22T18:19:33.994637Z",
386
+ "iopub.status.busy": "2025-01-22T18:19:33.994547Z",
387
+ "iopub.status.idle": "2025-01-22T18:19:34.064925Z",
388
+ "shell.execute_reply": "2025-01-22T18:19:34.064678Z",
389
+ "shell.execute_reply.started": "2025-01-22T18:19:33.994628Z"
390
  },
391
  "slideshow": {
392
  "slide_type": ""
 
398
  "name": "stdout",
399
  "output_type": "stream",
400
  "text": [
401
+ "2025-01-22 13:19:34 Predictions: tensor([0, 0, 3, 6, 2, 4, 6], device='mps:0')\n"
402
  ]
403
  }
404
  ],
 
445
  "Overall top performance per model. Machine: bert-base is using an Nvidia 1xL40S, no inference time cleaverness attempted.\n",
446
  "\n",
447
  "[accidentally cheating bert-base by trainging on full dataset](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250117_220350.json):\\\n",
448
+ "acc 0.954, energy 0.736 Wh\n",
449
  "\n",
450
  "[bert-base some hp tuning](https://huggingface.co/datasets/frugal-ai-challenge/public-leaderboard-text/blob/main/submissions/Nonnormalizable_20250120_231350.json):\\\n",
451
+ "acc 0.707, energy 0.803 Wh\n",
452
  "\n",
453
  "bert-tiny, Nvidia 1xL40S:\n",
454
  "\n",
 
464
  "Scanning max_length and batch_size with num_epochs set to 3, looks like we want 256 and 16. That gets us\\\n",
465
  "`2025-01-21 10:18:56 Epoch 3/3 done. Loss: Train 1.368, Test 1.432; and Acc: Train 0.499, Test 0.477`.\n",
466
  "\n",
467
+ "Then looking at num_epochs, we saturate test set performance at 15 (~3 minutes), giving e.g.\\\n",
468
+ "`2025-01-21 10:38:30 Epoch 15/20 done. Loss: Train 0.553, Test 1.157; and Acc: Train 0.833, Test 0.595`\n",
469
+ "\n",
470
+ "For bert-mini, just looking at num_epochs, we choose 8\\\n",
471
+ "`2025-01-22 10:56:12 Epoch 8/20 done. Loss: Train 0.305, Test 1.090; and Acc: Train 0.920, Test 0.646`\n",
472
+ "\n",
473
+ "For bert-small, 4\\\n",
474
+ "`2025-01-22 11:39:41 Epoch 4/15 done. Loss: Train 0.301, Test 0.978; and Acc: Train 0.920, Test 0.664`\n",
475
+ "\n",
476
+ "For bert-medium, 4\\\n",
477
+ "`2025-01-22 12:09:51 Epoch 4/10 done. Loss: Train 0.294, Test 1.020; and Acc: Train 0.922, Test 0.660`\n",
478
+ "\n",
479
+ "For bert-base, 3 does happen to be correct, just checking for completeness\\\n",
480
+ "`2025-01-22 12:59:10 Epoch 3/7 done. Loss: Train 0.156, Test 0.930; and Acc: Train 0.964, Test 0.703`"
481
  ]
482
  },
483
  {
 
486
  "id": "37794952-703c-466c-9d26-ee6cb2834246",
487
  "metadata": {
488
  "execution": {
489
+ "iopub.execute_input": "2025-01-22T18:19:34.065427Z",
490
+ "iopub.status.busy": "2025-01-22T18:19:34.065327Z",
491
+ "iopub.status.idle": "2025-01-22T18:19:34.066925Z",
492
+ "shell.execute_reply": "2025-01-22T18:19:34.066714Z",
493
+ "shell.execute_reply.started": "2025-01-22T18:19:34.065418Z"
494
  }
495
  },
496
  "outputs": [],
497
  "source": [
498
  "static_hyperparams = dict(\n",
499
  " max_dataset_size=\"full\",\n",
500
+ " bert_variety=base_model_repo,\n",
501
  " max_length=256,\n",
502
  " batch_size=16,\n",
503
  ")"
 
509
  "id": "28354e8c-886a-4523-8968-8c688c13f6a3",
510
  "metadata": {
511
  "execution": {
512
+ "iopub.execute_input": "2025-01-22T18:19:34.067286Z",
513
+ "iopub.status.busy": "2025-01-22T18:19:34.067206Z",
514
+ "iopub.status.idle": "2025-01-22T18:38:14.108104Z",
515
+ "shell.execute_reply": "2025-01-22T18:38:14.107193Z",
516
+ "shell.execute_reply.started": "2025-01-22T18:19:34.067278Z"
517
  }
518
  },
519
  "outputs": [
 
521
  "name": "stdout",
522
  "output_type": "stream",
523
  "text": [
524
+ "2025-01-22 13:21:10 Epoch 0/3 done. Loss: Train 2.088, Test 2.085; and Acc: Train 0.137, Test 0.135\n",
525
+ "2025-01-22 13:26:50 Epoch 1/3 done. Loss: Train 0.780, Test 1.012; and Acc: Train 0.747, Test 0.648\n",
526
+ "2025-01-22 13:32:30 Epoch 2/3 done. Loss: Train 0.346, Test 0.890; and Acc: Train 0.904, Test 0.689\n",
527
+ "2025-01-22 13:38:14 Epoch 3/3 done. Loss: Train 0.167, Test 0.968; and Acc: Train 0.959, Test 0.691\n"
 
 
 
 
 
 
 
 
 
 
 
 
528
  ]
529
  }
530
  ],
531
  "source": [
532
  "model, tokenizer, training_regime, testing_metrics = run_training(\n",
533
  " **static_hyperparams,\n",
534
+ " num_epochs=3,\n",
535
  ")"
536
  ]
537
  },
 
545
  },
546
  {
547
  "cell_type": "code",
548
+ "execution_count": 11,
549
  "id": "ec2516f9-79f2-4ae1-ab9a-9a51a7a50587",
550
  "metadata": {
551
  "execution": {
552
+ "iopub.execute_input": "2025-01-22T18:38:14.109094Z",
553
+ "iopub.status.busy": "2025-01-22T18:38:14.108996Z",
554
+ "iopub.status.idle": "2025-01-22T18:38:14.124982Z",
555
+ "shell.execute_reply": "2025-01-22T18:38:14.124768Z",
556
+ "shell.execute_reply.started": "2025-01-22T18:38:14.109081Z"
557
  },
558
  "scrolled": true
559
  },
560
  "outputs": [],
561
  "source": [
 
562
  "card_data = ModelCardData(\n",
563
  " model_name=model_and_repo_name,\n",
564
  " base_model=static_hyperparams[\"bert_variety\"],\n",
 
570
  ")\n",
571
  "card = ModelCard.from_template(\n",
572
  " card_data,\n",
573
+ " model_summary=f\"Classify text into 8 categories of climate misinformation using {base_model_repo}.\",\n",
574
  " model_description=\"Fine trained BERT for classifying climate information as part of the Frugal AI Challenge, for submission to https://huggingface.co/frugal-ai-challenge and scoring on accuracy and efficiency. Trainied on only the non-evaluation 80% of the data, so it's (non-cheating) score will be lower.\",\n",
575
  " developers=\"Andre Bach\",\n",
576
  " funded_by=\"N/A\",\n",
 
586
  },
587
  {
588
  "cell_type": "code",
589
+ "execution_count": 12,
590
  "id": "29d3bbf9-ab2a-48e2-a550-e16da5025720",
591
  "metadata": {
592
  "execution": {
593
+ "iopub.execute_input": "2025-01-22T18:38:14.125523Z",
594
+ "iopub.status.busy": "2025-01-22T18:38:14.125395Z",
595
+ "iopub.status.idle": "2025-01-22T18:38:14.126978Z",
596
+ "shell.execute_reply": "2025-01-22T18:38:14.126771Z",
597
+ "shell.execute_reply.started": "2025-01-22T18:38:14.125514Z"
598
  }
599
  },
600
  "outputs": [],
 
605
  },
606
  {
607
  "cell_type": "code",
608
+ "execution_count": 13,
609
  "id": "e3b099c6-6b98-473b-8797-5032213b9fcb",
610
  "metadata": {
611
  "execution": {
612
+ "iopub.execute_input": "2025-01-22T18:38:14.127531Z",
613
+ "iopub.status.busy": "2025-01-22T18:38:14.127415Z",
614
+ "iopub.status.idle": "2025-01-22T18:38:14.157055Z",
615
+ "shell.execute_reply": "2025-01-22T18:38:14.156821Z",
616
+ "shell.execute_reply.started": "2025-01-22T18:38:14.127524Z"
617
  }
618
  },
619
  "outputs": [
 
621
  "name": "stdout",
622
  "output_type": "stream",
623
  "text": [
624
+ "2025-01-22 13:38:14 Predictions: tensor([0, 0, 3, 1, 2, 4, 6], device='mps:0')\n"
625
  ]
626
  }
627
  ],
 
654
  },
655
  {
656
  "cell_type": "code",
657
+ "execution_count": 14,
658
  "id": "befb94b5-88bf-40fc-8b26-cf373d1256e0",
659
  "metadata": {
660
  "execution": {
661
+ "iopub.execute_input": "2025-01-22T18:38:14.157429Z",
662
+ "iopub.status.busy": "2025-01-22T18:38:14.157356Z",
663
+ "iopub.status.idle": "2025-01-22T18:38:53.948196Z",
664
+ "shell.execute_reply": "2025-01-22T18:38:53.947738Z",
665
+ "shell.execute_reply.started": "2025-01-22T18:38:14.157421Z"
666
  }
667
  },
668
  "outputs": [
669
  {
670
+ "data": {
671
+ "application/vnd.jupyter.widget-view+json": {
672
+ "model_id": "54e4f39d398f45ceb760107e5b57744a",
673
+ "version_major": 2,
674
+ "version_minor": 0
675
+ },
676
+ "text/plain": [
677
+ "model.safetensors: 0%| | 0.00/438M [00:00<?, ?B/s]"
678
+ ]
679
+ },
680
+ "metadata": {},
681
+ "output_type": "display_data"
682
  },
683
  {
684
  "data": {
685
  "text/plain": [
686
+ "CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/46ba6471d612d348636c07c47f57d90dd14c9f74', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='46ba6471d612d348636c07c47f57d90dd14c9f74', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)"
687
  ]
688
  },
689
+ "execution_count": 14,
690
  "metadata": {},
691
  "output_type": "execute_result"
692
  }
 
728
  "widgets": {
729
  "application/vnd.jupyter.widget-state+json": {
730
  "state": {
731
+ "2d2b267cd60649cdb6fcce93640ba8d6": {
732
+ "model_module": "@jupyter-widgets/controls",
733
+ "model_module_version": "2.0.0",
734
+ "model_name": "HTMLModel",
735
+ "state": {
736
+ "layout": "IPY_MODEL_b3c2c88f904a424c96704cc4b9514f98",
737
+ "style": "IPY_MODEL_337bc700fce14480a640a1ae545db5f5",
738
+ "value": "model.safetensors: 100%"
739
+ }
740
+ },
741
+ "337bc700fce14480a640a1ae545db5f5": {
742
  "model_module": "@jupyter-widgets/controls",
743
  "model_module_version": "2.0.0",
744
  "model_name": "HTMLStyleModel",
 
748
  "text_color": null
749
  }
750
  },
751
+ "40666b0d750d4caf8fbaeeef11eb58c1": {
 
 
 
 
 
 
752
  "model_module": "@jupyter-widgets/controls",
753
  "model_module_version": "2.0.0",
754
+ "model_name": "ProgressStyleModel",
755
  "state": {
756
+ "description_width": ""
 
 
757
  }
758
  },
759
+ "4d9ae3c7a72a4f4aa5974fb0649cb42c": {
 
 
 
 
 
 
760
  "model_module": "@jupyter-widgets/controls",
761
  "model_module_version": "2.0.0",
762
  "model_name": "HTMLStyleModel",
 
766
  "text_color": null
767
  }
768
  },
769
+ "54e4f39d398f45ceb760107e5b57744a": {
 
 
 
 
 
 
 
 
 
 
 
 
770
  "model_module": "@jupyter-widgets/controls",
771
  "model_module_version": "2.0.0",
772
+ "model_name": "HBoxModel",
773
  "state": {
774
+ "children": [
775
+ "IPY_MODEL_2d2b267cd60649cdb6fcce93640ba8d6",
776
+ "IPY_MODEL_575f3681680a4cbeb1f95547a40bdc93",
777
+ "IPY_MODEL_91cbef62c3b84632949a24dbad475b10"
778
+ ],
779
+ "layout": "IPY_MODEL_f2feb8c3b4cc4ee29091b9aab78ff4aa"
780
  }
781
  },
782
+ "575f3681680a4cbeb1f95547a40bdc93": {
783
  "model_module": "@jupyter-widgets/controls",
784
  "model_module_version": "2.0.0",
785
  "model_name": "FloatProgressModel",
786
  "state": {
787
  "bar_style": "success",
788
+ "layout": "IPY_MODEL_dcc805dd65774cd2b863c2c4bb8f3f1c",
789
+ "max": 437977072,
790
+ "style": "IPY_MODEL_40666b0d750d4caf8fbaeeef11eb58c1",
791
+ "value": 437977072
792
  }
793
  },
794
+ "91cbef62c3b84632949a24dbad475b10": {
795
  "model_module": "@jupyter-widgets/controls",
796
  "model_module_version": "2.0.0",
797
  "model_name": "HTMLModel",
798
  "state": {
799
+ "layout": "IPY_MODEL_fe68949bcf9b42508368dd03f6506d57",
800
+ "style": "IPY_MODEL_4d9ae3c7a72a4f4aa5974fb0649cb42c",
801
+ "value": " 438M/438M [00:36&lt;00:00,12.1MB/s]"
802
  }
803
  },
804
+ "b3c2c88f904a424c96704cc4b9514f98": {
805
+ "model_module": "@jupyter-widgets/base",
806
  "model_module_version": "2.0.0",
807
+ "model_name": "LayoutModel",
808
+ "state": {}
809
+ },
810
+ "dcc805dd65774cd2b863c2c4bb8f3f1c": {
811
+ "model_module": "@jupyter-widgets/base",
812
+ "model_module_version": "2.0.0",
813
+ "model_name": "LayoutModel",
814
+ "state": {}
815
+ },
816
+ "f2feb8c3b4cc4ee29091b9aab78ff4aa": {
817
+ "model_module": "@jupyter-widgets/base",
818
+ "model_module_version": "2.0.0",
819
+ "model_name": "LayoutModel",
820
+ "state": {}
821
+ },
822
+ "fe68949bcf9b42508368dd03f6506d57": {
823
+ "model_module": "@jupyter-widgets/base",
824
+ "model_module_version": "2.0.0",
825
+ "model_name": "LayoutModel",
826
+ "state": {}
827
  }
828
  },
829
  "version_major": 2,