# Setup

In [1]:
from datetime import datetime
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from huggingface_hub import (
    PyTorchModelHubMixin,
    notebook_login,
    ModelCard,
    ModelCardData,
    EvalResult,
)
from datasets import DatasetDict, load_dataset
from torch.utils.data import Dataset, DataLoader

In [2]:
notebook_login(new_session=False)

# Functions

In [3]:
def my_print(x):
    time_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(time_str, x)


def model_metrics(model, dataloader):
    criterion = nn.CrossEntropyLoss()
    model.eval()
    with torch.no_grad():
        total_loss = 0
        total_correct = 0
        total_length = 0
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            predictions_cpu = torch.argmax(outputs, dim=1).cpu().numpy()
            labels_cpu = labels.cpu().numpy()
            correct_count = (predictions_cpu == labels_cpu).sum()

            total_loss += loss.item()
            total_correct += correct_count
            total_length += len(labels_cpu)
        avg_loss = total_loss / len(dataloader)
        avg_acc = total_correct / total_length
    model.train()
    return avg_loss, avg_acc


def print_model_status(epoch, num_epochs, model, train_dataloader, test_dataloader):
    train_loss, train_acc = model_metrics(model, train_dataloader)
    test_loss, test_acc = model_metrics(model, test_dataloader)
    loss_str = f"Loss: Train {train_loss:0.3f}, Test {test_loss:0.3f}"
    acc_str = f"Acc: Train {train_acc:0.3f}, Test {test_acc:0.3f}"
    my_print(f"Epoch {epoch+1}/{num_epochs} done. {loss_str}; and {acc_str}")


class BertClassifier(nn.Module, PyTorchModelHubMixin):
    def __init__(self, num_labels=8, bert_variety="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_variety)
        self.dropout = nn.Dropout(0.05)
        self.classifier = nn.Linear(self.bert.pooler.dense.out_features, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits


class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = torch.tensor([int(l[0]) for l in labels])

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self) -> int:
        return len(self.labels)


def train_model(model, train_dataloader, test_dataloader, device, num_epochs):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    model.train()

    print_model_status(-1, num_epochs, model, train_dataloader, test_dataloader)
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_dataloader)
        print_model_status(epoch, num_epochs, model, train_dataloader, test_dataloader)

In [4]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    torch.mps.empty_cache()
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
def run_training(
    max_dataset_size=16 * 200,
    bert_variety="bert-base-uncased",
    max_length=200,
    num_epochs=3,
    batch_size=32,
):
    hf_dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")
    test_size = 0.2
    test_seed = 42
    train_test = hf_dataset["train"].train_test_split(
        test_size=test_size, seed=test_seed
    )
    train_dataset = train_test["train"]
    test_dataset = train_test["test"]
    if not max_dataset_size == "full" and max_dataset_size < len(hf_dataset["train"]):
        train_dataset = train_dataset[:max_dataset_size]
        test_dataset = test_dataset[:max_dataset_size]
    else:
        train_dataset = train_dataset
        test_dataset = test_dataset

    tokenizer = BertTokenizer.from_pretrained(bert_variety, max_length=max_length)
    model = BertClassifier(bert_variety=bert_variety)
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        torch.mps.empty_cache()
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)

    text_dataset_train = TextDataset(
        train_dataset["quote"],
        train_dataset["label"],
        tokenizer=tokenizer,
        max_length=max_length,
    )
    text_dataset_test = TextDataset(
        test_dataset["quote"],
        test_dataset["label"],
        tokenizer=tokenizer,
        max_length=max_length,
    )
    dataloader_train = DataLoader(
        text_dataset_train, batch_size=batch_size, shuffle=True
    )
    dataloader_test = DataLoader(
        text_dataset_test, batch_size=batch_size, shuffle=False
    )

    train_model(model, dataloader_train, dataloader_test, device, num_epochs=num_epochs)
    return model, tokenizer

# Exploration

## Check if runs

In [6]:
model, tokenizer = run_training(
    max_dataset_size=16 * 2,
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=32,
)

2025-01-20 12:17:10 Epoch 0/3 done. Loss: Train 2.111, Test 2.247; and Acc: Train 0.281, Test 0.156
2025-01-20 12:17:11 Epoch 1/3 done. Loss: Train 2.026, Test 2.222; and Acc: Train 0.344, Test 0.156
2025-01-20 12:17:12 Epoch 2/3 done. Loss: Train 1.943, Test 2.194; and Acc: Train 0.312, Test 0.156
2025-01-20 12:17:14 Epoch 3/3 done. Loss: Train 1.859, Test 2.159; and Acc: Train 0.344, Test 0.156


In [7]:
model.eval()
test_text = [
    "This was a great experience!",  # 0_not_relevant
    "My favorite hike is Laguna de los Tres.",  # 0_not_relevant
    "Crops will grow great in Finland if it's warmer there.",  # 3_not_bad
    "Climate change is fake.",  # 1_not_happening
    "The apparent warming is caused by solar cycles.",  # 2_not_human
    "Solar panels emit bad vibes.",  # 4_solutions_harmful_unnecessary
    "All those so-called scientists are Democrats.",  # 6_proponents_biased
]
test_encoding = tokenizer(
    test_text,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

with torch.no_grad():
    test_input_ids = test_encoding["input_ids"].to(device)
    test_attention_mask = test_encoding["attention_mask"].to(device)
    outputs = model(test_input_ids, test_attention_mask)
    predictions = torch.argmax(outputs, dim=1)
    my_print(f"Predictions: {predictions}")

2025-01-20 12:17:14 Predictions: tensor([4, 1, 1, 1, 3, 1, 1], device='mps:0')


## Hyperparameters

In [8]:
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=32,
)

2025-01-20 12:18:02 Epoch 0/3 done. Loss: Train 2.106, Test 2.091; and Acc: Train 0.118, Test 0.135
2025-01-20 12:20:37 Epoch 1/3 done. Loss: Train 0.989, Test 1.114; and Acc: Train 0.647, Test 0.603
2025-01-20 12:23:12 Epoch 2/3 done. Loss: Train 0.584, Test 0.928; and Acc: Train 0.825, Test 0.669
2025-01-20 12:25:46 Epoch 3/3 done. Loss: Train 0.313, Test 0.950; and Acc: Train 0.913, Test 0.683


In [9]:
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=16,
)

2025-01-20 12:26:34 Epoch 0/3 done. Loss: Train 2.174, Test 2.168; and Acc: Train 0.096, Test 0.094
2025-01-20 12:29:21 Epoch 1/3 done. Loss: Train 0.878, Test 1.033; and Acc: Train 0.712, Test 0.653
2025-01-20 12:32:07 Epoch 2/3 done. Loss: Train 0.458, Test 0.906; and Acc: Train 0.869, Test 0.678
2025-01-20 12:34:54 Epoch 3/3 done. Loss: Train 0.218, Test 0.959; and Acc: Train 0.944, Test 0.695


In [10]:
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=256,
    num_epochs=3,
    batch_size=16,
)

2025-01-20 12:36:37 Epoch 0/3 done. Loss: Train 2.122, Test 2.127; and Acc: Train 0.122, Test 0.118
2025-01-20 12:42:26 Epoch 1/3 done. Loss: Train 0.779, Test 0.978; and Acc: Train 0.748, Test 0.652
2025-01-20 12:48:16 Epoch 2/3 done. Loss: Train 0.391, Test 0.884; and Acc: Train 0.897, Test 0.696
2025-01-20 12:54:07 Epoch 3/3 done. Loss: Train 0.154, Test 0.978; and Acc: Train 0.959, Test 0.705


# Model to upload

In [14]:
model_and_repo_name = "frugal-ai-text-bert-base"
card_data = ModelCardData(
    model_name=model_and_repo_name,
    base_model="google-bert/bert-base-uncased",
    license="apache-2.0",
    language=["en"],
    datasets=["QuotaClimat/frugalaichallenge-text-train"],
    tags=["model_hub_mixin", "pytorch_model_hub_mixin", "climate"],
    pipeline_tag="text-classification",
)
card = ModelCard.from_template(
    card_data,
    model_summary="Classify text into 8 categories of climate misinformation.",
    model_description="Fine trained BERT for classifying climate information as part of the Frugal AI Challenge, for submission to https://huggingface.co/frugal-ai-challenge and scoring on accuracy and efficiency. Trainied on only the non-evaluation 80% of the data, so it's (non-cheating) score will be lower.",
    developers="Andre Bach",
    funded_by="N/A",
    shared_by="Andre Bach",
    model_type="Text classification",
    repo=model_and_repo_name,
    training_regime=dict(
        max_dataset_size="full",
        bert_variety="bert-base-uncased",
        max_length=256,
        num_epochs=3,
        batch_size=16,
    ),
    testing_metrics=dict(
        loss_train=0.154, loss_test=0.978, acc_train=0.959, acc_test=0.705
    ),
)
# print(card_data.to_yaml())
print(card)

---
base_model: google-bert/bert-base-uncased
datasets:
- QuotaClimat/frugalaichallenge-text-train
language:
- en
license: apache-2.0
model_name: frugal-ai-text-bert-base
pipeline_tag: text-classification
tags:
- model_hub_mixin
- pytorch_model_hub_mixin
- climate
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

Classify text into 8 categories of climate misinformation.

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

Fine trained BERT for classifying climate information as part of the Frugal AI Challenge, for submission to https://huggingface.co/frugal-ai-challenge and scoring on accuracy and efficiency. Trainied on only the non-evaluation 80% of the data, so it's (non-cheating) score will be lower.

- **Developed by:** Andre Bach
- **Funded by [optional]:** N/A
- **Shared by [optional]:** Andre Bach
- **Model type:** Text classification
- **Language(s) (NLP):** ['en']
- **License:** apache-

In [17]:
model_final = model
tokenizer_final = tokenizer

In [18]:
model_final.eval()
test_text = [
    "This was a great experience!",  # 0_not_relevant
    "My favorite hike is Laguna de los Tres.",  # 0_not_relevant
    "Crops will grow great in Finland if it's warmer there.",  # 3_not_bad
    "Climate change is fake.",  # 1_not_happening
    "The apparent warming is caused by solar cycles.",  # 2_not_human
    "Solar panels emit bad vibes.",  # 4_solutions_harmful_unnecessary
    "All those so-called scientists are Democrats.",  # 6_proponents_biased
]
test_encoding = tokenizer_final(
    test_text,
    truncation=True,
    padding=True,
    return_tensors="pt",
)

with torch.no_grad():
    test_input_ids = test_encoding["input_ids"].to(device)
    test_attention_mask = test_encoding["attention_mask"].to(device)
    outputs = model_final(test_input_ids, test_attention_mask)
    predictions = torch.argmax(outputs, dim=1)
    my_print(f"Predictions: {predictions}")

2025-01-20 14:12:01 Predictions: tensor([0, 0, 3, 6, 2, 4, 6], device='mps:0')


In [19]:
model_final.push_to_hub("frugal-ai-text-bert-base")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/bdc2daf80d9647566ef56297f2cdc32f898170df', commit_message='Push model using huggingface_hub.', commit_description='', oid='bdc2daf80d9647566ef56297f2cdc32f898170df', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)

In [20]:
tokenizer_final.push_to_hub("frugal-ai-text-bert-base")

CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/9081285a20fa0d62c5c1580aa17884de2b3bc236', commit_message='Upload tokenizer', commit_description='', oid='9081285a20fa0d62c5c1580aa17884de2b3bc236', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)

In [21]:
card.push_to_hub("Nonnormalizable/frugal-ai-text-bert-base")

CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/b3078a95ea36d71c1d1bf0d153e069b83f74bddf', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='b3078a95ea36d71c1d1bf0d153e069b83f74bddf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)