In [1]:
from datetime import datetime
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from huggingface_hub import PyTorchModelHubMixin, notebook_login
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

In [2]:
# Authenticate this kernel with the Hugging Face Hub; the push_to_hub cells at
# the end of the notebook upload to a Hub repo.
# NOTE(review): new_session=False presumably reuses an existing login instead
# of prompting again -- confirm against the huggingface_hub documentation.
notebook_login(new_session=False)

In [11]:
def my_print(x):
    """Print ``x`` prefixed with the current local time.

    The prefix has the form ``YYYY-MM-DD HH:MM:SS`` and is separated from
    ``x`` by a single space (the default ``print`` separator).
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(stamp, x)


class BertClassifier(nn.Module, PyTorchModelHubMixin):
    """Pretrained BERT encoder with a dropout + linear classification head.

    Args:
        num_labels: Number of output classes (size of the logits vector).
        bert_variety: Name or path handed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_labels=8, bert_variety="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_variety)
        self.dropout = nn.Dropout(p=0.05)
        # The head consumes the pooler output, so size it from the pooler's
        # dense layer rather than hard-coding a hidden width.
        pooled_width = self.bert.pooler.dense.out_features
        self.classifier = nn.Linear(pooled_width, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token ids.

        Args:
            input_ids: Token id tensor forwarded to the BERT encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the linear head applied to the (dropout-regularized)
            pooled encoder output.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.classifier(regularized)


class TextDataset(Dataset):
    """In-memory text classification dataset, tokenized once at construction.

    Every item is a dict holding one example's slice of each tokenizer output
    field plus a ``"labels"`` entry with the integer class id.

    Args:
        texts: Sequence of input strings, passed as-is to ``tokenizer``.
        labels: Sequence of labels, one per text. Each label is either an
            integer or a string that starts with the integer class id
            (e.g. ``"3_not_bad"``); multi-digit ids such as ``"10_x"`` are
            parsed in full.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=..., return_tensors="pt")`` whose result
            exposes ``.items()`` yielding indexable per-field tensors.
        max_length: Maximum sequence length forwarded to the tokenizer.

    Raises:
        ValueError: If a label does not start with a decimal digit.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = torch.tensor([self._label_to_index(label) for label in labels])

    @staticmethod
    def _label_to_index(label):
        """Return the leading integer class id of ``label``."""
        text = str(label)
        # Keep the whole run of leading digits, not just the first character,
        # so that ids >= 10 are not silently collapsed onto a single digit.
        digits = text[: len(text) - len(text.lstrip("0123456789"))]
        if not digits:
            raise ValueError(
                f"Label {label!r} does not start with an integer class id."
            )
        return int(digits)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.labels)


def train_model(model, train_dataloader, device, num_epochs):
    """Fine-tune ``model`` in place with AdamW (lr=2e-5) and cross-entropy loss.

    Progress is reported through ``my_print``: one line before training starts
    and one line per epoch with the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns logits accepted by ``nn.CrossEntropyLoss``.
        train_dataloader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
            Must support ``len()``.
        device: Device the batch tensors are moved to before the forward pass.
        num_epochs: Number of full passes over ``train_dataloader``.

    Returns:
        None. The model is switched to training mode and is not switched back.
    """
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model.train()

    my_print("Starting epoch 1.")
    batch_fields = ("input_ids", "attention_mask", "labels")
    for epoch in range(num_epochs):
        running_loss = 0
        for batch in train_dataloader:
            token_ids, mask, targets = (batch[name].to(device) for name in batch_fields)

            optimizer.zero_grad()
            loss = loss_fn(model(token_ids, mask), targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        # Mean of the per-batch losses, not a per-example mean.
        avg_loss = running_loss / len(train_dataloader)
        my_print(f"Epoch {epoch+1}/{num_epochs} done, Average Loss: {avg_loss:0.4f}")

In [4]:
# Notebook-level compute device: prefer Apple MPS, then CUDA, else fall back to CPU.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
    device = torch.device("mps")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
def run_training(
    max_dataset_size=16 * 200,
    bert_variety="bert-base-uncased",
    max_length=200,
    num_epochs=3,
    batch_size=32,
):
    """Load the training data, build a fresh classifier and fine-tune it.

    Args:
        max_dataset_size: ``"full"`` to train on the whole train split, or an
            integer cap; when the cap is smaller than the split, only the
            first ``max_dataset_size`` rows are used.
        bert_variety: Pretrained checkpoint name used for both the tokenizer
            and the ``BertClassifier`` encoder.
        max_length: Maximum token length used when tokenizing the quotes.
        num_epochs: Number of training epochs.
        batch_size: Batch size of the (shuffled) training ``DataLoader``.

    Returns:
        Tuple ``(model, tokenizer)``. The model has been moved to the selected
        device and is left in training mode by ``train_model``.
    """
    hf_dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")
    train_split = hf_dataset["train"]
    use_subset = max_dataset_size != "full" and max_dataset_size < len(train_split)
    train_dataset = train_split[:max_dataset_size] if use_subset else train_split

    # NOTE(review): `max_length` is forwarded to from_pretrained here; the
    # length limit that is visibly applied is the one given to TextDataset
    # below -- confirm whether this keyword has any effect on the tokenizer.
    tokenizer = BertTokenizer.from_pretrained(bert_variety, max_length=max_length)
    model = BertClassifier(bert_variety=bert_variety)

    # Same preference order as the notebook-level device cell: MPS, CUDA, CPU.
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
        device = torch.device("mps")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    quotes, raw_labels = train_dataset["quote"], train_dataset["label"]
    dataset = TextDataset(quotes, raw_labels, tokenizer=tokenizer, max_length=max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    train_model(model, dataloader, device, num_epochs=num_epochs)
    return model, tokenizer

In [19]:
# Quick experiment on a subset: max_dataset_size=16 * 100 caps training at the
# first 1600 rows of the train split (when the split is larger than that).
model, tokenizer = run_training(
    max_dataset_size=16 * 100,
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=32,
)

2025-01-17 07:22:44 Starting epoch 1.
2025-01-17 07:23:21 Epoch 1/3 done, Average Loss: 1.8129
2025-01-17 07:23:58 Epoch 2/3 done, Average Loss: 1.3089
2025-01-17 07:24:35 Epoch 3/3 done, Average Loss: 0.8916


In [21]:
# Spot-check the freshly trained `model` on hand-written sentences; the
# trailing comment on each sentence is the label the author expects.
model.eval()
test_text = [
    "This was a great experience!",  # 0_not_relevant
    "My favorite hike is Laguna de los Tres.",  # 0_not_relevant
    "Crops will grow great in Finland if it's warmer there.",  # 3_not_bad
    "Climate change is fake.",  # 1_not_happening
    "The apparent warming is caused by solar cycles.",  # 2_not_human
    "Solar panels emit bad vibes.",  # 4_solutions_harmful_unnecessary
    "All those so-called scientists are Democrats.",  # 6_proponents_biased
]
test_encoding = tokenizer(test_text, truncation=True, padding=True, return_tensors="pt")

# `device` is the notebook-level device; run_training picks its device with
# the same rule, so the inputs end up where the model's parameters are.
test_input_ids = test_encoding["input_ids"].to(device)
test_attention_mask = test_encoding["attention_mask"].to(device)

# Only the forward pass needs to run without gradient tracking.
with torch.no_grad():
    outputs = model(test_input_ids, test_attention_mask)

predictions = outputs.argmax(dim=1)
my_print(f"Predictions: {predictions}")

2025-01-17 07:24:47 Predictions: tensor([0, 1, 3, 6, 2, 3, 6], device='mps:0')


In [7]:
# Experiment: full train split, max_length=64, batch_size=32.
# Rebinding `model`/`tokenizer` replaces the objects from any previous run.
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=64,
    num_epochs=3,
    batch_size=32,
)

2025-01-16 20:47:23 Starting epoch 1.
2025-01-16 20:48:35 Epoch 1/3 done, Average Loss: 1.4272
2025-01-16 20:49:46 Epoch 2/3 done, Average Loss: 0.8694
2025-01-16 20:50:59 Epoch 3/3 done, Average Loss: 0.5774


In [8]:
# Experiment: full train split, max_length=128, batch_size=32.
# Rebinding `model`/`tokenizer` replaces the objects from any previous run.
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=32,
)

2025-01-16 20:51:04 Starting epoch 1.
2025-01-16 20:53:20 Epoch 1/3 done, Average Loss: 1.4107
2025-01-16 20:55:41 Epoch 2/3 done, Average Loss: 0.8491
2025-01-16 20:58:02 Epoch 3/3 done, Average Loss: 0.5359


In [9]:
# Experiment: full train split, max_length=128, batch_size=16.
# Rebinding `model`/`tokenizer` replaces the objects from any previous run.
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=16,
)

2025-01-16 20:58:08 Starting epoch 1.
2025-01-16 21:00:38 Epoch 1/3 done, Average Loss: 1.2946
2025-01-16 21:03:07 Epoch 2/3 done, Average Loss: 0.7425
2025-01-16 21:05:36 Epoch 3/3 done, Average Loss: 0.4126


In [12]:
# Experiment: full train split, max_length=256, batch_size=16.
# Rebinding `model`/`tokenizer` replaces the objects from any previous run.
model, tokenizer = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=256,
    num_epochs=3,
    batch_size=16,
)

2025-01-17 10:35:20 Starting epoch 1.
2025-01-17 10:40:29 Epoch 1/3 done, Average Loss: 1.2876
2025-01-17 10:45:37 Epoch 2/3 done, Average Loss: 0.7289
2025-01-17 10:50:43 Epoch 3/3 done, Average Loss: 0.3990


# Model to upload

In [6]:
# Train the model that the cells below evaluate and push to the Hub:
# full train split, max_length=128, batch_size=16. The result is bound to
# separate names so that it is not overwritten by the experiment cells.
model_final, tokenizer_final = run_training(
    max_dataset_size="full",
    bert_variety="bert-base-uncased",
    max_length=128,
    num_epochs=3,
    batch_size=16,
)

2025-01-17 10:19:17 Starting epoch 1.
2025-01-17 10:21:47 Epoch 1/3 done, Average Loss: 1.2608
2025-01-17 10:24:16 Epoch 2/3 done, Average Loss: 0.7134
2025-01-17 10:26:45 Epoch 3/3 done, Average Loss: 0.3931


In [7]:
# Spot-check `model_final` on hand-written sentences before uploading; the
# trailing comment on each sentence is the label the author expects.
model_final.eval()
test_text = [
    "This was a great experience!",  # 0_not_relevant
    "My favorite hike is Laguna de los Tres.",  # 0_not_relevant
    "Crops will grow great in Finland if it's warmer there.",  # 3_not_bad
    "Climate change is fake.",  # 1_not_happening
    "The apparent warming is caused by solar cycles.",  # 2_not_human
    "Solar panels emit bad vibes.",  # 4_solutions_harmful_unnecessary
    "All those so-called scientists are Democrats.",  # 6_proponents_biased
]
test_encoding = tokenizer_final(test_text, truncation=True, padding=True, return_tensors="pt")

# `device` is the notebook-level device; run_training picks its device with
# the same rule, so the inputs end up where the model's parameters are.
test_input_ids = test_encoding["input_ids"].to(device)
test_attention_mask = test_encoding["attention_mask"].to(device)

# Only the forward pass needs to run without gradient tracking.
with torch.no_grad():
    outputs = model_final(test_input_ids, test_attention_mask)

predictions = outputs.argmax(dim=1)
my_print(f"Predictions: {predictions}")

2025-01-17 10:26:45 Predictions: tensor([0, 0, 3, 1, 2, 4, 6], device='mps:0')


In [10]:
# Upload the trained classifier to the Hub repo "frugal-ai-text-bert-base"
# (BertClassifier inherits from PyTorchModelHubMixin, which is where
# push_to_hub presumably comes from -- confirm in the huggingface_hub docs).
model_final.push_to_hub("frugal-ai-text-bert-base")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/bd94aa1344798fcf671ddd5f8a7bd4f4dc0b20c4', commit_message='Push model using huggingface_hub.', commit_description='', oid='bd94aa1344798fcf671ddd5f8a7bd4f4dc0b20c4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)

In [9]:
# Upload the matching tokenizer to the same Hub repo name as the model.
tokenizer_final.push_to_hub("frugal-ai-text-bert-base")

README.md:   0%|          | 0.00/320 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base/commit/9814436ad5f77cd8c607aa5dba9b67e7983e8ca7', commit_message='Upload tokenizer', commit_description='', oid='9814436ad5f77cd8c607aa5dba9b67e7983e8ca7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Nonnormalizable/frugal-ai-text-bert-base', endpoint='https://huggingface.co', repo_type='model', repo_id='Nonnormalizable/frugal-ai-text-bert-base'), pr_revision=None, pr_num=None)