{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "5ea2cd46-5e4c-453c-bbef-69f3b3411765", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/user/home/dc.tavares/.conda/envs/ws2024/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import os\n", "import numpy as np\n", "\n", "# import transformers\n", "from transformers import (\n", " AutoModelForSequenceClassification,\n", " AutoTokenizer,\n", " Trainer,\n", " TrainingArguments,\n", ")\n", "from datasets import load_metric\n", "\n", "from dataset_loader import IntentDataset" ] }, { "cell_type": "code", "execution_count": 2, "id": "dd7d77de-a96c-43da-973e-9185e596ecd5", "metadata": {}, "outputs": [], "source": [ "# transformers.logging.set_verbosity_info()\n", "# transformers.logging.set_verbosity_error() \n", "# We set the verbosity to error to avoid the annoying huggingface warnings \n", "# when loading models before training them. If you're having trouble getting things to work\n", "# maybe comment that line (setting the verbosity to info also may lead to interesting outputs!)\n", "# os.environ['TOKENIZERS_PARALLELISM'] = \"false\" # trainer (?) was complaining about parallel tokenization\n", "# os.environ[\"WANDB_DISABLED\"] = \"true\" # trainer was complaining about wandb" ] }, { "cell_type": "code", "execution_count": 3, "id": "1d62015d-faa8-452f-a1bd-63da4f88b90f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/user/home/dc.tavares/.conda/envs/ws2024/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n" ] } ], "source": [ "model_checkpoint_name = 'roberta-base' # try 'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased'\n", "dataset_name = 'twiz-data' # rename to your dataset dir\n", "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name) # loads a tokenizer\n", "tokenizer.save_pretrained(\"tokenizer\")" ] }, { "cell_type": "code", "execution_count": 4, "id": "0d97d9ef-7412-402e-92cb-cf4c666e2cdb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded Intent detection dataset. 5916 examples. (train). \n", "Loaded Intent detection dataset. 819 examples. (val). 
\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "train_dataset = IntentDataset(dataset_name, tokenizer, 'train') # check twiz_dataset.py for dataset loading code\n", "val_dataset = IntentDataset(dataset_name, tokenizer, 'val')\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_name, num_labels=len(train_dataset.all_intents)) # Loads the BERT model weights" ] }, { "cell_type": "code", "execution_count": 10, "id": "14adcad7-37ea-480d-85f4-f69e2ea1d431", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "All data keys: dict_keys(['input_ids', 'attention_mask', 'label'])\n", "tensor([ 0, 6715, 28, 7316, 77, 634, 143, 3270, 50, 2104,\n", " 4, 9427, 6, 1078, 78, 328, 1398, 16, 103, 335,\n", " 59, 26157, 8, 42446, 11182, 102, 4, 85, 34, 10,\n", " 204, 4, 398, 999, 691, 4, 1437, 85, 16, 2319,\n", " 7, 185, 59, 1718, 728, 479, 85, 4542, 204, 4,\n", " 3139, 9600, 672, 16, 18609, 4, 1437, 318, 42, 16,\n", " 45, 1341, 99, 47, 32, 546, 13, 224, 6, 213,\n", " 124, 4, 598, 535, 5, 3685, 6, 95, 224, 6,\n", " 311, 7075, 4, 2, 2, 12005, 7075, 2, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) torch.Size([180])\n" ] }, { "data": { "text/plain": [ "(tensor(29), 'IngredientsConfirmationIntent')" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inspect_index = 0\n", "print('All data keys:', train_dataset[inspect_index].keys())\n", "print(train_dataset[inspect_index]['input_ids'], train_dataset[inspect_index]['input_ids'].shape)\n", "# you can check the correspondence of a label by checking the all_intents attribute, as such:\n", "train_dataset[inspect_index]['label'], train_dataset.all_intents[train_dataset[inspect_index]['label']]" ] }, { "cell_type": "code", "execution_count": 6, "id": "efd44ee5-19fa-434b-b187-b2b219b0f472", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_432924/3219055009.py:1: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate\n", " acc = load_metric('accuracy')\n", "/user/home/dc.tavares/.conda/envs/ws2024/lib/python3.10/site-packages/datasets/load.py:759: FutureWarning: The repository for accuracy contains custom code which must be executed to correctly load the metric. 
You can inspect the repository content at https://raw.githubusercontent.com/huggingface/datasets/2.19.1/metrics/accuracy/accuracy.py\n", "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", "Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.\n", " warnings.warn(\n", "Using the latest cached version of the module from /user/home/dc.tavares/.cache/huggingface/modules/datasets_modules/metrics/accuracy/bbddc2dafac9b46b0aeeb39c145af710c55e03b223eae89dfe86388f40d9d157 (last modified on Wed May 18 17:06:59 2022) since it couldn't be found locally at accuracy, or remotely on the Hugging Face Hub.\n" ] } ], "source": [ "acc = load_metric('accuracy')\n", "def compute_metrics(eval_pred):\n", " logits, labels = eval_pred\n", " predictions = np.argmax(logits, axis=-1)\n", " accuracy = acc.compute(predictions=predictions, references=labels)\n", " return accuracy\n", "\n", "def get_trainer(model):\n", " return Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=val_dataset,\n", " compute_metrics=compute_metrics,\n", " )\n", "\n", "training_args = TrainingArguments(\n", " output_dir='roberta-based',\n", " do_train=True,\n", " do_eval=True,\n", " evaluation_strategy='epoch',\n", " save_strategy='epoch',\n", " logging_strategy='epoch',\n", " metric_for_best_model='accuracy',\n", " learning_rate=2e-5,\n", " num_train_epochs=5,\n", " weight_decay=0.01,\n", " per_device_train_batch_size=32,\n", " per_device_eval_batch_size=32,\n", " load_best_model_at_end=True,\n", " disable_tqdm=False,\n", ")\n", "\n", "trainer = get_trainer(model)" ] }, { "cell_type": "code", "execution_count": 7, "id": "4246f805-195b-47dd-9216-9eb5a3a0bcac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
<table border=\"1\" class=\"dataframe\">\n",
"  <thead>\n",
"    <tr style=\"text-align: left;\">\n",
"      <th>Epoch</th>\n",
"      <th>Training Loss</th>\n",
"      <th>Validation Loss</th>\n",
"      <th>Accuracy</th>\n",
"    </tr>\n",
"  </thead>\n",
"  <tbody>\n",
"    <tr>\n",
"      <td>1</td>\n",
"      <td>1.733200</td>\n",
"      <td>1.017632</td>\n",
"      <td>0.799756</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <td>2</td>\n",
"      <td>0.676700</td>\n",
"      <td>0.734118</td>\n",
"      <td>0.829060</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <td>3</td>\n",
"      <td>0.446900</td>\n",
"      <td>0.668322</td>\n",
"      <td>0.847375</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <td>4</td>\n",
"      <td>0.343500</td>\n",
"      <td>0.640882</td>\n",
"      <td>0.852259</td>\n",
"    </tr>\n",
"    <tr>\n",
"      <td>5</td>\n",
"      <td>0.282900</td>\n",
"      <td>0.641061</td>\n",
"      <td>0.857143</td>\n",
"    </tr>\n",
"  </tbody>\n",
"</table>
"
],
"text/plain": [
"