{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "import transformers\n",
    "from transformers import (\n",
    "    CONFIG_MAPPING,\n",
    "    MODEL_FOR_CAUSAL_LM_MAPPING,\n",
    "    AutoConfig,\n",
    "    AutoModelForCausalLM,\n",
    "    AutoTokenizer,\n",
    "    HfArgumentParser,\n",
    "    Trainer,\n",
    "    TrainingArguments,\n",
    "    default_data_collator,\n",
    "    is_torch_tpu_available,\n",
    "    set_seed,\n",
    ")\n",
    "\n",
    "from itertools import chain\n",
    "\n",
    "from transformers.testing_utils import CaptureLogger\n",
    "from transformers.trainer_utils import get_last_checkpoint\n",
    "# from transformers.utils import check_min_version, send_example_telemetry\n",
    "from transformers.utils.versions import require_version\n",
    "\n",
    "import datasets\n",
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/transformers/installation.html#installing-from-source`), but the version found is 4.11.3.\nCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other versions of HuggingFace Transformers.",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn [4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcheck_min_version\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m4.23.0.dev0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/transformers/utils/__init__.py:32\u001b[0m, in \u001b[0;36mcheck_min_version\u001b[0;34m(min_version)\u001b[0m\n\u001b[1;32m     30\u001b[0m     error_message \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThis example requires a minimum version of \u001b[39m\u001b[39m{\u001b[39;00mmin_version\u001b[39m}\u001b[39;00m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     31\u001b[0m error_message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m but the version found is \u001b[39m\u001b[39m{\u001b[39;00m__version__\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mImportError\u001b[39;00m(\n\u001b[1;32m     33\u001b[0m     error_message\n\u001b[1;32m     34\u001b[0m     \u001b[39m+\u001b[39m (\n\u001b[1;32m     35\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     36\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mversions of HuggingFace Transformers.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     37\u001b[0m     )\n\u001b[1;32m     38\u001b[0m )\n",
      "\u001b[0;31mImportError\u001b[0m: This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/transformers/installation.html#installing-from-source`), but the version found is 4.11.3.\nCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other versions of HuggingFace Transformers."
     ]
    }
   ],
   "source": [
    "# check_min_version(\"4.23.0.dev0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "require_version(\"datasets>=1.8.0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_seed(37)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Get all of the huggingface objects that we need: tokenzier, gpt2 model, poetry dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration merve--poetry-ca9a13ef5858cc3a\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset csv/merve--poetry to /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ed56ee6b324647798b19ac7bf5accc40",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "32c10441ff20404cb153f6b27f16a829",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/606k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7ca47bc06937463e91d3948d7703ac64",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1631dbdc53d04b14a8a7733883bbd1cc",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0 tables [00:00, ? tables/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset csv downloaded and prepared to /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3c93229d66ad46d9a88da5f6a9528f2e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "raw_datasets = load_dataset(\"merve/poetry\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained('gpt2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "config = AutoConfig.from_pretrained('gpt2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Embedding(50257, 768)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = AutoModelForCausalLM.from_pretrained(\n",
    "    \"gpt2\",\n",
    "    config=config\n",
    ")\n",
    "model.resize_token_embeddings(len(tokenizer))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['author', 'content', 'poem name', 'age', 'type'],\n",
       "    num_rows: 573\n",
       "})"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_datasets['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Mythology & Folklore'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_datasets['train']['type'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['author', 'content', 'poem name', 'age', 'type'],\n",
       "        num_rows: 573\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok_logger = transformers.utils.logging.get_logger(\n",
    "    \"transformers.tokenization_utils_base\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize_function(examples):\n",
    "    with CaptureLogger(tok_logger) as cl:\n",
    "        output = tokenizer(examples[text_column_name])\n",
    "    # clm input could be much much longer than block_size\n",
    "    if \"Token indices sequence length is longer than the\" in cl.out:\n",
    "        tok_logger.warning(\n",
    "            \"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits\"\n",
    "            \" before being passed to the model.\"\n",
    "        )\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "column_names = raw_datasets[\"train\"].column_names\n",
    "# text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n",
    "text_column_name = \"content\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "82c09dbdfa1a47d79607a4c9729fb286",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Running tokenizer on dataset:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (7725 > 1024). Running this sequence through the model will result in indexing errors\n",
      "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model.\n"
     ]
    }
   ],
   "source": [
    "tokenized_datasets = raw_datasets.map(\n",
    "    tokenize_function,\n",
    "    batched=True,\n",
    "    # num_proc=data_args.preprocessing_num_workers,\n",
    "    remove_columns=column_names,\n",
    "    # load_from_cache_file=not data_args.overwrite_cache,\n",
    "    desc=\"Running tokenizer on dataset\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "block_size = tokenizer.model_max_length"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.\n",
    "def group_texts(examples):\n",
    "    # Concatenate all texts.\n",
    "    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
    "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
    "    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
    "    # customize this part to your needs.\n",
    "    if total_length >= block_size:\n",
    "        total_length = (total_length // block_size) * block_size\n",
    "    # Split by chunks of max_len.\n",
    "    result = {\n",
    "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
    "        for k, t in concatenated_examples.items()\n",
    "    }\n",
    "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ca2f64461e304df6aecb16e8cfcd42ac",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Grouping texts in chunks of 1024:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "lm_datasets = tokenized_datasets.map(\n",
    "    group_texts,\n",
    "    batched=True,\n",
    "    # num_proc=data_args.preprocessing_num_workers,\n",
    "    # load_from_cache_file=not data_args.overwrite_cache,\n",
    "    desc=f\"Grouping texts in chunks of {block_size}\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = lm_datasets[\"train\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Do the fine-tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize our Trainer\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    # args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    # eval_dataset=eval_dataset,\n",
    "    tokenizer=tokenizer,\n",
    "    # Data collator will default to DataCollatorWithPadding, so we change it.\n",
    "    data_collator=default_data_collator,\n",
    "    # compute_metrics=compute_metrics\n",
    "    # if training_args.do_eval and not is_torch_tpu_available()\n",
    "    # else None,\n",
    "    # preprocess_logits_for_metrics=preprocess_logits_for_metrics\n",
    "    # if training_args.do_eval and not is_torch_tpu_available()\n",
    "    # else None,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running training *****\n",
      "  Num examples = 171\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 8\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 8\n",
      "  Gradient Accumulation steps = 1\n",
      "  Total optimization steps = 66\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "59ebc6f251bd42e4bd3474b574614d1f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/66 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n",
      "Saving model checkpoint to tmp_trainer\n",
      "Configuration saved in tmp_trainer/config.json\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'train_runtime': 2967.2818, 'train_samples_per_second': 0.173, 'train_steps_per_second': 0.022, 'train_loss': 4.249474265358665, 'epoch': 3.0}\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Model weights saved in tmp_trainer/pytorch_model.bin\n",
      "tokenizer config file saved in tmp_trainer/tokenizer_config.json\n",
      "Special tokens file saved in tmp_trainer/special_tokens_map.json\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "***** train metrics *****\n",
      "  epoch                    =        3.0\n",
      "  train_loss               =     4.2495\n",
      "  train_runtime            = 0:49:27.28\n",
      "  train_samples            =        171\n",
      "  train_samples_per_second =      0.173\n",
      "  train_steps_per_second   =      0.022\n"
     ]
    }
   ],
   "source": [
    "# Training\n",
    "checkpoint = None\n",
    "train_result = trainer.train(resume_from_checkpoint=checkpoint)\n",
    "trainer.save_model()  # Saves the tokenizer too for easy upload\n",
    "\n",
    "metrics = train_result.metrics\n",
    "\n",
    "max_train_samples = (len(train_dataset))\n",
    "metrics[\"train_samples\"] = min(max_train_samples, len(train_dataset))\n",
    "\n",
    "trainer.log_metrics(\"train\", metrics)\n",
    "trainer.save_metrics(\"train\", metrics)\n",
    "trainer.save_state()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.6 ('augmented_poetry')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "00664817f4a09ab74dd392ee5a8d12e3606381c26df296db9ea5c334bb5d1b65"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}