{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "import transformers\n", "from transformers import (\n", " CONFIG_MAPPING,\n", " MODEL_FOR_CAUSAL_LM_MAPPING,\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoTokenizer,\n", " HfArgumentParser,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " is_torch_tpu_available,\n", " set_seed,\n", ")\n", "\n", "from itertools import chain\n", "\n", "from transformers.testing_utils import CaptureLogger\n", "from transformers.trainer_utils import get_last_checkpoint\n", "# from transformers.utils import check_min_version, send_example_telemetry\n", "from transformers.utils.versions import require_version\n", "\n", "import datasets\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "ename": "ImportError", "evalue": "This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/transformers/installation.html#installing-from-source`), but the version found is 4.11.3.\nCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other versions of HuggingFace Transformers.", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn [4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcheck_min_version\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m4.23.0.dev0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File 
\u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/transformers/utils/__init__.py:32\u001b[0m, in \u001b[0;36mcheck_min_version\u001b[0;34m(min_version)\u001b[0m\n\u001b[1;32m 30\u001b[0m error_message \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThis example requires a minimum version of \u001b[39m\u001b[39m{\u001b[39;00mmin_version\u001b[39m}\u001b[39;00m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 31\u001b[0m error_message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m but the version found is \u001b[39m\u001b[39m{\u001b[39;00m__version__\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mImportError\u001b[39;00m(\n\u001b[1;32m 33\u001b[0m error_message\n\u001b[1;32m 34\u001b[0m \u001b[39m+\u001b[39m (\n\u001b[1;32m 35\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 36\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mversions of HuggingFace Transformers.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 37\u001b[0m )\n\u001b[1;32m 38\u001b[0m )\n", "\u001b[0;31mImportError\u001b[0m: This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/transformers/installation.html#installing-from-source`), but the version found is 4.11.3.\nCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other versions of HuggingFace Transformers." 
] } ], "source": [ "# check_min_version(\"4.23.0.dev0\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "require_version(\"datasets>=1.8.0\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "set_seed(37)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Get all of the Hugging Face objects that we need: the tokenizer, the GPT-2 model, and the poetry dataset." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration merve--poetry-ca9a13ef5858cc3a\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset csv/merve--poetry to /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ed56ee6b324647798b19ac7bf5accc40", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/1 [00:00 1024). 
Running this sequence through the model will result in indexing errors\n", "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model.\n" ] } ], "source": [ "tokenized_datasets = raw_datasets.map(\n", "    tokenize_function,\n", "    batched=True,\n", "    # num_proc=data_args.preprocessing_num_workers,\n", "    remove_columns=column_names,\n", "    # load_from_cache_file=not data_args.overwrite_cache,\n", "    desc=\"Running tokenizer on dataset\",\n", ")" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "# Tokenizers without a configured limit report a huge sentinel for model_max_length, which would\n", "# collapse the whole corpus into a single chunk; cap at 1024 like the upstream run_clm.py example.\n", "block_size = min(tokenizer.model_max_length, 1024)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.\n", "def group_texts(examples, chunk_size=None):\n", "    \"\"\"Concatenate each column of a tokenized batch and split it into fixed-size chunks.\n", "\n", "    Args:\n", "        examples: mapping of column name to a list of per-example token lists; must contain \"input_ids\".\n", "        chunk_size: length of each chunk; when None, the notebook-level `block_size` is used.\n", "\n", "    Returns:\n", "        dict with the same columns re-chunked, plus \"labels\" holding independent copies of the\n", "        \"input_ids\" chunks.\n", "\n", "    Raises:\n", "        ValueError: if the effective chunk size is not positive.\n", "    \"\"\"\n", "    if chunk_size is None:\n", "        chunk_size = block_size\n", "    if chunk_size <= 0:\n", "        raise ValueError(f\"chunk size must be positive, got {chunk_size}\")\n", "    # Concatenate all texts.\n", "    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}\n", "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n", "    # Drop the trailing remainder so every chunk is full; a batch shorter than one chunk is kept as a\n", "    # single short chunk instead of being discarded.\n", "    if total_length >= chunk_size:\n", "        total_length = (total_length // chunk_size) * chunk_size\n", "    # Split by chunks of chunk_size.\n", "    result = {\n", "        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]\n", "        for k, t in concatenated_examples.items()\n", "    }\n", "    # Copy chunk by chunk: a shallow .copy() of the outer list would leave labels aliasing input_ids,\n", "    # so masking a label in place would also overwrite the corresponding input token.\n", "    result[\"labels\"] = [list(chunk) for chunk in result[\"input_ids\"]]\n", "    return result" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ca2f64461e304df6aecb16e8cfcd42ac", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Grouping texts in chunks of 1024: 0%| | 0/1 [00:00