{ "cells": [ { "cell_type": "code", "execution_count": 213, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "MOsHUjgdIrIW", "outputId": "f84a093e-147f-470e-aad9-80fb51193c8e", "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: datasets in /opt/conda/lib/python3.10/site-packages (3.2.0)\n", "Requirement already satisfied: transformers[sentencepiece] in /opt/conda/lib/python3.10/site-packages (4.48.1)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from datasets) (3.13.1)\n", "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.26.3)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (19.0.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.2.3)\n", "Requirement already satisfied: requests>=2.32.2 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.66.3 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.67.1)\n", "Requirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets) (3.5.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2023.12.2)\n", "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets) (3.11.11)\n", "Requirement already satisfied: huggingface-hub>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.27.1)\n", "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (2024.11.6)\n", "Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.21.0)\n", "Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.5.2)\n", "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.2.0)\n", "Requirement already satisfied: protobuf in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (5.29.3)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (2.4.4)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.2)\n", "Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (5.0.1)\n", "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from 
aiohttp->datasets) (1.5.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (6.1.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (0.2.1)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.18.3)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.23.0->datasets) (4.9.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (1.26.18)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2023.11.17)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3.post1)\n", "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2025.1)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n" ] } ], "source": [ "!pip install datasets transformers[sentencepiece]" ] }, { "cell_type": "code", "execution_count": 214, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sun Jan 26 12:49:45 2025 \n", "+-----------------------------------------------------------------------------------------+\n", "| NVIDIA-SMI 550.76 Driver Version: 550.76 CUDA Version: 12.4 |\n", "|-----------------------------------------+------------------------+----------------------+\n", "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|=========================================+========================+======================|\n", "| 0 NVIDIA GeForce RTX 4090 On | 00000000:E2:00.0 Off | Off |\n", "| 0% 31C P8 19W / 450W | 1MiB / 24564MiB | 0% Default |\n", "| | | N/A |\n", "+-----------------------------------------+------------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=========================================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "markdown", "metadata": { "id": "HFASsisvIrIb" }, "source": [ "If you're opening this notebook locally, make sure your environment has an install from the last version of Datasets and a source install of Transformers." ] }, { "cell_type": "code", "execution_count": 215, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n", "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n", "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n", "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n" ] } ], "source": [ "!pip install huggingface_hub" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [], "source": [ "!git config --global credential.helper store" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Getting a corpus" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will need texts to train our tokenizer. 
We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download our text data, which can be easily done with the `load_dataset` function:" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [], "source": [ "from datasets import load_dataset" ] },
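{ "cell_type": "markdown", "metadata": {}, "source": [ "The exact corpus-loading call is not shown in the cells that remain, so the snippet below is only a minimal sketch of the `load_dataset` API using a public placeholder dataset. The rest of the notebook assumes two datasets built from the Tibetan corpus, `manual_dataset` and `remaining_dataset`, each with `source`, `target`, and `filename` columns:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Minimal sketch of load_dataset usage; \"wikitext\" is a placeholder dataset,\n", "# not the Tibetan corpus used in the rest of this notebook.\n", "demo_dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train\")\n", "print(demo_dataset[0])" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5af05419ecdb43f9933ce463de99f18a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='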
=0.16.4 in /opt/conda/lib/python3.10/site-packages (from tokenizers) (0.27.1)\n", "Requirement already satisfied: colorama>=0.3.9 in /opt/conda/lib/python3.10/site-packages (from icecream) (0.4.6)\n", "Requirement already satisfied: pygments>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.15.1)\n", "Requirement already satisfied: executing>=2.1.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.2.0)\n", "Requirement already satisfied: asttokens>=2.0.1 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.0.5)\n", "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from asttokens>=2.0.1->icecream) (1.16.0)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.12.2)\n", "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n", "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (1.26.18)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n", "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n" ] } ], "source": [ "!pip install tokenizers icecream" ] }, { "cell_type": "code", "execution_count": 225, "metadata": {}, "outputs": [], "source": [ "from tokenizers import Tokenizer, decoders, pre_tokenizers, trainers\n", "from tokenizers.models import Unigram\n", "from icecream import ic\n", "\n", "tokenizer = Tokenizer(Unigram())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def batch_iterator(dataset, batch_size=1000):\n", " # Yield the pre-segmented \"target\" texts in batches for training\n", " for i in range(0, len(dataset), batch_size):\n", " yield dataset[i : i + batch_size][\"target\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `target` texts in our corpus are already segmented into words with spaces, so a simple `WhitespaceSplit` pre-tokenizer is enough; we also fix the vocabulary size here. (To see how a given string gets split, you can call the pre-tokenizer's `pre_tokenize_str` method.)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab_count = 32000\n", "tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainer = trainers.UnigramTrainer(vocab_size=vocab_count, special_tokens=[\"[CLS]\", \"[SEP]\", \"<unk>\", \"<pad>\", \"[MASK]\"], unk_token=\"<unk>\")\n", "tokenizer.train_from_iterator(batch_iterator(manual_dataset), trainer=trainer)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer.save(f\"./trained_tokenizer_{vocab_count}.json\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the saved tokenizer\n", "tokenizer = Tokenizer.from_file(f\"./trained_tokenizer_{vocab_count}.json\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n", "sep_token_id = tokenizer.token_to_id(\"[SEP]\")" ] }, { "cell_type": "code", "execution_count": 226, "metadata": {}, "outputs": [], "source": [ "from tokenizers import Tokenizer, models, processors, decoders" ] }, { "cell_type": "code", "execution_count": 234, "metadata": {}, "outputs": [], "source": [ "tokenizer.post_processor = processors.TemplateProcessing(\n", " single=\"[CLS]:0 $A:0 [SEP]:0\",\n", " pair=\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n", " special_tokens=[\n", " (\"[CLS]\", cls_token_id),\n", " (\"[SEP]\", sep_token_id),\n", " ],\n", ")\n", "# NOTE: decoders.CTC() targets CTC speech outputs (it collapses repeated\n", "# tokens), so it is a questionable choice for decoding text; joining the\n", "# tokens back together manually may be safer here.\n", "tokenizer.decoder = decoders.CTC()" ] }, { "cell_type": "code", "execution_count": 235, "metadata": {}, "outputs": [], "source": [ "tokenizer_8000 = Tokenizer.from_file(\"./trained_tokenizer_8000.json\")\n", "tokenizer_16000 = Tokenizer.from_file(\"./trained_tokenizer_16000.json\")\n", "tokenizer_32000 = Tokenizer.from_file(\"./trained_tokenizer_32000.json\")" ] }, { "cell_type": "code", "execution_count": 236, "metadata": {}, "outputs": [], "source": [ "from transformers import AlbertTokenizerFast\n", "\n", "tokenizer_8000 = AlbertTokenizerFast(tokenizer_object=tokenizer_8000)\n", "tokenizer_16000 = AlbertTokenizerFast(tokenizer_object=tokenizer_16000)\n", "tokenizer_32000 = AlbertTokenizerFast(tokenizer_object=tokenizer_32000)\n", "##tokenizer_64000 = AlbertTokenizerFast(tokenizer_object=tokenizer_64000)" ] },
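{ "cell_type": "markdown", "metadata": {}, "source": [ "As a quick sanity check (a minimal sketch: `sample` is just a short space-segmented sentence from the corpus), we can run one of the wrapped tokenizers end to end and inspect the tokens, input ids, and attention mask it produces:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative sanity check, assuming the wrapped tokenizer_8000 from above.\n", "sample = \"རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔\"\n", "enc = tokenizer_8000(sample)\n", "print(enc.tokens()) # sub-word tokens, including any special tokens\n", "print(enc[\"input_ids\"]) # the corresponding vocabulary ids\n", "print(enc[\"attention_mask\"]) # all ones for a single unpadded sequence" ] }, { "cell_type": "code", "execution_count": 237, "metadata": {},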
"outputs": [ { "data": { "text/plain": [ "{'source': 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔',\n", " 'target': 'རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔',\n", " 'filename': 'UT3JT13384-005-0028.txt'}" ] }, "execution_count": 237, "metadata": {}, "output_type": "execute_result" } ], "source": [ "remaining_dataset[10]" ] }, { "cell_type": "code", "execution_count": 231, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "ic| data[\"source\"]: 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔'\n", "ic| tokenized_data_8000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n", "ic| tokenized_data_16000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n", "ic| tokenized_data_32000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n", "ic| data[\"source\"]: 'ཉམས་སྐྱེས་པ་ན་རླུང་སེམས་དྲག་ཏུ་གཅུན༔'\n", "ic| tokenized_data_8000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུ', 'ན', '༔']\n", "ic| tokenized_data_16000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n", "ic| tokenized_data_32000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n" ] } ], "source": [ "for index in range(10, len(remaining_dataset)):\n", " data = remaining_dataset[index]\n", " if index == 12:\n", " break\n", " ic(data[\"source\"])\n", " tokenized_data_8000 = tokenizer_8000.tokenize(data[\"source\"])\n", " ic(tokenized_data_8000)\n", " tokenized_data_16000 = tokenizer_16000.tokenize(data[\"source\"])\n", " ic(tokenized_data_16000)\n", " tokenized_data_32000 = tokenizer_32000.tokenize(data[\"source\"])\n", " ic(tokenized_data_32000)\n", " \n" ] }, { "cell_type": "code", "execution_count": 240, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "ic| data: '༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།'\n", "ic| tokenized_data_8000: ['༸གོང་ས་',\n", " 'མཆོག་',\n", " 'གི་',\n", " 'བོད་',\n", " 'དོན་',\n", " 'འཐབ་རྩོད་',\n", " 'དང་',\n", " 'འབྲེལ་བ',\n", " 'འི་',\n", " 'ཕྱག་',\n", " 'དེབ་',\n", " 'གསར་པ་',\n", " 'ཞིག་',\n", " 'ཕྱི་',\n", " 'ཟླ་',\n", " 'གསུམ་པ',\n", " 'འི་',\n", " 'ནང་',\n", " 'འདོན་',\n", " 'སྤེལ་',\n", " 'གནང་',\n", " 'རྒྱུ',\n", " '།']\n", "ic| tokenized_data_16000: ['༸གོང་ས་',\n", " 'མཆོག་',\n", " 'གི་',\n", " 'བོད་',\n", " 'དོན་',\n", " 'འཐབ་རྩོད་',\n", " 'དང་',\n", " 'འབྲེལ་བ',\n", " 'འི་',\n", " 'ཕྱག་',\n", " 'དེབ་',\n", " 'གསར་པ་',\n", " 'ཞིག་',\n", " 'ཕྱི་ཟླ',\n", " '་',\n", " 'གསུམ་པ',\n", " 'འི་',\n", " 'ནང་',\n", " 'འདོན་',\n", " 'སྤེལ་',\n", " 'གནང་',\n", " 'རྒྱུ',\n", " '།']\n", "ic| tokenized_data_32000: ['༸གོང་ས་',\n", " 'མཆོག་',\n", " 'གི་',\n", " 'བོད་',\n", " 'དོན་',\n", " 'འཐབ་རྩོད་',\n", " 'དང་',\n", " 'འབྲེལ་བ',\n", " 'འི་',\n", " 'ཕྱག་',\n", " 'དེབ་',\n", " 'གསར་པ་',\n", " 'ཞིག་',\n", " 'ཕྱི་ཟླ',\n", " '་',\n", " 'གསུམ་པ',\n", " 'འི་',\n", " 'ནང་',\n", " 'འདོན་',\n", " 'སྤེལ་',\n", " 'གནང་',\n", " 'རྒྱུ',\n", " '།']\n", "ic| tokenizer_8000.encode(data): [0,\n", " 2163,\n", " 152,\n", " 25,\n", " 201,\n", " 47,\n", " 3426,\n", " 9,\n", " 662,\n", " 7,\n", " 267,\n", " 1522,\n", " 2426,\n", " 59,\n", " 256,\n", " 636,\n", " 348,\n", " 7,\n", " 85,\n", " 1067,\n", " 1238,\n", " 717,\n", " 246,\n", " 5,\n", " 1]\n" ] }, { "data": { "text/plain": [ "[0,\n", " 2163,\n", " 152,\n", " 25,\n", " 201,\n", " 47,\n", " 3426,\n", " 9,\n", " 662,\n", " 7,\n", " 267,\n", " 1522,\n", " 2426,\n", " 59,\n", " 256,\n", " 636,\n", " 348,\n", " 7,\n", " 85,\n", " 1067,\n", " 1238,\n", " 717,\n", " 246,\n", " 5,\n", " 1]" ] }, 
"execution_count": 240, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = \"༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།\"\n", "ic(data) \n", "tokenized_data_8000 = tokenizer_8000.tokenize(data)\n", "ic(tokenized_data_8000)\n", "tokenized_data_16000 = tokenizer_16000.tokenize(data)\n", "ic(tokenized_data_16000)\n", "tokenized_data_32000 = tokenizer_32000.tokenize(data)\n", "ic(tokenized_data_32000)\n", "ic(tokenizer_8000.encode(data))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_data" ] }, { "cell_type": "code", "execution_count": 166, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']" ] }, "execution_count": 166, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_data_16000" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenized_data_32000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Use your new tokenizer to train a language model!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can either use your new tokenizer in the language modeling from scratch notebook [Link to come] or use the `--tokenizer_name` argument in the [language modeling scripts](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) to use it there to train a model from scratch." ] }, { "cell_type": "code", "execution_count": 241, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a22f05e792ee40b79bf097340ae38a2a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
406\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/requests/models.py:1024\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1024\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n", "\u001b[0;31mHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create", "\nThe above exception was the direct cause of the following exception:\n", "\u001b[0;31mHfHubHTTPError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[253], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_hub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m login, create_repo, Repository\n\u001b[1;32m 3\u001b[0m repo_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mta4tsering/NLP-Unigram_language_model_tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m repo_url \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_repo\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprivate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository created: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py:3525\u001b[0m, in \u001b[0;36mHfApi.create_repo\u001b[0;34m(self, repo_id, token, private, repo_type, exist_ok, resource_group_id, space_sdk, space_hardware, space_storage, space_sleep_time, space_secrets, space_variables)\u001b[0m\n\u001b[1;32m 3522\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 3524\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3525\u001b[0m 
\u001b[43mhf_raise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43mr\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3526\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 3527\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m exist_ok \u001b[38;5;129;01mand\u001b[39;00m err\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m409\u001b[39m:\n\u001b[1;32m 3528\u001b[0m \u001b[38;5;66;03m# Repo already exists and `exist_ok=True`\u001b[39;00m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:477\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, message, response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 475\u001b[0m \u001b[38;5;66;03m# Convert `HTTPError` into a `HfHubHTTPError` to display request information\u001b[39;00m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;66;03m# as well (request id and/or server error message)\u001b[39;00m\n\u001b[0;32m--> 477\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, \u001b[38;5;28mstr\u001b[39m(e), response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n", "\u001b[0;31mHfHubHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo" ] } ], "source": [ "from huggingface_hub import login, create_repo, Repository\n", "\n", "repo_name = \"ta4tsering/NLP-Unigram_language_model_tokenizer\"\n", "# exist_ok=True makes this call a no-op instead of a 409 Conflict when the\n", "# repo already exists (the error above was raised without it).\n", "repo_url = create_repo(repo_name, repo_type=\"model\", private=False, exist_ok=True)\n", "print(f\"Repository created: {repo_url}\")" ] }, { "cell_type": "code", "execution_count": 248, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n", "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n", "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n", "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n", "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n", "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages 
(from requests->huggingface_hub) (2023.11.17)\n", "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n" ] } ], "source": [ "!pip install --upgrade huggingface_hub" ] }, { "cell_type": "code", "execution_count": 256, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:131: FutureWarning: 'Repository' (from 'huggingface_hub.repository') is deprecated and will be removed from version '1.0'. Please prefer the http-based alternatives instead. Given its large adoption in legacy code, the complete removal is only planned on next major release.\n", "For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.\n", " warnings.warn(warning_message, FutureWarning)\n" ] }, { "ename": "OSError", "evalue": "Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:592\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 592\u001b[0m lfs_version \u001b[38;5;241m=\u001b[39m \u001b[43mrun_subprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit-lfs --version\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocal_dir\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_subprocess.py:83\u001b[0m, in \u001b[0;36mrun_subprocess\u001b[0;34m(command, folder, check, **kwargs)\u001b[0m\n\u001b[1;32m 81\u001b[0m folder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(folder)\n\u001b[0;32m---> 83\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mstderr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43mstdout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 89\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreplace\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# if not utf-8, replace char by �\u001b[39;49;00m\n\u001b[1;32m 90\u001b[0m \u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfolder\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetcwd\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 91\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 92\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:503\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 501\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstderr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m PIPE\n\u001b[0;32m--> 503\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpopenargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:971\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize)\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mTextIOWrapper(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr,\n\u001b[1;32m 969\u001b[0m encoding\u001b[38;5;241m=\u001b[39mencoding, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m--> 971\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_child\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexecutable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpreexec_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclose_fds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mpass_fds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartupinfo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreationflags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 974\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mp2cread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mp2cwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[43mc2pread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mc2pwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43merrread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43mrestore_signals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mgid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mumask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mstart_new_session\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 980\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 981\u001b[0m \u001b[38;5;66;03m# Cleanup if the child failed starting.\u001b[39;00m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:1863\u001b[0m, in \u001b[0;36mPopen._execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)\u001b[0m\n\u001b[1;32m 1862\u001b[0m err_msg \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mstrerror(errno_num)\n\u001b[0;32m-> 1863\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(errno_num, err_msg, err_filename)\n\u001b[1;32m 1864\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(err_msg)\n", "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'git-lfs'", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[256], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/home\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m repo \u001b[38;5;241m=\u001b[39m \u001b[43mRepository\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclone_from\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_url\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File 
\u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:132\u001b[0m, in \u001b[0;36m_deprecate_method.._inner_deprecate_method..inner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 130\u001b[0m warning_message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m message\n\u001b[1;32m 131\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(warning_message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:522\u001b[0m, in \u001b[0;36mRepository.__init__\u001b[0;34m(self, local_dir, clone_from, repo_type, token, git_user, git_email, revision, skip_lfs_files, client)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mskip_lfs_files \u001b[38;5;241m=\u001b[39m skip_lfs_files\n\u001b[1;32m 520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient \u001b[38;5;241m=\u001b[39m client \u001b[38;5;28;01mif\u001b[39;00m client \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m HfApi()\n\u001b[0;32m--> 522\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_git_versions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhuggingface_token: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m token\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:594\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 592\u001b[0m lfs_version \u001b[38;5;241m=\u001b[39m run_subprocess(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgit-lfs --version\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlocal_dir)\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n\u001b[0;32m--> 594\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 595\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLooks like you do not have git-lfs installed, please install.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 596\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m You can install from https://git-lfs.github.com/.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Then run `git lfs install` (you only have to do this once).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 598\u001b[0m )\n\u001b[1;32m 599\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(git_version \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m 
\u001b[38;5;241m+\u001b[39m lfs_version)\n", "\u001b[0;31mOSError\u001b[0m: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once)." ] } ], "source": [ "# NOTE: Repository is deprecated in huggingface_hub and needs git-lfs on the\n", "# machine; the error above comes from git-lfs being missing.\n", "repo_path = \"/home\"\n", "repo = Repository(local_dir=repo_path, clone_from=repo_url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "from huggingface_hub import upload_file\n", "\n", "# Define the local folder and repo_id\n", "folder_path = \"/home/NLP-Unigram_language_model_tokenizer/\" # Local folder path (should match your repo_id)\n", "repo_id = \"ta4tsering/NLP-Unigram_language_model_tokenizer\" # Replace with your Hugging Face repo ID\n", "\n", "# Iterate through all files in the folder\n", "for root, _, files in os.walk(folder_path):\n", " for file_name in files:\n", " local_file_path = os.path.join(root, file_name)\n", " repo_file_path = os.path.relpath(local_file_path, folder_path) # Keep folder structure\n", "\n", " # Upload file to the repo\n", " upload_file(\n", " path_or_fileobj=local_file_path,\n", " path_in_repo=repo_file_path,\n", " repo_id=repo_id,\n", " repo_type=\"model\",\n", " commit_message=f\"Add {repo_file_path}\",\n", " )\n", " print(f\"Uploaded {repo_file_path}\")\n" ] }, { "cell_type": "code", "execution_count": 257, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "git: 'lfs' is not a git command. See 'git --help'.\n", "\n", "The most similar command is\n", "\tlog\n" ] } ], "source": [ "!git lfs install" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This push only works if the Repository clone above succeeded (it did not\n", "# here, since git-lfs is unavailable); see the HTTP-based upload below.\n", "repo.push_to_hub(commit_message=\"Initial commit\")\n", "print(\"Files pushed to Hugging Face!\")" ] },
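{ "cell_type": "markdown", "metadata": {}, "source": [ "Since `Repository` is deprecated and git-lfs is missing in this environment (as the failed `git lfs install` above shows), the tokenizer files can also be pushed over HTTP. A minimal sketch, reusing the `repo_id` and `folder_path` defined for the upload loop above:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from huggingface_hub import upload_folder\n", "\n", "# HTTP-based equivalent of the per-file loop above: pushes every file in\n", "# folder_path to the model repo in a single commit, without git or git-lfs.\n", "upload_folder(\n", " repo_id=repo_id,\n", " folder_path=folder_path,\n", " repo_type=\"model\",\n", " commit_message=\"Upload tokenizer files\",\n", ")" ] } ], "metadata": { "colab": { "name": "Train your tokenizer", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }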