File size: 61,342 Bytes

78a33e3

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "MOsHUjgdIrIW",
    "outputId": "f84a093e-147f-470e-aad9-80fb51193c8e",
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: datasets in /opt/conda/lib/python3.10/site-packages (3.2.0)\n",
      "Requirement already satisfied: transformers[sentencepiece] in /opt/conda/lib/python3.10/site-packages (4.48.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from datasets) (3.13.1)\n",
      "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.26.3)\n",
      "Requirement already satisfied: pyarrow>=15.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (19.0.0)\n",
      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.3.8)\n",
      "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.2.3)\n",
      "Requirement already satisfied: requests>=2.32.2 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.66.3 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.67.1)\n",
      "Requirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets) (3.5.0)\n",
      "Requirement already satisfied: multiprocess<0.70.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.70.16)\n",
      "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2023.12.2)\n",
      "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets) (3.11.11)\n",
      "Requirement already satisfied: huggingface-hub>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.27.1)\n",
      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0.1)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (2024.11.6)\n",
      "Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.21.0)\n",
      "Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.5.2)\n",
      "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.2.0)\n",
      "Requirement already satisfied: protobuf in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (5.29.3)\n",
      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (2.4.4)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.2)\n",
      "Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (5.0.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (23.1.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.5.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (6.1.0)\n",
      "Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (0.2.1)\n",
      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.18.3)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.23.0->datasets) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2023.11.17)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3.post1)\n",
      "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2025.1)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sun Jan 26 12:49:45 2025       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 550.76                 Driver Version: 550.76         CUDA Version: 12.4     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 4090        On  |   00000000:E2:00.0 Off |                  Off |\n",
      "|  0%   31C    P8             19W /  450W |       1MiB /  24564MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|  No running processes found                                                             |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HFASsisvIrIb"
   },
   "source": [
    "If you're opening this notebook locally, make sure your environment has an install from the last version of Datasets and a source install of Transformers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "!git config --global credential.helper store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting a corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will need texts to train our tokenizer. We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download our text data, which can be easily done with the `load_dataset` function:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5af05419ecdb43f9933ce463de99f18a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"openpecha/deduplication_combined_word_seg_data\", name=\"\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "manual_dataset = dataset.filter(lambda x: x[\"filename\"] == \"manual_data.json\", num_proc=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': 'ད་ལྟར་བ་ཡོད་ཅི་ཞིག་མེད། །གང་གི་དུས་ཀུན་ཡོད་ཉིད་པ། །དེ་ཡི་མི་རྟག་ཉིད་གང་ལས། ། འདས་པ་ལས་ནི་འདས་གྱུར་པ། །ཅི་ཡི་ཕྱིར་ན་འདས་པར་འགྱུར། ། འདས་པ་ལས་ནི་མ་འདས་པ། །ཅི་ཡི་ཕྱིར་ན་འདས་པར་འགྱུར། །གལ་ཏེ་མ་འོངས་སྐྱེས་ཡོད་ན། །ཇི་ལྟར་ད་ལྟར་བར་མི་འགྱུར། །ཅི་སྟེ་དེ་ལ་སྐྱེ་མེད་ན། །མ་འོངས་རྟག་པར་འགྱུར་རམ་ཅི། ། སྐྱེ་བ་མེད་ཀྱང་འཇིག་པ་ལས། །གལ་ཏེ་མ་འོངས་མི་རྟག་ན། །འདས་ལ་འཇིག་པ་ཡོད་མིན་ཏེ། །དེ་ནི་རྟག་པར་ཅིས་མི་རྟོག། འདས་པ་དང་ནི་ད་ལྟར་བ། །འདི་ནི་མི་རྟག་འགྱུར་མིན་ལ། །',\n",
       " 'target': 'ད་ལྟར་ བ་ ཡོད་ ཅི་ཞིག་ མེད ། ། གང་ གི་ དུས་ ཀུན་ ཡོད་ ཉིད་པ ། ། དེ་ ཡི་ མི་ རྟག་ ཉིད་ གང་ ལས ། ། འདས་པ་ ལས་ ནི་ འདས་ གྱུར་པ ། ། ཅི་ ཡི་ ཕྱིར་ ན་ འདས་པ ར་ འགྱུར ། ། འདས་པ་ ལས་ ནི་ མ་ འདས་པ ། ། ཅི་ ཡི་ ཕྱིར་ ན་ འདས་པ ར་ འགྱུར ། ། གལ་ཏེ་ མ་འོངས་ སྐྱེས་ ཡོད་ ན ། ། ཇི་ལྟར་ ད་ལྟར་བ ར་མི་ འགྱུར ། ། ཅི་སྟེ་ དེ་ ལ་ སྐྱེ་ མེད་ ན ། ། མ་འོངས་ རྟག་པ ར་ འགྱུར་ རམ་ ཅི ། ། སྐྱེ་བ་ མེད་ ཀྱང་ འཇིག་པ་ ལས ། ། གལ་ཏེ་ མ་འོངས་ མི་ རྟག་ ན ། ། འདས་ ལ་ འཇིག་པ་ ཡོད་ མིན་ ཏེ ། ། དེ་ ནི་ རྟག་པ ར་ ཅི ས་ མི་ རྟོག ། འདས་པ་ དང་ ནི་ ད་ལྟ ར ་བ ། ། འདི་ ནི་ མི་ རྟག་ འགྱུར་ མིན་ ལ ། །',\n",
       " 'filename': 'manual_data.json'}"
      ]
     },
     "execution_count": 221,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "manual_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20278"
      ]
     },
     "execution_count": 232,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(manual_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [],
   "source": [
    "remaining_dataset = dataset.filter(lambda x: x[\"filename\"] != \"manual_data.json\", num_proc=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': ['གཙོ་མོའི་མགྲིན་པར་ཨྃ་དམར་པོ་འབར་བ་ལ་སེམས་གཟུང་༔'],\n",
       " 'target': ['གཙོ་ མོ འི་ མགྲིན་པ ར་ ཨྃ་ དམར་པོ་ འབར་བ་ ལ་ སེམས་ གཟུང་ ༔'],\n",
       " 'filename': ['UT3JT13384-005-0028.txt']}"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "remaining_dataset[9:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unigram model like Albert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's now have a look at how we can create a Unigram tokenizer like the one used for training T5. The first step is to create a `Tokenizer` with an empty `Unigram` model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: tokenizers in /opt/conda/lib/python3.10/site-packages (0.21.0)\n",
      "Requirement already satisfied: icecream in /opt/conda/lib/python3.10/site-packages (2.1.4)\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/conda/lib/python3.10/site-packages (from tokenizers) (0.27.1)\n",
      "Requirement already satisfied: colorama>=0.3.9 in /opt/conda/lib/python3.10/site-packages (from icecream) (0.4.6)\n",
      "Requirement already satisfied: pygments>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.15.1)\n",
      "Requirement already satisfied: executing>=2.1.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.2.0)\n",
      "Requirement already satisfied: asttokens>=2.0.1 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.0.5)\n",
      "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from asttokens>=2.0.1->icecream) (1.16.0)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.12.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install tokenizers icecream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import Tokenizer, decoders\n",
    "from tokenizers.models import Unigram\n",
    "from tokenizers import pre_tokenizers\n",
    "from tokenizers.pre_tokenizers import WhitespaceSplit\n",
    "from tokenizers import trainers\n",
    "from icecream import ic\n",
    "\n",
    "tokenizer = Tokenizer(Unigram())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batch_iterator(dataset):\n",
    "    for i in range(0, len(dataset), batch_size):\n",
    "        yield dataset[i : i + batch_size][\"target\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we want to have a quick look at how it preprocesses the inputs, we can call the `pre_tokenize_str` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_count=32000\n",
    "tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = trainers.UnigramTrainer(vocab_size=vocab_count, special_tokens=[\"[CLS]\", \"[SEP]\", \"<unk>\", \"<pad>\", \"[MASK]\"], unk_token=\"<unk>\")\n",
    "tokenizer.train_from_iterator(batch_iterator(manual_dataset), trainer=trainer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(f\"./trained_tokenizer_{vocab_count}.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved tokenizer\n",
    "tokenizer = Tokenizer.from_file(f\"./trained_tokenizer_{vocab_count}.json\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n",
    "sep_token_id = tokenizer.token_to_id(\"[SEP]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import processors\n",
    "from tokenizers import Tokenizer, models, processors, decoders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.post_processor = processors.TemplateProcessing(\n",
    "    single=\"[CLS]:0 $A:0 [SEP]:0\",\n",
    "    pair=\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n",
    "    special_tokens=[\n",
    "        (\"[CLS]\", cls_token_id),\n",
    "        (\"[SEP]\", sep_token_id),\n",
    "    ],\n",
    ")\n",
    "tokenizer.decoder = decoders.CTC()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer_8000 = Tokenizer.from_file(f\"./trained_tokenizer_8000.json\")\n",
    "tokenizer_16000 = Tokenizer.from_file(f\"./trained_tokenizer_16000.json\")\n",
    "tokenizer_32000 = Tokenizer.from_file(f\"./trained_tokenizer_32000.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AlbertTokenizerFast\n",
    "\n",
    "tokenizer_8000 = AlbertTokenizerFast(tokenizer_object=tokenizer_8000)\n",
    "tokenizer_16000 = AlbertTokenizerFast(tokenizer_object=tokenizer_16000)\n",
    "tokenizer_32000 = AlbertTokenizerFast(tokenizer_object=tokenizer_32000)\n",
    "##tokenizer_64000 = AlbertTokenizerFast(tokenizer_object=tokenizer_64000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔',\n",
       " 'target': 'རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔',\n",
       " 'filename': 'UT3JT13384-005-0028.txt'}"
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "remaining_dataset[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ic| data[\"source\"]: 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔'\n",
      "ic| tokenized_data_8000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
      "ic| tokenized_data_16000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
      "ic| tokenized_data_32000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
      "ic| data[\"source\"]: 'ཉམས་སྐྱེས་པ་ན་རླུང་སེམས་དྲག་ཏུ་གཅུན༔'\n",
      "ic| tokenized_data_8000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུ', 'ན', '༔']\n",
      "ic| tokenized_data_16000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n",
      "ic| tokenized_data_32000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n"
     ]
    }
   ],
   "source": [
    "for index in range(10, len(remaining_dataset)):\n",
    "    data = remaining_dataset[index]\n",
    "    if index == 12:\n",
    "        break\n",
    "    ic(data[\"source\"])\n",
    "    tokenized_data_8000 = tokenizer_8000.tokenize(data[\"source\"])\n",
    "    ic(tokenized_data_8000)\n",
    "    tokenized_data_16000 = tokenizer_16000.tokenize(data[\"source\"])\n",
    "    ic(tokenized_data_16000)\n",
    "    tokenized_data_32000 = tokenizer_32000.tokenize(data[\"source\"])\n",
    "    ic(tokenized_data_32000)\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ic| data: '༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།'\n",
      "ic| tokenized_data_8000: ['༸གོང་ས་',\n",
      "                          'མཆོག་',\n",
      "                          'གི་',\n",
      "                          'བོད་',\n",
      "                          'དོན་',\n",
      "                          'འཐབ་རྩོད་',\n",
      "                          'དང་',\n",
      "                          'འབྲེལ་བ',\n",
      "                          'འི་',\n",
      "                          'ཕྱག་',\n",
      "                          'དེབ་',\n",
      "                          'གསར་པ་',\n",
      "                          'ཞིག་',\n",
      "                          'ཕྱི་',\n",
      "                          'ཟླ་',\n",
      "                          'གསུམ་པ',\n",
      "                          'འི་',\n",
      "                          'ནང་',\n",
      "                          'འདོན་',\n",
      "                          'སྤེལ་',\n",
      "                          'གནང་',\n",
      "                          'རྒྱུ',\n",
      "                          '།']\n",
      "ic| tokenized_data_16000: ['༸གོང་ས་',\n",
      "                           'མཆོག་',\n",
      "                           'གི་',\n",
      "                           'བོད་',\n",
      "                           'དོན་',\n",
      "                           'འཐབ་རྩོད་',\n",
      "                           'དང་',\n",
      "                           'འབྲེལ་བ',\n",
      "                           'འི་',\n",
      "                           'ཕྱག་',\n",
      "                           'དེབ་',\n",
      "                           'གསར་པ་',\n",
      "                           'ཞིག་',\n",
      "                           'ཕྱི་ཟླ',\n",
      "                           '་',\n",
      "                           'གསུམ་པ',\n",
      "                           'འི་',\n",
      "                           'ནང་',\n",
      "                           'འདོན་',\n",
      "                           'སྤེལ་',\n",
      "                           'གནང་',\n",
      "                           'རྒྱུ',\n",
      "                           '།']\n",
      "ic| tokenized_data_32000: ['༸གོང་ས་',\n",
      "                           'མཆོག་',\n",
      "                           'གི་',\n",
      "                           'བོད་',\n",
      "                           'དོན་',\n",
      "                           'འཐབ་རྩོད་',\n",
      "                           'དང་',\n",
      "                           'འབྲེལ་བ',\n",
      "                           'འི་',\n",
      "                           'ཕྱག་',\n",
      "                           'དེབ་',\n",
      "                           'གསར་པ་',\n",
      "                           'ཞིག་',\n",
      "                           'ཕྱི་ཟླ',\n",
      "                           '་',\n",
      "                           'གསུམ་པ',\n",
      "                           'འི་',\n",
      "                           'ནང་',\n",
      "                           'འདོན་',\n",
      "                           'སྤེལ་',\n",
      "                           'གནང་',\n",
      "                           'རྒྱུ',\n",
      "                           '།']\n",
      "ic| tokenizer_8000.encode(data): [0,\n",
      "                                  2163,\n",
      "                                  152,\n",
      "                                  25,\n",
      "                                  201,\n",
      "                                  47,\n",
      "                                  3426,\n",
      "                                  9,\n",
      "                                  662,\n",
      "                                  7,\n",
      "                                  267,\n",
      "                                  1522,\n",
      "                                  2426,\n",
      "                                  59,\n",
      "                                  256,\n",
      "                                  636,\n",
      "                                  348,\n",
      "                                  7,\n",
      "                                  85,\n",
      "                                  1067,\n",
      "                                  1238,\n",
      "                                  717,\n",
      "                                  246,\n",
      "                                  5,\n",
      "                                  1]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0,\n",
       " 2163,\n",
       " 152,\n",
       " 25,\n",
       " 201,\n",
       " 47,\n",
       " 3426,\n",
       " 9,\n",
       " 662,\n",
       " 7,\n",
       " 267,\n",
       " 1522,\n",
       " 2426,\n",
       " 59,\n",
       " 256,\n",
       " 636,\n",
       " 348,\n",
       " 7,\n",
       " 85,\n",
       " 1067,\n",
       " 1238,\n",
       " 717,\n",
       " 246,\n",
       " 5,\n",
       " 1]"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = \"༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།\"\n",
    "ic(data) \n",
    "tokenized_data_8000 = tokenizer_8000.tokenize(data)\n",
    "ic(tokenized_data_8000)\n",
    "tokenized_data_16000 = tokenizer_16000.tokenize(data)\n",
    "ic(tokenized_data_16000)\n",
    "tokenized_data_32000 = tokenizer_32000.tokenize(data)\n",
    "ic(tokenized_data_32000)\n",
    "ic(tokenizer_8000.encode(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_data_16000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_data_32000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use your new tokenizer to train a language model!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can either use your new tokenizer in the language modeling from scratch notebook [Link to come] or use the `--tokenizer_name` argument in the [language modeling scripts](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) to use it there to train a model from scratch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a22f05e792ee40b79bf097340ae38a2a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "notebook_login()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [
    {
     "ename": "HfHubHTTPError",
     "evalue": "409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mHTTPError\u001b[0m                                 Traceback (most recent call last)",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:406\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m    405\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 406\u001b[0m     \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    407\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/requests/models.py:1024\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1023\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1024\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
      "\u001b[0;31mHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mHfHubHTTPError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[253], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_hub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m login, create_repo, Repository\n\u001b[1;32m      3\u001b[0m repo_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mta4tsering/NLP-Unigram_language_model_tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m repo_url \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_repo\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprivate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m      5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository created: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m    112\u001b[0m     kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py:3525\u001b[0m, in \u001b[0;36mHfApi.create_repo\u001b[0;34m(self, repo_id, token, private, repo_type, exist_ok, resource_group_id, space_sdk, space_hardware, space_storage, space_sleep_time, space_secrets, space_variables)\u001b[0m\n\u001b[1;32m   3522\u001b[0m     \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m   3524\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3525\u001b[0m     \u001b[43mhf_raise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43mr\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3526\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m   3527\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m exist_ok \u001b[38;5;129;01mand\u001b[39;00m err\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m409\u001b[39m:\n\u001b[1;32m   3528\u001b[0m         \u001b[38;5;66;03m# Repo already exists and `exist_ok=True`\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:477\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m    473\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, message, response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m    475\u001b[0m \u001b[38;5;66;03m# Convert `HTTPError` into a `HfHubHTTPError` to display request information\u001b[39;00m\n\u001b[1;32m    476\u001b[0m \u001b[38;5;66;03m# as well (request id and/or server error message)\u001b[39;00m\n\u001b[0;32m--> 477\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, \u001b[38;5;28mstr\u001b[39m(e), response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
      "\u001b[0;31mHfHubHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import login, create_repo, Repository\n",
    "\n",
    "repo_name = \"ta4tsering/NLP-Unigram_language_model_tokenizer\"\n",
    "repo_url = create_repo(repo_name, repo_type=\"model\", private=False)\n",
    "print(f\"Repository created: {repo_url}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install --upgrade huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:131: FutureWarning: 'Repository' (from 'huggingface_hub.repository') is deprecated and will be removed from version '1.0'. Please prefer the http-based alternatives instead. Given its large adoption in legacy code, the complete removal is only planned on next major release.\n",
      "For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.\n",
      "  warnings.warn(warning_message, FutureWarning)\n"
     ]
    },
    {
     "ename": "OSError",
     "evalue": "Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:592\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    591\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 592\u001b[0m     lfs_version \u001b[38;5;241m=\u001b[39m \u001b[43mrun_subprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit-lfs --version\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocal_dir\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m    593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_subprocess.py:83\u001b[0m, in \u001b[0;36mrun_subprocess\u001b[0;34m(command, folder, check, **kwargs)\u001b[0m\n\u001b[1;32m     81\u001b[0m     folder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(folder)\n\u001b[0;32m---> 83\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstderr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstdout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcheck\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     88\u001b[0m \u001b[43m    \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     89\u001b[0m \u001b[43m    \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreplace\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m  \u001b[49m\u001b[38;5;66;43;03m# if not utf-8, replace char by �\u001b[39;49;00m\n\u001b[1;32m     90\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcwd\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfolder\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetcwd\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     91\u001b[0m \u001b[43m    \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     92\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:503\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m    501\u001b[0m     kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstderr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m PIPE\n\u001b[0;32m--> 503\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpopenargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m    504\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:971\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize)\u001b[0m\n\u001b[1;32m    968\u001b[0m             \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mTextIOWrapper(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr,\n\u001b[1;32m    969\u001b[0m                     encoding\u001b[38;5;241m=\u001b[39mencoding, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m--> 971\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_child\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexecutable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpreexec_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclose_fds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    972\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mpass_fds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    973\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mstartupinfo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreationflags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    974\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mp2cread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mp2cwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    975\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mc2pread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mc2pwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    976\u001b[0m \u001b[43m                        \u001b[49m\u001b[43merrread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    977\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mrestore_signals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    978\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mgid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mumask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    979\u001b[0m \u001b[43m                        \u001b[49m\u001b[43mstart_new_session\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    980\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m    981\u001b[0m     \u001b[38;5;66;03m# Cleanup if the child failed starting.\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:1863\u001b[0m, in \u001b[0;36mPopen._execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)\u001b[0m\n\u001b[1;32m   1862\u001b[0m         err_msg \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mstrerror(errno_num)\n\u001b[0;32m-> 1863\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(errno_num, err_msg, err_filename)\n\u001b[1;32m   1864\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(err_msg)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'git-lfs'",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[256], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/home\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m repo \u001b[38;5;241m=\u001b[39m \u001b[43mRepository\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclone_from\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_url\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m    112\u001b[0m     kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:132\u001b[0m, in \u001b[0;36m_deprecate_method.<locals>._inner_deprecate_method.<locals>.inner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    130\u001b[0m     warning_message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m message\n\u001b[1;32m    131\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(warning_message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:522\u001b[0m, in \u001b[0;36mRepository.__init__\u001b[0;34m(self, local_dir, clone_from, repo_type, token, git_user, git_email, revision, skip_lfs_files, client)\u001b[0m\n\u001b[1;32m    519\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mskip_lfs_files \u001b[38;5;241m=\u001b[39m skip_lfs_files\n\u001b[1;32m    520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient \u001b[38;5;241m=\u001b[39m client \u001b[38;5;28;01mif\u001b[39;00m client \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m HfApi()\n\u001b[0;32m--> 522\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_git_versions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m    525\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhuggingface_token: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m token\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:594\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    592\u001b[0m     lfs_version \u001b[38;5;241m=\u001b[39m run_subprocess(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgit-lfs --version\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlocal_dir)\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m    593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n\u001b[0;32m--> 594\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m    595\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLooks like you do not have git-lfs installed, please install.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    596\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m You can install from https://git-lfs.github.com/.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    597\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Then run `git lfs install` (you only have to do this once).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    598\u001b[0m     )\n\u001b[1;32m    599\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(git_version \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m lfs_version)\n",
      "\u001b[0;31mOSError\u001b[0m: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once)."
     ]
    }
   ],
   "source": [
    "repo_path = \"/home\"\n",
    "repo = Repository(local_dir=repo_path, clone_from=repo_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from huggingface_hub import upload_file\n",
    "\n",
    "# Define the local folder and repo_id\n",
    "folder_path = \"/home/NLP-Unigram_language_model_tokenizer/\"  # Local folder path (should match your repo_id)\n",
    "repo_id = \"ta4tsering/NLP-Unigram_language_model_tokenizer\"  # Replace with your Hugging Face repo ID\n",
    "\n",
    "# Iterate through all files in the folder\n",
    "for root, _, files in os.walk(folder_path):\n",
    "    for file_name in files:\n",
    "        local_file_path = os.path.join(root, file_name)\n",
    "        repo_file_path = os.path.relpath(local_file_path, folder_path)  # Keep folder structure\n",
    "\n",
    "        # Upload file to the repo\n",
    "        upload_file(\n",
    "            path_or_fileobj=local_file_path,\n",
    "            path_in_repo=repo_file_path,\n",
    "            repo_id=repo_id,\n",
    "            repo_type=\"model\",\n",
    "            commit_message=f\"Add {repo_file_path}\",\n",
    "        )\n",
    "        print(f\"Uploaded {repo_file_path}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "git: 'lfs' is not a git command. See 'git --help'.\n",
      "\n",
      "The most similar command is\n",
      "\tlog\n"
     ]
    }
   ],
   "source": [
    "!git lfs install"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "repo.push_to_hub(commit_message=\"Initial commit\")\n",
    "print(\"Files pushed to Hugging Face!\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Train your tokenizer",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}