Add tokenizer_training.ipynb
tokenizer_training.ipynb +992 -0
tokenizer_training.ipynb
ADDED
@@ -0,0 +1,992 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "MOsHUjgdIrIW",
    "outputId": "f84a093e-147f-470e-aad9-80fb51193c8e",
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: datasets in /opt/conda/lib/python3.10/site-packages (3.2.0)\n",
      "Requirement already satisfied: transformers[sentencepiece] in /opt/conda/lib/python3.10/site-packages (4.48.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from datasets) (3.13.1)\n",
      "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.26.3)\n",
      "Requirement already satisfied: pyarrow>=15.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (19.0.0)\n",
      "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.3.8)\n",
      "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.2.3)\n",
      "Requirement already satisfied: requests>=2.32.2 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.66.3 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.67.1)\n",
      "Requirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets) (3.5.0)\n",
      "Requirement already satisfied: multiprocess<0.70.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.70.16)\n",
      "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2023.12.2)\n",
      "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets) (3.11.11)\n",
      "Requirement already satisfied: huggingface-hub>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.27.1)\n",
      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0.1)\n",
      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (2024.11.6)\n",
      "Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.21.0)\n",
      "Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.5.2)\n",
      "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.2.0)\n",
      "Requirement already satisfied: protobuf in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (5.29.3)\n",
      "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (2.4.4)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.2)\n",
      "Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (5.0.1)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (23.1.0)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.5.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (6.1.0)\n",
      "Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (0.2.1)\n",
      "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.18.3)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.23.0->datasets) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2023.11.17)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)\n",
      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3.post1)\n",
      "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2025.1)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sun Jan 26 12:49:45 2025 \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 550.76 Driver Version: 550.76 CUDA Version: 12.4 |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
      "| | | MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "| 0 NVIDIA GeForce RTX 4090 On | 00000000:E2:00.0 Off | Off |\n",
      "| 0% 31C P8 19W / 450W | 1MiB / 24564MiB | 0% Default |\n",
      "| | | N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      " \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes: |\n",
      "| GPU GI CI PID Type Process name GPU Memory |\n",
      "| ID ID Usage |\n",
      "|=========================================================================================|\n",
      "| No running processes found |\n",
      "+-----------------------------------------------------------------------------------------+\n"
     ]
    }
   ],
   "source": [
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HFASsisvIrIb"
   },
   "source": [
    "If you're opening this notebook locally, make sure your environment has the latest version of Datasets installed and a source install of Transformers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "!git config --global credential.helper store"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting a corpus"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will need texts to train our tokenizer. We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download our text data, which can be easily done with the `load_dataset` function:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5af05419ecdb43f9933ce463de99f18a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(\"openpecha/deduplication_combined_word_seg_data\", name=\"\", split=\"train\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "manual_dataset = dataset.filter(lambda x: x[\"filename\"] == \"manual_data.json\", num_proc=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': 'ད་ལྟར་བ་ཡོད་ཅི་ཞིག་མེད། །གང་གི་དུས་ཀུན་ཡོད་ཉིད་པ། །དེ་ཡི་མི་རྟག་ཉིད་གང་ལས། ། འདས་པ་ལས་ནི་འདས་གྱུར་པ། །ཅི་ཡི་ཕྱིར་ན་འདས་པར་འགྱུར། ། འདས་པ་ལས་ནི་མ་འདས་པ། །ཅི་ཡི་ཕྱིར་ན་འདས་པར་འགྱུར། །གལ་ཏེ་མ་འོངས་སྐྱེས་ཡོད་ན། །ཇི་ལྟར་ད་ལྟར་བར་མི་འགྱུར། །ཅི་སྟེ་དེ་ལ་སྐྱེ་མེད་ན། །མ་འོངས་རྟག་པར་འགྱུར་རམ་ཅི། ། སྐྱེ་བ་མེད་ཀྱང་འཇིག་པ་ལས། །གལ་ཏེ་མ་འོངས་མི་རྟག་ན། །འདས་ལ་འཇིག་པ་ཡོད་མིན་ཏེ། །དེ་ནི་རྟག་པར་ཅིས་མི་རྟོག། འདས་པ་དང་ནི་ད་ལྟར་བ། །འདི་ནི་མི་རྟག་འགྱུར་མིན་ལ། །',\n",
       " 'target': 'ད་ལྟར་ བ་ ཡོད་ ཅི་ཞིག་ མེད ། ། གང་ གི་ དུས་ ཀུན་ ཡོད་ ཉིད་པ ། ། དེ་ ཡི་ མི་ རྟག་ ཉིད་ གང་ ལས ། ། འདས་པ་ ལས་ ནི་ འདས་ གྱུར་པ ། ། ཅི་ ཡི་ ཕྱིར་ ན་ འདས་པ ར་ འགྱུར ། ། འདས་པ་ ལས་ ནི་ མ་ འདས་པ ། ། ཅི་ ཡི་ ཕྱིར་ ན་ འདས་པ ར་ འགྱུར ། ། གལ་ཏེ་ མ་འོངས་ སྐྱེས་ ཡོད་ ན ། ། ཇི་ལྟར་ ད་ལྟར་བ ར་མི་ འགྱུར ། ། ཅི་སྟེ་ དེ་ ལ་ སྐྱེ་ མེད་ ན ། ། མ་འོངས་ རྟག་པ ར་ འགྱུར་ རམ་ ཅི ། ། སྐྱེ་བ་ མེད་ ཀྱང་ འཇིག་པ་ ལས ། ། གལ་ཏེ་ མ་འོངས་ མི་ རྟག་ ན ། ། འདས་ ལ་ འཇིག་པ་ ཡོད་ མིན་ ཏེ ། ། དེ་ ནི་ རྟག་པ ར་ ཅི ས་ མི་ རྟོག ། འདས་པ་ དང་ ནི་ ད་ལྟ ར ་བ ། ། འདི་ ནི་ མི་ རྟག་ འགྱུར་ མིན་ ལ ། །',\n",
       " 'filename': 'manual_data.json'}"
      ]
     },
     "execution_count": 221,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "manual_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20278"
      ]
     },
     "execution_count": 232,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(manual_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [],
   "source": [
    "remaining_dataset = dataset.filter(lambda x: x[\"filename\"] != \"manual_data.json\", num_proc=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': ['གཙོ་མོའི་མགྲིན་པར་ཨྃ་དམར་པོ་འབར་བ་ལ་སེམས་གཟུང་༔'],\n",
       " 'target': ['གཙོ་ མོ འི་ མགྲིན་པ ར་ ཨྃ་ དམར་པོ་ འབར་བ་ ལ་ སེམས་ གཟུང་ ༔'],\n",
       " 'filename': ['UT3JT13384-005-0028.txt']}"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "remaining_dataset[9:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Unigram model like Albert"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's now have a look at how we can create a Unigram tokenizer like the one used for training ALBERT. The first step is to create a `Tokenizer` with an empty `Unigram` model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: tokenizers in /opt/conda/lib/python3.10/site-packages (0.21.0)\n",
      "Requirement already satisfied: icecream in /opt/conda/lib/python3.10/site-packages (2.1.4)\n",
      "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/conda/lib/python3.10/site-packages (from tokenizers) (0.27.1)\n",
      "Requirement already satisfied: colorama>=0.3.9 in /opt/conda/lib/python3.10/site-packages (from icecream) (0.4.6)\n",
      "Requirement already satisfied: pygments>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.15.1)\n",
      "Requirement already satisfied: executing>=2.1.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.2.0)\n",
      "Requirement already satisfied: asttokens>=2.0.1 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.0.5)\n",
      "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from asttokens>=2.0.1->icecream) (1.16.0)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.12.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install tokenizers icecream"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import Tokenizer, decoders\n",
    "from tokenizers.models import Unigram\n",
    "from tokenizers import pre_tokenizers\n",
    "from tokenizers.pre_tokenizers import WhitespaceSplit\n",
    "from tokenizers import trainers\n",
    "from icecream import ic\n",
    "\n",
    "tokenizer = Tokenizer(Unigram())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batch_iterator(dataset, batch_size=1000):\n",
    "    # batch_size was previously undefined; a default keeps the call below working.\n",
    "    for i in range(0, len(dataset), batch_size):\n",
    "        yield dataset[i : i + batch_size][\"target\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next we pick a vocabulary size and attach a `WhitespaceSplit` pre-tokenizer, since the training corpus is already word-segmented with spaces. If we want to have a quick look at how it preprocesses the inputs, we can call its `pre_tokenize_str` method:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vocab_count=32000\n",
    "tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()"
   ]
  },
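  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (an added sketch, not part of the original run), the pre-tokenizer should split a segmented line from the corpus on whitespace and return `(token, offsets)` pairs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch: inspect how WhitespaceSplit pre-tokenizes one segmented line.\n",
    "tokenizer.pre_tokenizer.pre_tokenize_str(\"རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔\")"
   ]
  },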
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = trainers.UnigramTrainer(vocab_size=vocab_count, special_tokens=[\"[CLS]\", \"[SEP]\", \"<unk>\", \"<pad>\", \"[MASK]\"], unk_token=\"<unk>\")\n",
    "tokenizer.train_from_iterator(batch_iterator(manual_dataset), trainer=trainer)"
   ]
  },
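  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After training it is worth confirming that the learned vocabulary reached the requested size (an added check using the standard `get_vocab_size` accessor from 🤗 Tokenizers):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added check: the trained vocabulary size should be close to vocab_count.\n",
    "tokenizer.get_vocab_size()"
   ]
  },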
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(f\"./trained_tokenizer_{vocab_count}.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the saved tokenizer\n",
    "tokenizer = Tokenizer.from_file(f\"./trained_tokenizer_{vocab_count}.json\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n",
    "sep_token_id = tokenizer.token_to_id(\"[SEP]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import Tokenizer, models, processors, decoders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.post_processor = processors.TemplateProcessing(\n",
    "    single=\"[CLS]:0 $A:0 [SEP]:0\",\n",
    "    pair=\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n",
    "    special_tokens=[\n",
    "        (\"[CLS]\", cls_token_id),\n",
    "        (\"[SEP]\", sep_token_id),\n",
    "    ],\n",
    ")\n",
    "tokenizer.decoder = decoders.CTC()"
   ]
  },
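  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To verify the template (an added sketch, not executed in the original session), encoding a single segmented line should now be wrapped in `[CLS]` and `[SEP]` markers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch: the post-processor should add [CLS]/[SEP] around one sequence.\n",
    "encoding = tokenizer.encode(\"རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔\")\n",
    "print(encoding.tokens[0], encoding.tokens[-1])  # expect [CLS] and [SEP]"
   ]
  },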
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer_8000 = Tokenizer.from_file(\"./trained_tokenizer_8000.json\")\n",
    "tokenizer_16000 = Tokenizer.from_file(\"./trained_tokenizer_16000.json\")\n",
    "tokenizer_32000 = Tokenizer.from_file(\"./trained_tokenizer_32000.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AlbertTokenizerFast\n",
    "\n",
    "tokenizer_8000 = AlbertTokenizerFast(tokenizer_object=tokenizer_8000)\n",
    "tokenizer_16000 = AlbertTokenizerFast(tokenizer_object=tokenizer_16000)\n",
    "tokenizer_32000 = AlbertTokenizerFast(tokenizer_object=tokenizer_32000)\n",
    "# tokenizer_64000 = AlbertTokenizerFast(tokenizer_object=tokenizer_64000)"
   ]
  },
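  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To reuse the wrapped tokenizers with 🤗 Transformers later, they can be serialized with `save_pretrained` (an added sketch; the directory names below are illustrative, not from the original run):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch: persist each wrapped tokenizer in transformers format.\n",
    "# Directory names are placeholders.\n",
    "tokenizer_8000.save_pretrained(\"./albert_tokenizer_8000\")\n",
    "tokenizer_16000.save_pretrained(\"./albert_tokenizer_16000\")\n",
    "tokenizer_32000.save_pretrained(\"./albert_tokenizer_32000\")"
   ]
  },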
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔',\n",
       " 'target': 'རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔',\n",
       " 'filename': 'UT3JT13384-005-0028.txt'}"
      ]
     },
     "execution_count": 237,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "remaining_dataset[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ic| data[\"source\"]: 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔'\n",
      "ic| tokenized_data_8000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
      "ic| tokenized_data_16000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
      "ic| tokenized_data_32000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
      "ic| data[\"source\"]: 'ཉམས་སྐྱེས་པ་ན་རླུང་སེམས་དྲག་ཏུ་གཅུན༔'\n",
      "ic| tokenized_data_8000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུ', 'ན', '༔']\n",
      "ic| tokenized_data_16000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n",
      "ic| tokenized_data_32000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n"
     ]
    }
   ],
   "source": [
    "for index in range(10, len(remaining_dataset)):\n",
    "    data = remaining_dataset[index]\n",
    "    if index == 12:\n",
    "        break\n",
    "    ic(data[\"source\"])\n",
    "    tokenized_data_8000 = tokenizer_8000.tokenize(data[\"source\"])\n",
    "    ic(tokenized_data_8000)\n",
    "    tokenized_data_16000 = tokenizer_16000.tokenize(data[\"source\"])\n",
    "    ic(tokenized_data_16000)\n",
    "    tokenized_data_32000 = tokenizer_32000.tokenize(data[\"source\"])\n",
    "    ic(tokenized_data_32000)\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ic| data: '༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།'\n",
      "ic| tokenized_data_8000: ['༸གོང་ས་',\n",
      " 'མཆོག་',\n",
      " 'གི་',\n",
      " 'བོད་',\n",
      " 'དོན་',\n",
      " 'འཐབ་རྩོད་',\n",
      " 'དང་',\n",
      " 'འབྲེལ་བ',\n",
      " 'འི་',\n",
      " 'ཕྱག་',\n",
      " 'དེབ་',\n",
      " 'གསར་པ་',\n",
      " 'ཞིག་',\n",
      " 'ཕྱི་',\n",
      " 'ཟླ་',\n",
      " 'གསུམ་པ',\n",
      " 'འི་',\n",
      " 'ནང་',\n",
      " 'འདོན་',\n",
      " 'སྤེལ་',\n",
      " 'གནང་',\n",
      " 'རྒྱུ',\n",
      " '།']\n",
      "ic| tokenized_data_16000: ['༸གོང་ས་',\n",
      " 'མཆོག་',\n",
      " 'གི་',\n",
      " 'བོད་',\n",
      " 'དོན་',\n",
      " 'འཐབ་རྩོད་',\n",
      " 'དང་',\n",
      " 'འབྲེལ་བ',\n",
      " 'འི་',\n",
      " 'ཕྱག་',\n",
      " 'དེབ་',\n",
      " 'གསར་པ་',\n",
      " 'ཞིག་',\n",
      " 'ཕྱི་ཟླ',\n",
      " '་',\n",
      " 'གསུམ་པ',\n",
      " 'འི་',\n",
      " 'ནང་',\n",
      " 'འདོན་',\n",
      " 'སྤེལ་',\n",
      " 'གནང་',\n",
      " 'རྒྱུ',\n",
      " '།']\n",
      "ic| tokenized_data_32000: ['༸གོང་ས་',\n",
      " 'མཆོག་',\n",
      " 'གི་',\n",
      " 'བོད་',\n",
      " 'དོན་',\n",
      " 'འཐབ་རྩོད་',\n",
      " 'དང་',\n",
      " 'འབྲེལ་བ',\n",
      " 'འི་',\n",
      " 'ཕྱག་',\n",
      " 'དེབ་',\n",
      " 'གསར་པ་',\n",
      " 'ཞིག་',\n",
      " 'ཕྱི་ཟླ',\n",
      " '་',\n",
      " 'གསུམ་པ',\n",
      " 'འི་',\n",
      " 'ནང་',\n",
      " 'འདོན་',\n",
      " 'སྤེལ་',\n",
      " 'གནང་',\n",
      " 'རྒྱུ',\n",
      " '།']\n",
      "ic| tokenizer_8000.encode(data): [0,\n",
      " 2163,\n",
      " 152,\n",
      " 25,\n",
      " 201,\n",
      " 47,\n",
      " 3426,\n",
      " 9,\n",
      " 662,\n",
      " 7,\n",
      " 267,\n",
      " 1522,\n",
      " 2426,\n",
      " 59,\n",
      " 256,\n",
      " 636,\n",
      " 348,\n",
      " 7,\n",
      " 85,\n",
      " 1067,\n",
      " 1238,\n",
      " 717,\n",
      " 246,\n",
      " 5,\n",
      " 1]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0,\n",
       " 2163,\n",
       " 152,\n",
       " 25,\n",
       " 201,\n",
       " 47,\n",
       " 3426,\n",
       " 9,\n",
       " 662,\n",
       " 7,\n",
       " 267,\n",
       " 1522,\n",
       " 2426,\n",
       " 59,\n",
       " 256,\n",
       " 636,\n",
       " 348,\n",
       " 7,\n",
       " 85,\n",
       " 1067,\n",
       " 1238,\n",
       " 717,\n",
       " 246,\n",
       " 5,\n",
       " 1]"
      ]
     },
     "execution_count": 240,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = \"༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།\"\n",
    "ic(data)\n",
    "tokenized_data_8000 = tokenizer_8000.tokenize(data)\n",
    "ic(tokenized_data_8000)\n",
    "tokenized_data_16000 = tokenizer_16000.tokenize(data)\n",
    "ic(tokenized_data_16000)\n",
    "tokenized_data_32000 = tokenizer_32000.tokenize(data)\n",
    "ic(tokenized_data_32000)\n",
    "ic(tokenizer_8000.encode(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenized_data_16000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenized_data_32000"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use your new tokenizer to train a language model!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can either use your new tokenizer in the language modeling from scratch notebook [Link to come] or pass it via the `--tokenizer_name` argument of the [language modeling scripts](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) to train a model from scratch."
   ]
  },
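  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a sketch of the second option (the data and output paths below are placeholders, not from the original notebook), a run of the `run_mlm.py` example script could point `--tokenizer_name` at a saved tokenizer directory:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch: illustrative invocation of the transformers MLM example script.\n",
    "# train_file and output_dir are placeholders.\n",
    "!python run_mlm.py \\\n",
    "    --model_type albert \\\n",
    "    --tokenizer_name ./albert_tokenizer_32000 \\\n",
    "    --train_file ./corpus.txt \\\n",
    "    --do_train \\\n",
    "    --output_dir ./albert-from-scratch"
   ]
  },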
  {
   "cell_type": "code",
   "execution_count": 241,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a22f05e792ee40b79bf097340ae38a2a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "notebook_login()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [
    {
     "ename": "HfHubHTTPError",
     "evalue": "409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mHTTPError\u001b[0m Traceback (most recent call last)",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:406\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 406\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/requests/models.py:1024\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1024\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
      "\u001b[0;31mHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mHfHubHTTPError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[253], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_hub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m login, create_repo, Repository\n\u001b[1;32m 3\u001b[0m repo_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mta4tsering/NLP-Unigram_language_model_tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m repo_url \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_repo\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprivate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository created: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py:3525\u001b[0m, in \u001b[0;36mHfApi.create_repo\u001b[0;34m(self, repo_id, token, private, repo_type, exist_ok, resource_group_id, space_sdk, space_hardware, space_storage, space_sleep_time, space_secrets, space_variables)\u001b[0m\n\u001b[1;32m 3522\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 3524\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3525\u001b[0m \u001b[43mhf_raise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43mr\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3526\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 3527\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m exist_ok \u001b[38;5;129;01mand\u001b[39;00m err\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m409\u001b[39m:\n\u001b[1;32m 3528\u001b[0m \u001b[38;5;66;03m# Repo already exists and `exist_ok=True`\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:477\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, message, response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 475\u001b[0m \u001b[38;5;66;03m# Convert `HTTPError` into a `HfHubHTTPError` to display request information\u001b[39;00m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;66;03m# as well (request id and/or server error message)\u001b[39;00m\n\u001b[0;32m--> 477\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, \u001b[38;5;28mstr\u001b[39m(e), response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
      "\u001b[0;31mHfHubHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import login, create_repo, Repository\n",
    "\n",
    "repo_name = \"ta4tsering/NLP-Unigram_language_model_tokenizer\"\n",
    "repo_url = create_repo(repo_name, repo_type=\"model\", private=False)\n",
    "print(f\"Repository created: {repo_url}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n",
      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n",
      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n",
      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n",
      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n",
      "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
     ]
    }
   ],
   "source": [
    "!pip install --upgrade huggingface_hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:131: FutureWarning: 'Repository' (from 'huggingface_hub.repository') is deprecated and will be removed from version '1.0'. Please prefer the http-based alternatives instead. Given its large adoption in legacy code, the complete removal is only planned on next major release.\n",
      "For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.\n",
      " warnings.warn(warning_message, FutureWarning)\n"
     ]
    },
    {
     "ename": "OSError",
     "evalue": "Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:592\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 592\u001b[0m lfs_version \u001b[38;5;241m=\u001b[39m \u001b[43mrun_subprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit-lfs --version\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocal_dir\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_subprocess.py:83\u001b[0m, in \u001b[0;36mrun_subprocess\u001b[0;34m(command, folder, check, **kwargs)\u001b[0m\n\u001b[1;32m 81\u001b[0m folder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(folder)\n\u001b[0;32m---> 83\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mstderr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43mstdout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 89\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreplace\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# if not utf-8, replace char by �\u001b[39;49;00m\n\u001b[1;32m 90\u001b[0m \u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfolder\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetcwd\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 91\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 92\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:503\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 501\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstderr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m PIPE\n\u001b[0;32m--> 503\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpopenargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:971\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize)\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mTextIOWrapper(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr,\n\u001b[1;32m 969\u001b[0m encoding\u001b[38;5;241m=\u001b[39mencoding, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m--> 971\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_child\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexecutable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpreexec_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclose_fds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mpass_fds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartupinfo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreationflags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 974\u001b[0m \u001b[43m \u001b[49m\u001b[43mp2cread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mp2cwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[43mc2pread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mc2pwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43merrread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43mrestore_signals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mgid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mumask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mstart_new_session\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 980\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 981\u001b[0m \u001b[38;5;66;03m# Cleanup if the child failed starting.\u001b[39;00m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:1863\u001b[0m, in \u001b[0;36mPopen._execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)\u001b[0m\n\u001b[1;32m 1862\u001b[0m err_msg \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mstrerror(errno_num)\n\u001b[0;32m-> 1863\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(errno_num, err_msg, err_filename)\n\u001b[1;32m 1864\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(err_msg)\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'git-lfs'",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[256], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/home\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m repo \u001b[38;5;241m=\u001b[39m \u001b[43mRepository\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclone_from\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_url\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:132\u001b[0m, in \u001b[0;36m_deprecate_method.<locals>._inner_deprecate_method.<locals>.inner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 130\u001b[0m warning_message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m message\n\u001b[1;32m 131\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(warning_message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:522\u001b[0m, in \u001b[0;36mRepository.__init__\u001b[0;34m(self, local_dir, clone_from, repo_type, token, git_user, git_email, revision, skip_lfs_files, client)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mskip_lfs_files \u001b[38;5;241m=\u001b[39m skip_lfs_files\n\u001b[1;32m 520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient \u001b[38;5;241m=\u001b[39m client \u001b[38;5;28;01mif\u001b[39;00m client \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m HfApi()\n\u001b[0;32m--> 522\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_git_versions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhuggingface_token: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m token\n",
      "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:594\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 592\u001b[0m lfs_version \u001b[38;5;241m=\u001b[39m run_subprocess(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgit-lfs --version\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlocal_dir)\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n\u001b[0;32m--> 594\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 595\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLooks like you do not have git-lfs installed, please install.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 596\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m You can install from https://git-lfs.github.com/.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Then run `git lfs install` (you only have to do this once).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 598\u001b[0m )\n\u001b[1;32m 599\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(git_version \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m lfs_version)\n",
      "\u001b[0;31mOSError\u001b[0m: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once)."
     ]
    }
   ],
   "source": [
    "repo_path = \"/home\"\n",
    "repo = Repository(local_dir=repo_path, clone_from=repo_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from huggingface_hub import upload_file\n",
    "\n",
    "# Define the local folder and repo_id\n",
    "folder_path = \"/home/NLP-Unigram_language_model_tokenizer/\"  # Local folder path (should match your repo_id)\n",
    "repo_id = \"ta4tsering/NLP-Unigram_language_model_tokenizer\"  # Replace with your Hugging Face repo ID\n",
    "\n",
    "# Iterate through all files in the folder\n",
    "for root, _, files in os.walk(folder_path):\n",
    "    for file_name in files:\n",
    "        local_file_path = os.path.join(root, file_name)\n",
    "        repo_file_path = os.path.relpath(local_file_path, folder_path)  # Keep folder structure\n",
    "\n",
    "        # Upload file to the repo\n",
    "        upload_file(\n",
    "            path_or_fileobj=local_file_path,\n",
    "            path_in_repo=repo_file_path,\n",
    "            repo_id=repo_id,\n",
    "            repo_type=\"model\",\n",
    "            commit_message=f\"Add {repo_file_path}\",\n",
    "        )\n",
    "        print(f\"Uploaded {repo_file_path}\")\n"
   ]
  },
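  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since `Repository` is deprecated, the whole folder can also be pushed in one call with the HTTP-based `upload_folder` helper (an added sketch reusing the same `folder_path` and `repo_id` as above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import upload_folder\n",
    "\n",
    "# Added sketch: HTTP-based alternative to the per-file loop above.\n",
    "upload_folder(\n",
    "    folder_path=folder_path,\n",
    "    repo_id=repo_id,\n",
    "    repo_type=\"model\",\n",
    "    commit_message=\"Upload tokenizer files\",\n",
    ")"
   ]
  },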
  {
   "cell_type": "code",
   "execution_count": 257,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "git: 'lfs' is not a git command. See 'git --help'.\n",
      "\n",
      "The most similar command is\n",
      "\tlog\n"
     ]
    }
   ],
   "source": [
    "!git lfs install"
   ]
  },
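  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The error above means the `git-lfs` binary itself is missing from the container; on Debian-based images it can usually be installed with apt (an added note, not run in the original session):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sketch: install the missing git-lfs binary on a Debian/Ubuntu image.\n",
    "!apt-get update && apt-get install -y git-lfs\n",
    "!git lfs install"
   ]
  },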
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "repo.push_to_hub(commit_message=\"Initial commit\")\n",
    "print(\"Files pushed to Hugging Face!\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Train your tokenizer",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}