ta4tsering committed on
Commit 78a33e3 · verified · 1 Parent(s): 2e9f8a9

Add tokenizer_training.ipynb

Files changed (1)
  1. tokenizer_training.ipynb +992 -0
tokenizer_training.ipynb ADDED
@@ -0,0 +1,992 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 213,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/",
9
+ "height": 1000
10
+ },
11
+ "id": "MOsHUjgdIrIW",
12
+ "outputId": "f84a093e-147f-470e-aad9-80fb51193c8e",
13
+ "scrolled": true
14
+ },
15
+ "outputs": [
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "Requirement already satisfied: datasets in /opt/conda/lib/python3.10/site-packages (3.2.0)\n",
21
+ "Requirement already satisfied: transformers[sentencepiece] in /opt/conda/lib/python3.10/site-packages (4.48.1)\n",
22
+ "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from datasets) (3.13.1)\n",
23
+ "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.26.3)\n",
24
+ "Requirement already satisfied: pyarrow>=15.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (19.0.0)\n",
25
+ "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.3.8)\n",
26
+ "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.2.3)\n",
27
+ "Requirement already satisfied: requests>=2.32.2 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.32.3)\n",
28
+ "Requirement already satisfied: tqdm>=4.66.3 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.67.1)\n",
29
+ "Requirement already satisfied: xxhash in /opt/conda/lib/python3.10/site-packages (from datasets) (3.5.0)\n",
30
+ "Requirement already satisfied: multiprocess<0.70.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.70.16)\n",
31
+ "Requirement already satisfied: fsspec<=2024.9.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets) (2023.12.2)\n",
32
+ "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from datasets) (3.11.11)\n",
33
+ "Requirement already satisfied: huggingface-hub>=0.23.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.27.1)\n",
34
+ "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (23.1)\n",
35
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0.1)\n",
36
+ "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (2024.11.6)\n",
37
+ "Requirement already satisfied: tokenizers<0.22,>=0.21 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.21.0)\n",
38
+ "Requirement already satisfied: safetensors>=0.4.1 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.5.2)\n",
39
+ "Requirement already satisfied: sentencepiece!=0.1.92,>=0.1.91 in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (0.2.0)\n",
40
+ "Requirement already satisfied: protobuf in /opt/conda/lib/python3.10/site-packages (from transformers[sentencepiece]) (5.29.3)\n",
41
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (2.4.4)\n",
42
+ "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.2)\n",
43
+ "Requirement already satisfied: async-timeout<6.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (5.0.1)\n",
44
+ "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (23.1.0)\n",
45
+ "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.5.0)\n",
46
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (6.1.0)\n",
47
+ "Requirement already satisfied: propcache>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (0.2.1)\n",
48
+ "Requirement already satisfied: yarl<2.0,>=1.17.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (1.18.3)\n",
49
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.23.0->datasets) (4.9.0)\n",
50
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2.0.4)\n",
51
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.4)\n",
52
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (1.26.18)\n",
53
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2023.11.17)\n",
54
+ "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)\n",
55
+ "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3.post1)\n",
56
+ "Requirement already satisfied: tzdata>=2022.7 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2025.1)\n",
57
+ "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
58
+ "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "!pip install datasets transformers[sentencepiece]"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 214,
69
+ "metadata": {
70
+ "scrolled": true
71
+ },
72
+ "outputs": [
73
+ {
74
+ "name": "stdout",
75
+ "output_type": "stream",
76
+ "text": [
77
+ "Sun Jan 26 12:49:45 2025 \n",
78
+ "+-----------------------------------------------------------------------------------------+\n",
79
+ "| NVIDIA-SMI 550.76 Driver Version: 550.76 CUDA Version: 12.4 |\n",
80
+ "|-----------------------------------------+------------------------+----------------------+\n",
81
+ "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
82
+ "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
83
+ "| | | MIG M. |\n",
84
+ "|=========================================+========================+======================|\n",
85
+ "| 0 NVIDIA GeForce RTX 4090 On | 00000000:E2:00.0 Off | Off |\n",
86
+ "| 0% 31C P8 19W / 450W | 1MiB / 24564MiB | 0% Default |\n",
87
+ "| | | N/A |\n",
88
+ "+-----------------------------------------+------------------------+----------------------+\n",
89
+ " \n",
90
+ "+-----------------------------------------------------------------------------------------+\n",
91
+ "| Processes: |\n",
92
+ "| GPU GI CI PID Type Process name GPU Memory |\n",
93
+ "| ID ID Usage |\n",
94
+ "|=========================================================================================|\n",
95
+ "| No running processes found |\n",
96
+ "+-----------------------------------------------------------------------------------------+\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "!nvidia-smi"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "markdown",
106
+ "metadata": {
107
+ "id": "HFASsisvIrIb"
108
+ },
109
+ "source": [
110
+ "If you're opening this notebook locally, make sure your environment has the latest release of Datasets and a source install of Transformers."
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 215,
116
+ "metadata": {
117
+ "scrolled": true
118
+ },
119
+ "outputs": [
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n",
125
+ "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n",
126
+ "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n",
127
+ "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
128
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
129
+ "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n",
130
+ "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n",
131
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n",
132
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
133
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
134
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
135
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n",
136
+ "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
137
+ ]
138
+ }
139
+ ],
140
+ "source": [
141
+ "!pip install huggingface_hub"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": 216,
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "!git config --global credential.helper store"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "markdown",
155
+ "metadata": {},
156
+ "source": [
157
+ "## Getting a corpus"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "markdown",
162
+ "metadata": {},
163
+ "source": [
164
+ "We will need texts to train our tokenizer. We will use the [🤗 Datasets](https://github.com/huggingface/datasets) library to download our text data, which can be easily done with the `load_dataset` function:"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": 217,
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "from datasets import load_dataset"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": 218,
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "data": {
183
+ "application/vnd.jupyter.widget-view+json": {
184
+ "model_id": "5af05419ecdb43f9933ce463de99f18a",
185
+ "version_major": 2,
186
+ "version_minor": 0
187
+ },
188
+ "text/plain": [
189
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
190
+ ]
191
+ },
192
+ "metadata": {},
193
+ "output_type": "display_data"
194
+ }
195
+ ],
196
+ "source": [
197
+ "from huggingface_hub import notebook_login\n",
198
+ "\n",
199
+ "notebook_login()"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": 219,
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "dataset = load_dataset(\"openpecha/deduplication_combined_word_seg_data\", name=\"\", split=\"train\")"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 220,
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
+ "manual_dataset = dataset.filter(lambda x: x[\"filename\"] == \"manual_data.json\", num_proc=10)"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 221,
223
+ "metadata": {
224
+ "scrolled": true
225
+ },
226
+ "outputs": [
227
+ {
228
+ "data": {
229
+ "text/plain": [
230
+ "{'source': 'ད་ལྟར་བ་ཡོད་ཅི་ཞིག་མེད། །གང་གི་དུས་ཀུན་ཡོད་ཉིད་པ། །དེ་ཡི་མི་རྟག་ཉིད་གང་ལས། ། འདས་པ་ལས་ནི་འདས་གྱུར་པ། །ཅི་ཡི་ཕྱིར་ན་འདས་པར་འགྱུར། ། འདས་པ་ལས་ནི་མ་འདས་པ། །ཅི་ཡི་ཕྱིར་ན་འདས་པར་འགྱུར། །གལ་ཏེ་མ་འོངས་སྐྱེས་ཡོད་ན། །ཇི་ལྟར་ད་ལྟར་བར་མི་འགྱུར། །ཅི་སྟེ་དེ་ལ་སྐྱེ་མེད་ན། །མ་འོངས་རྟག་པར་འགྱུར་རམ་ཅི། ། སྐྱེ་བ་མེད་ཀྱང་འཇིག་པ་ལས། །གལ་ཏེ་མ་འོངས་མི་རྟག་ན། །འདས་ལ་འཇིག་པ་ཡོད་མིན་ཏེ། །དེ་ནི་རྟག་པར་ཅིས་མི་རྟོག། འདས་པ་དང་ནི་ད་ལྟར་བ། །འདི་ནི་མི་རྟག་འགྱུར་མིན་ལ། །',\n",
231
+ " 'target': 'ད་ལྟར་ བ་ ཡོད་ ཅི་ཞིག་ མེད ། ། གང་ གི་ དུས་ ཀུན་ ཡོད་ ཉིད་པ ། ། དེ་ ཡི་ མི་ རྟག་ ཉིད་ གང་ ལས ། ། འདས་པ་ ལས་ ནི་ འདས་ གྱུར་པ ། ། ཅི་ ཡི་ ཕྱིར་ ན་ འདས་པ ར་ འགྱུར ། ། འདས་པ་ ལས་ ནི་ མ་ འདས་པ ། ། ཅི་ ཡི་ ཕྱིར་ ན་ འདས་པ ར་ འགྱུར ། ། གལ་ཏེ་ མ་འོངས་ སྐྱེས་ ཡོད་ ན ། ། ཇི་ལྟར་ ད་ལྟར་བ ར་མི་ འགྱུར ། ། ཅི་སྟེ་ དེ་ ལ་ སྐྱེ་ མེད་ ན ། ། མ་འོངས་ རྟག་པ ར་ འགྱུར་ རམ་ ཅི ། ། སྐྱེ་བ་ མེད་ ཀྱང་ འཇིག་པ་ ལས ། ། གལ་ཏེ་ མ་འོངས་ མི་ རྟག་ ན ། ། འདས་ ལ་ འཇིག་པ་ ཡོད་ མིན་ ཏེ ། ། དེ་ ནི་ རྟག་པ ར་ ཅི ས་ མི་ རྟོག ། འདས་པ་ དང་ ནི་ ད་ལྟ ར ་བ ། ། འདི་ ནི་ མི་ རྟག་ འགྱུར་ མིན་ ལ ། །',\n",
232
+ " 'filename': 'manual_data.json'}"
233
+ ]
234
+ },
235
+ "execution_count": 221,
236
+ "metadata": {},
237
+ "output_type": "execute_result"
238
+ }
239
+ ],
240
+ "source": [
241
+ "manual_dataset[0]"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 232,
247
+ "metadata": {},
248
+ "outputs": [
249
+ {
250
+ "data": {
251
+ "text/plain": [
252
+ "20278"
253
+ ]
254
+ },
255
+ "execution_count": 232,
256
+ "metadata": {},
257
+ "output_type": "execute_result"
258
+ }
259
+ ],
260
+ "source": [
261
+ "len(manual_dataset)"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "code",
266
+ "execution_count": 222,
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "remaining_dataset = dataset.filter(lambda x: x[\"filename\"] != \"manual_data.json\", num_proc=10)"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": 223,
276
+ "metadata": {},
277
+ "outputs": [
278
+ {
279
+ "data": {
280
+ "text/plain": [
281
+ "{'source': ['གཙོ་མོའི་མགྲིན་པར་ཨྃ་དམར་པོ་འབར་བ་ལ་སེམས་གཟུང་༔'],\n",
282
+ " 'target': ['གཙོ་ མོ འི་ མགྲིན་པ ར་ ཨྃ་ དམར་པོ་ འབར་བ་ ལ་ སེམས་ གཟུང་ ༔'],\n",
283
+ " 'filename': ['UT3JT13384-005-0028.txt']}"
284
+ ]
285
+ },
286
+ "execution_count": 223,
287
+ "metadata": {},
288
+ "output_type": "execute_result"
289
+ }
290
+ ],
291
+ "source": [
292
+ "remaining_dataset[9:10]"
293
+ ]
294
+ },
295
+ {
296
+ "cell_type": "markdown",
297
+ "metadata": {},
298
+ "source": [
299
+ "### Unigram model like Albert"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "metadata": {},
305
+ "source": [
306
+ "Let's now have a look at how we can create a Unigram tokenizer like the one used for training ALBERT. The first step is to create a `Tokenizer` with an empty `Unigram` model:"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 224,
312
+ "metadata": {
313
+ "scrolled": true
314
+ },
315
+ "outputs": [
316
+ {
317
+ "name": "stdout",
318
+ "output_type": "stream",
319
+ "text": [
320
+ "Requirement already satisfied: tokenizers in /opt/conda/lib/python3.10/site-packages (0.21.0)\n",
321
+ "Requirement already satisfied: icecream in /opt/conda/lib/python3.10/site-packages (2.1.4)\n",
322
+ "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /opt/conda/lib/python3.10/site-packages (from tokenizers) (0.27.1)\n",
323
+ "Requirement already satisfied: colorama>=0.3.9 in /opt/conda/lib/python3.10/site-packages (from icecream) (0.4.6)\n",
324
+ "Requirement already satisfied: pygments>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.15.1)\n",
325
+ "Requirement already satisfied: executing>=2.1.0 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.2.0)\n",
326
+ "Requirement already satisfied: asttokens>=2.0.1 in /opt/conda/lib/python3.10/site-packages (from icecream) (2.0.5)\n",
327
+ "Requirement already satisfied: six in /opt/conda/lib/python3.10/site-packages (from asttokens>=2.0.1->icecream) (1.16.0)\n",
328
+ "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (3.13.1)\n",
329
+ "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.12.2)\n",
330
+ "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (23.1)\n",
331
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (6.0.1)\n",
332
+ "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (2.32.3)\n",
333
+ "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.67.1)\n",
334
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0,>=0.16.4->tokenizers) (4.9.0)\n",
335
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2.0.4)\n",
336
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (3.4)\n",
337
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (1.26.18)\n",
338
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub<1.0,>=0.16.4->tokenizers) (2023.11.17)\n",
339
+ "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
340
+ ]
341
+ }
342
+ ],
343
+ "source": [
344
+ "!pip install tokenizers icecream"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": 225,
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "from tokenizers import Tokenizer, decoders\n",
354
+ "from tokenizers.models import Unigram\n",
355
+ "from tokenizers import pre_tokenizers\n",
356
+ "from tokenizers.pre_tokenizers import WhitespaceSplit\n",
357
+ "from tokenizers import trainers\n",
358
+ "from icecream import ic\n",
359
+ "\n",
360
+ "tokenizer = Tokenizer(Unigram())"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": null,
366
+ "metadata": {},
367
+ "outputs": [],
368
+ "source": [
369
+ "batch_size = 1000  # assumed value: batch_size is not defined elsewhere in this notebook\n",
+ "def batch_iterator(dataset):\n",
370
+ " for i in range(0, len(dataset), batch_size):\n",
371
+ " yield dataset[i : i + batch_size][\"target\"]"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "markdown",
376
+ "metadata": {},
377
+ "source": [
378
+ "Next we choose a vocabulary size and set a `WhitespaceSplit` pre-tokenizer. If we want a quick look at how it preprocesses the inputs, we can call the `pre_tokenize_str` method:"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": null,
384
+ "metadata": {},
385
+ "outputs": [],
386
+ "source": [
387
+ "vocab_count=32000\n",
388
+ "tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()"
389
+ ]
390
+ },
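+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A minimal illustration (added as a sketch, not part of the original run): since `WhitespaceSplit` is the pre-tokenizer, `pre_tokenize_str` simply splits the already word-segmented `target` text on whitespace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Inspect the pre-tokenization of one segmented sentence from the manual data.\n",
+ "sample = manual_dataset[0][\"target\"]\n",
+ "tokenizer.pre_tokenizer.pre_tokenize_str(sample)[:10]"
+ ]
+ },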
391
+ {
392
+ "cell_type": "code",
393
+ "execution_count": null,
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "trainer = trainers.UnigramTrainer(vocab_size=vocab_count, special_tokens=[\"[CLS]\", \"[SEP]\", \"<unk>\", \"<pad>\", \"[MASK]\"], unk_token=\"<unk>\")\n",
398
+ "tokenizer.train_from_iterator(batch_iterator(manual_dataset), trainer=trainer)"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": null,
404
+ "metadata": {},
405
+ "outputs": [],
406
+ "source": [
407
+ "tokenizer.save(f\"./trained_tokenizer_{vocab_count}.json\")"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": null,
413
+ "metadata": {},
414
+ "outputs": [],
415
+ "source": [
416
+ "# Load the saved tokenizer\n",
417
+ "tokenizer = Tokenizer.from_file(f\"./trained_tokenizer_{vocab_count}.json\")\n"
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "metadata": {},
424
+ "outputs": [],
425
+ "source": [
426
+ "cls_token_id = tokenizer.token_to_id(\"[CLS]\")\n",
427
+ "sep_token_id = tokenizer.token_to_id(\"[SEP]\")"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 226,
433
+ "metadata": {},
434
+ "outputs": [],
435
+ "source": [
436
+ "from tokenizers import processors\n",
437
+ "from tokenizers import Tokenizer, models, processors, decoders"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": null,
443
+ "metadata": {},
444
+ "outputs": [],
445
+ "source": []
446
+ },
447
+ {
448
+ "cell_type": "code",
449
+ "execution_count": 234,
450
+ "metadata": {},
451
+ "outputs": [],
452
+ "source": [
453
+ "tokenizer.post_processor = processors.TemplateProcessing(\n",
454
+ " single=\"[CLS]:0 $A:0 [SEP]:0\",\n",
455
+ " pair=\"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1\",\n",
456
+ " special_tokens=[\n",
457
+ " (\"[CLS]\", cls_token_id),\n",
458
+ " (\"[SEP]\", sep_token_id),\n",
459
+ " ],\n",
460
+ ")\n",
461
+ "tokenizer.decoder = decoders.CTC()"
462
+ ]
463
+ },
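+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick sanity check (illustrative, using a sentence already shown from the dataset): with the template processor attached, every encoding should start with `[CLS]` and end with `[SEP]`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Encode one source sentence and look at the first and last tokens.\n",
+ "encoding = tokenizer.encode(remaining_dataset[10][\"source\"])\n",
+ "encoding.tokens[:3], encoding.tokens[-1]"
+ ]
+ },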
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": 235,
467
+ "metadata": {},
468
+ "outputs": [],
469
+ "source": [
470
+ "tokenizer_8000 = Tokenizer.from_file(f\"./trained_tokenizer_8000.json\")\n",
471
+ "tokenizer_16000 = Tokenizer.from_file(f\"./trained_tokenizer_16000.json\")\n",
472
+ "tokenizer_32000 = Tokenizer.from_file(f\"./trained_tokenizer_32000.json\")"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 236,
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "from transformers import AlbertTokenizerFast\n",
482
+ "\n",
483
+ "tokenizer_8000 = AlbertTokenizerFast(tokenizer_object=tokenizer_8000)\n",
484
+ "tokenizer_16000 = AlbertTokenizerFast(tokenizer_object=tokenizer_16000)\n",
485
+ "tokenizer_32000 = AlbertTokenizerFast(tokenizer_object=tokenizer_32000)\n",
486
+ "##tokenizer_64000 = AlbertTokenizerFast(tokenizer_object=tokenizer_64000)"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 237,
492
+ "metadata": {},
493
+ "outputs": [
494
+ {
495
+ "data": {
496
+ "text/plain": [
497
+ "{'source': 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔',\n",
498
+ " 'target': 'རྨི་ལམ་ ཡིན་ སྙམ་ དུ་ བསམ ༔',\n",
499
+ " 'filename': 'UT3JT13384-005-0028.txt'}"
500
+ ]
501
+ },
502
+ "execution_count": 237,
503
+ "metadata": {},
504
+ "output_type": "execute_result"
505
+ }
506
+ ],
507
+ "source": [
508
+ "remaining_dataset[10]"
509
+ ]
510
+ },
511
+ {
512
+ "cell_type": "code",
513
+ "execution_count": 231,
514
+ "metadata": {
515
+ "scrolled": true
516
+ },
517
+ "outputs": [
518
+ {
519
+ "name": "stderr",
520
+ "output_type": "stream",
521
+ "text": [
522
+ "ic| data[\"source\"]: 'རྨི་ལམ་ཡིན་སྙམ་དུ་བསམ༔'\n",
523
+ "ic| tokenized_data_8000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
524
+ "ic| tokenized_data_16000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
525
+ "ic| tokenized_data_32000: ['རྨི་ལམ་', 'ཡིན་', 'སྙམ་', 'དུ་', 'བསམ', '༔']\n",
526
+ "ic| data[\"source\"]: 'ཉམས་སྐྱེས་པ་ན་རླུང་སེམས་དྲག་ཏུ་གཅུན༔'\n",
527
+ "ic| tokenized_data_8000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུ', 'ན', '༔']\n",
528
+ "ic| tokenized_data_16000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n",
529
+ "ic| tokenized_data_32000: ['ཉམས་', 'སྐྱེས་པ་', 'ན', '་', 'རླུང་', 'སེམས་', 'དྲག་', 'ཏུ་', 'གཅུན', '༔']\n"
530
+ ]
531
+ }
532
+ ],
533
+ "source": [
534
+ "for index in range(10, len(remaining_dataset)):\n",
535
+ " data = remaining_dataset[index]\n",
536
+ " if index == 12:\n",
537
+ " break\n",
538
+ " ic(data[\"source\"])\n",
539
+ " tokenized_data_8000 = tokenizer_8000.tokenize(data[\"source\"])\n",
540
+ " ic(tokenized_data_8000)\n",
541
+ " tokenized_data_16000 = tokenizer_16000.tokenize(data[\"source\"])\n",
542
+ " ic(tokenized_data_16000)\n",
543
+ " tokenized_data_32000 = tokenizer_32000.tokenize(data[\"source\"])\n",
544
+ " ic(tokenized_data_32000)\n",
545
+ " \n"
546
+ ]
547
+ },
548
+ {
549
+ "cell_type": "code",
550
+ "execution_count": 240,
551
+ "metadata": {
552
+ "scrolled": true
553
+ },
554
+ "outputs": [
555
+ {
556
+ "name": "stderr",
557
+ "output_type": "stream",
558
+ "text": [
559
+ "ic| data: '༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།'\n",
560
+ "ic| tokenized_data_8000: ['༸གོང་ས་',\n",
561
+ " 'མཆོག་',\n",
562
+ " 'གི་',\n",
563
+ " 'བོད་',\n",
564
+ " 'དོན་',\n",
565
+ " 'འཐབ་རྩོད་',\n",
566
+ " 'དང་',\n",
567
+ " 'འབྲེལ་བ',\n",
568
+ " 'འི་',\n",
569
+ " 'ཕྱག་',\n",
570
+ " 'དེབ་',\n",
571
+ " 'གསར་པ་',\n",
572
+ " 'ཞིག་',\n",
573
+ " 'ཕྱི་',\n",
574
+ " 'ཟླ་',\n",
575
+ " 'གསུམ་པ',\n",
576
+ " 'འི་',\n",
577
+ " 'ནང་',\n",
578
+ " 'འདོན་',\n",
579
+ " 'སྤེལ་',\n",
580
+ " 'གནང་',\n",
581
+ " 'རྒྱུ',\n",
582
+ " '།']\n",
583
+ "ic| tokenized_data_16000: ['༸གོང་ས་',\n",
584
+ " 'མཆོག་',\n",
585
+ " 'གི་',\n",
586
+ " 'བོད་',\n",
587
+ " 'དོན་',\n",
588
+ " 'འཐབ་རྩོད་',\n",
589
+ " 'དང་',\n",
590
+ " 'འབྲེལ་བ',\n",
591
+ " 'འི་',\n",
592
+ " 'ཕྱག་',\n",
593
+ " 'དེབ་',\n",
594
+ " 'གསར་པ་',\n",
595
+ " 'ཞིག་',\n",
596
+ " 'ཕྱི་ཟླ',\n",
597
+ " '་',\n",
598
+ " 'གསུམ་པ',\n",
599
+ " 'འི་',\n",
600
+ " 'ནང་',\n",
601
+ " 'འདོན་',\n",
602
+ " 'སྤེལ་',\n",
603
+ " 'གནང་',\n",
604
+ " 'རྒྱུ',\n",
605
+ " '།']\n",
606
+ "ic| tokenized_data_32000: ['༸གོང་ས་',\n",
607
+ " 'མཆོག་',\n",
608
+ " 'གི་',\n",
609
+ " 'བོད་',\n",
610
+ " 'དོན་',\n",
611
+ " 'འཐབ་རྩོད་',\n",
612
+ " 'དང་',\n",
613
+ " 'འབྲེལ་བ',\n",
614
+ " 'འི་',\n",
615
+ " 'ཕྱག་',\n",
616
+ " 'དེབ་',\n",
617
+ " 'གསར་པ་',\n",
618
+ " 'ཞིག་',\n",
619
+ " 'ཕྱི་ཟླ',\n",
620
+ " '་',\n",
621
+ " 'གསུམ་པ',\n",
622
+ " 'འི་',\n",
623
+ " 'ནང་',\n",
624
+ " 'འདོན་',\n",
625
+ " 'སྤེལ་',\n",
626
+ " 'གནང་',\n",
627
+ " 'རྒྱུ',\n",
628
+ " '།']\n",
629
+ "ic| tokenizer_8000.encode(data): [0,\n",
630
+ " 2163,\n",
631
+ " 152,\n",
632
+ " 25,\n",
633
+ " 201,\n",
634
+ " 47,\n",
635
+ " 3426,\n",
636
+ " 9,\n",
637
+ " 662,\n",
638
+ " 7,\n",
639
+ " 267,\n",
640
+ " 1522,\n",
641
+ " 2426,\n",
642
+ " 59,\n",
643
+ " 256,\n",
644
+ " 636,\n",
645
+ " 348,\n",
646
+ " 7,\n",
647
+ " 85,\n",
648
+ " 1067,\n",
649
+ " 1238,\n",
650
+ " 717,\n",
651
+ " 246,\n",
652
+ " 5,\n",
653
+ " 1]\n"
654
+ ]
655
+ },
656
+ {
657
+ "data": {
658
+ "text/plain": [
659
+ "[0,\n",
660
+ " 2163,\n",
661
+ " 152,\n",
662
+ " 25,\n",
663
+ " 201,\n",
664
+ " 47,\n",
665
+ " 3426,\n",
666
+ " 9,\n",
667
+ " 662,\n",
668
+ " 7,\n",
669
+ " 267,\n",
670
+ " 1522,\n",
671
+ " 2426,\n",
672
+ " 59,\n",
673
+ " 256,\n",
674
+ " 636,\n",
675
+ " 348,\n",
676
+ " 7,\n",
677
+ " 85,\n",
678
+ " 1067,\n",
679
+ " 1238,\n",
680
+ " 717,\n",
681
+ " 246,\n",
682
+ " 5,\n",
683
+ " 1]"
684
+ ]
685
+ },
686
+ "execution_count": 240,
687
+ "metadata": {},
688
+ "output_type": "execute_result"
689
+ }
690
+ ],
691
+ "source": [
692
+ "data = \"༸གོང་ས་མཆོག་གི་བོད་དོན་འཐབ་རྩོད་དང་འབྲེལ་བའི་ཕྱག་དེབ་གསར་པ་ཞིག་ཕྱི་ཟླ་གསུམ་པའི་ནང་འདོན་སྤེལ་གནང་རྒྱུ།\"\n",
693
+ "ic(data) \n",
694
+ "tokenized_data_8000 = tokenizer_8000.tokenize(data)\n",
695
+ "ic(tokenized_data_8000)\n",
696
+ "tokenized_data_16000 = tokenizer_16000.tokenize(data)\n",
697
+ "ic(tokenized_data_16000)\n",
698
+ "tokenized_data_32000 = tokenizer_32000.tokenize(data)\n",
699
+ "ic(tokenized_data_32000)\n",
700
+ "ic(tokenizer_8000.encode(data))"
701
+ ]
702
+ },
703
+ {
704
+ "cell_type": "code",
705
+ "execution_count": null,
706
+ "metadata": {},
707
+ "outputs": [],
708
+ "source": []
709
+ },
710
+ {
711
+ "cell_type": "code",
712
+ "execution_count": 152,
713
+ "metadata": {},
714
+ "outputs": [
715
+ {
716
+ "data": {
717
+ "text/plain": [
718
+ "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']"
719
+ ]
720
+ },
721
+ "execution_count": 152,
722
+ "metadata": {},
723
+ "output_type": "execute_result"
724
+ }
725
+ ],
726
+ "source": [
727
+ "tokenized_data"
728
+ ]
729
+ },
730
+ {
731
+ "cell_type": "code",
732
+ "execution_count": 166,
733
+ "metadata": {},
734
+ "outputs": [
735
+ {
736
+ "data": {
737
+ "text/plain": [
738
+ "['སྣང་', 'གསུམ་', 'དབྱིངས་', 'སུ་', 'ཐིམ་པ', '་', 'ལས', '༔']"
739
+ ]
740
+ },
741
+ "execution_count": 166,
742
+ "metadata": {},
743
+ "output_type": "execute_result"
744
+ }
745
+ ],
746
+ "source": [
747
+ "tokenized_data_16000"
748
+ ]
749
+ },
750
+ {
751
+ "cell_type": "code",
752
+ "execution_count": null,
753
+ "metadata": {},
754
+ "outputs": [],
755
+ "source": [
756
+ "tokenized_data_32000"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "markdown",
761
+ "metadata": {},
762
+ "source": [
763
+ "## Use your new tokenizer to train a language model!"
764
+ ]
765
+ },
766
+ {
767
+ "cell_type": "markdown",
768
+ "metadata": {},
769
+ "source": [
770
+ "You can either use your new tokenizer in the language modeling from scratch notebook [Link to come] or use the `--tokenizer_name` argument in the [language modeling scripts](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) to use it there to train a model from scratch."
771
+ ]
772
+ },
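+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a sketch of that workflow (the output directory name below is only an example), the fast tokenizer wrapper can be saved locally and the resulting folder passed to `--tokenizer_name`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the 32k-vocabulary tokenizer in the standard transformers format, then e.g.\n",
+ "#   python run_mlm.py --tokenizer_name ./albert_unigram_tokenizer_32000 ...\n",
+ "tokenizer_32000.save_pretrained(\"./albert_unigram_tokenizer_32000\")"
+ ]
+ },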
773
+ {
774
+ "cell_type": "code",
775
+ "execution_count": 241,
776
+ "metadata": {},
777
+ "outputs": [
778
+ {
779
+ "data": {
780
+ "application/vnd.jupyter.widget-view+json": {
781
+ "model_id": "a22f05e792ee40b79bf097340ae38a2a",
782
+ "version_major": 2,
783
+ "version_minor": 0
784
+ },
785
+ "text/plain": [
786
+ "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
787
+ ]
788
+ },
789
+ "metadata": {},
790
+ "output_type": "display_data"
791
+ }
792
+ ],
793
+ "source": [
794
+ "notebook_login()\n"
795
+ ]
796
+ },
797
+ {
798
+ "cell_type": "code",
799
+ "execution_count": 253,
800
+ "metadata": {},
801
+ "outputs": [
802
+ {
803
+ "ename": "HfHubHTTPError",
804
+ "evalue": "409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo",
805
+ "output_type": "error",
806
+ "traceback": [
807
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
808
+ "\u001b[0;31mHTTPError\u001b[0m Traceback (most recent call last)",
809
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:406\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 406\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
810
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/requests/models.py:1024\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1023\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1024\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
811
+ "\u001b[0;31mHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create",
812
+ "\nThe above exception was the direct cause of the following exception:\n",
813
+ "\u001b[0;31mHfHubHTTPError\u001b[0m Traceback (most recent call last)",
814
+ "Cell \u001b[0;32mIn[253], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mhuggingface_hub\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m login, create_repo, Repository\n\u001b[1;32m 3\u001b[0m repo_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mta4tsering/NLP-Unigram_language_model_tokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m repo_url \u001b[38;5;241m=\u001b[39m \u001b[43mcreate_repo\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprivate\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mRepository created: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrepo_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n",
815
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
816
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/hf_api.py:3525\u001b[0m, in \u001b[0;36mHfApi.create_repo\u001b[0;34m(self, repo_id, token, private, repo_type, exist_ok, resource_group_id, space_sdk, space_hardware, space_storage, space_sleep_time, space_secrets, space_variables)\u001b[0m\n\u001b[1;32m 3522\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 3524\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3525\u001b[0m \u001b[43mhf_raise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43mr\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3526\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 3527\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m exist_ok \u001b[38;5;129;01mand\u001b[39;00m err\u001b[38;5;241m.\u001b[39mresponse\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m409\u001b[39m:\n\u001b[1;32m 3528\u001b[0m \u001b[38;5;66;03m# Repo already exists and `exist_ok=True`\u001b[39;00m\n",
817
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_http.py:477\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 473\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, message, response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m 475\u001b[0m \u001b[38;5;66;03m# Convert `HTTPError` into a `HfHubHTTPError` to display request information\u001b[39;00m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;66;03m# as well (request id and/or server error message)\u001b[39;00m\n\u001b[0;32m--> 477\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m _format(HfHubHTTPError, \u001b[38;5;28mstr\u001b[39m(e), response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n",
818
+ "\u001b[0;31mHfHubHTTPError\u001b[0m: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)\n\nYou already created this model repo"
819
+ ]
820
+ }
821
+ ],
822
+ "source": [
823
+ "from huggingface_hub import login, create_repo, Repository\n",
824
+ "\n",
825
+ "repo_name = \"ta4tsering/NLP-Unigram_language_model_tokenizer\"\n",
826
+ "repo_url = create_repo(repo_name, repo_type=\"model\", private=False)\n",
827
+ "print(f\"Repository created: {repo_url}\")"
828
+ ]
829
+ },
830
+ {
831
+ "cell_type": "code",
832
+ "execution_count": 248,
833
+ "metadata": {
834
+ "scrolled": true
835
+ },
836
+ "outputs": [
837
+ {
838
+ "name": "stdout",
839
+ "output_type": "stream",
840
+ "text": [
841
+ "Requirement already satisfied: huggingface_hub in /opt/conda/lib/python3.10/site-packages (0.27.1)\n",
842
+ "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.13.1)\n",
843
+ "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.12.2)\n",
844
+ "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
845
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
846
+ "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.32.3)\n",
847
+ "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.67.1)\n",
848
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.9.0)\n",
849
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
850
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
851
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
852
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.11.17)\n",
853
+ "WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\n"
854
+ ]
855
+ }
856
+ ],
857
+ "source": [
858
+ "!pip install --upgrade huggingface_hub"
859
+ ]
860
+ },
861
+ {
862
+ "cell_type": "code",
863
+ "execution_count": 256,
864
+ "metadata": {
865
+ "scrolled": true
866
+ },
867
+ "outputs": [
868
+ {
869
+ "name": "stderr",
870
+ "output_type": "stream",
871
+ "text": [
872
+ "/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:131: FutureWarning: 'Repository' (from 'huggingface_hub.repository') is deprecated and will be removed from version '1.0'. Please prefer the http-based alternatives instead. Given its large adoption in legacy code, the complete removal is only planned on next major release.\n",
873
+ "For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.\n",
874
+ " warnings.warn(warning_message, FutureWarning)\n"
875
+ ]
876
+ },
877
+ {
878
+ "ename": "OSError",
879
+ "evalue": "Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).",
880
+ "output_type": "error",
881
+ "traceback": [
882
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
883
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
884
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:592\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 592\u001b[0m lfs_version \u001b[38;5;241m=\u001b[39m \u001b[43mrun_subprocess\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgit-lfs --version\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocal_dir\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n",
885
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_subprocess.py:83\u001b[0m, in \u001b[0;36mrun_subprocess\u001b[0;34m(command, folder, check, **kwargs)\u001b[0m\n\u001b[1;32m 81\u001b[0m folder \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(folder)\n\u001b[0;32m---> 83\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrun\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcommand\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mstderr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43mstdout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msubprocess\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mPIPE\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mutf-8\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 89\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreplace\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# if not utf-8, replace char by �\u001b[39;49;00m\n\u001b[1;32m 90\u001b[0m \u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfolder\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetcwd\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 91\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 92\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
886
+ "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:503\u001b[0m, in \u001b[0;36mrun\u001b[0;34m(input, capture_output, timeout, check, *popenargs, **kwargs)\u001b[0m\n\u001b[1;32m 501\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstderr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m PIPE\n\u001b[0;32m--> 503\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mPopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpopenargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m process:\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
887
+ "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:971\u001b[0m, in \u001b[0;36mPopen.__init__\u001b[0;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize)\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mTextIOWrapper(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstderr,\n\u001b[1;32m 969\u001b[0m encoding\u001b[38;5;241m=\u001b[39mencoding, errors\u001b[38;5;241m=\u001b[39merrors)\n\u001b[0;32m--> 971\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_execute_child\u001b[49m\u001b[43m(\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mexecutable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpreexec_fn\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclose_fds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 972\u001b[0m \u001b[43m \u001b[49m\u001b[43mpass_fds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcwd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43menv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 973\u001b[0m \u001b[43m \u001b[49m\u001b[43mstartupinfo\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreationflags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshell\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 974\u001b[0m \u001b[43m \u001b[49m\u001b[43mp2cread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mp2cwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 975\u001b[0m \u001b[43m \u001b[49m\u001b[43mc2pread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mc2pwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43merrread\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrwrite\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43mrestore_signals\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mgid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muid\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mumask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mstart_new_session\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 980\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 981\u001b[0m \u001b[38;5;66;03m# Cleanup if the child failed starting.\u001b[39;00m\n",
888
+ "File \u001b[0;32m/opt/conda/lib/python3.10/subprocess.py:1863\u001b[0m, in \u001b[0;36mPopen._execute_child\u001b[0;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)\u001b[0m\n\u001b[1;32m 1862\u001b[0m err_msg \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mstrerror(errno_num)\n\u001b[0;32m-> 1863\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(errno_num, err_msg, err_filename)\n\u001b[1;32m 1864\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m child_exception_type(err_msg)\n",
889
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'git-lfs'",
890
+ "\nDuring handling of the above exception, another exception occurred:\n",
891
+ "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
892
+ "Cell \u001b[0;32mIn[256], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m repo_path \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/home\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m repo \u001b[38;5;241m=\u001b[39m \u001b[43mRepository\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlocal_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mclone_from\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_url\u001b[49m\u001b[43m)\u001b[49m\n",
893
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m 112\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
894
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_deprecation.py:132\u001b[0m, in \u001b[0;36m_deprecate_method.<locals>._inner_deprecate_method.<locals>.inner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 130\u001b[0m warning_message \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m message\n\u001b[1;32m 131\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(warning_message, \u001b[38;5;167;01mFutureWarning\u001b[39;00m)\n\u001b[0;32m--> 132\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
895
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:522\u001b[0m, in \u001b[0;36mRepository.__init__\u001b[0;34m(self, local_dir, clone_from, repo_type, token, git_user, git_email, revision, skip_lfs_files, client)\u001b[0m\n\u001b[1;32m 519\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mskip_lfs_files \u001b[38;5;241m=\u001b[39m skip_lfs_files\n\u001b[1;32m 520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient \u001b[38;5;241m=\u001b[39m client \u001b[38;5;28;01mif\u001b[39;00m client \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m HfApi()\n\u001b[0;32m--> 522\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_git_versions\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(token, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 525\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhuggingface_token: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m token\n",
896
+ "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/huggingface_hub/repository.py:594\u001b[0m, in \u001b[0;36mRepository.check_git_versions\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 592\u001b[0m lfs_version \u001b[38;5;241m=\u001b[39m run_subprocess(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mgit-lfs --version\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlocal_dir)\u001b[38;5;241m.\u001b[39mstdout\u001b[38;5;241m.\u001b[39mstrip()\n\u001b[1;32m 593\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n\u001b[0;32m--> 594\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mEnvironmentError\u001b[39;00m(\n\u001b[1;32m 595\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLooks like you do not have git-lfs installed, please install.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 596\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m You can install from https://git-lfs.github.com/.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 597\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m Then run `git lfs install` (you only have to do this once).\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 598\u001b[0m )\n\u001b[1;32m 599\u001b[0m logger\u001b[38;5;241m.\u001b[39minfo(git_version \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m lfs_version)\n",
897
+ "\u001b[0;31mOSError\u001b[0m: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once)."
898
+ ]
899
+ }
900
+ ],
901
+ "source": [
902
+ "repo_path = \"/home\"\n",
903
+ "repo = Repository(local_dir=repo_path, clone_from=repo_url)"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "code",
908
+ "execution_count": null,
909
+ "metadata": {},
910
+ "outputs": [],
911
+ "source": [
912
+ "import os\n",
913
+ "from huggingface_hub import upload_file\n",
914
+ "\n",
915
+ "# Define the local folder and repo_id\n",
916
+ "folder_path = \"/home/NLP-Unigram_language_model_tokenizer/\" # Local folder path (should match your repo_id)\n",
917
+ "repo_id = \"ta4tsering/NLP-Unigram_language_model_tokenizer\" # Replace with your Hugging Face repo ID\n",
918
+ "\n",
919
+ "# Iterate through all files in the folder\n",
920
+ "for root, _, files in os.walk(folder_path):\n",
921
+ " for file_name in files:\n",
922
+ " local_file_path = os.path.join(root, file_name)\n",
923
+ " repo_file_path = os.path.relpath(local_file_path, folder_path) # Keep folder structure\n",
924
+ "\n",
925
+ " # Upload file to the repo\n",
926
+ " upload_file(\n",
927
+ " path_or_fileobj=local_file_path,\n",
928
+ " path_in_repo=repo_file_path,\n",
929
+ " repo_id=repo_id,\n",
930
+ " repo_type=\"model\",\n",
931
+ " commit_message=f\"Add {repo_file_path}\",\n",
932
+ " )\n",
933
+ " print(f\"Uploaded {repo_file_path}\")\n"
934
+ ]
935
+ },
936
+ {
937
+ "cell_type": "code",
938
+ "execution_count": 257,
939
+ "metadata": {},
940
+ "outputs": [
941
+ {
942
+ "name": "stdout",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "git: 'lfs' is not a git command. See 'git --help'.\n",
946
+ "\n",
947
+ "The most similar command is\n",
948
+ "\tlog\n"
949
+ ]
950
+ }
951
+ ],
952
+ "source": [
953
+ "!git lfs install"
954
+ ]
955
+ },
956
+ {
957
+ "cell_type": "code",
958
+ "execution_count": null,
959
+ "metadata": {},
960
+ "outputs": [],
961
+ "source": [
962
+ "repo.push_to_hub(commit_message=\"Initial commit\")\n",
963
+ "print(\"Files pushed to Hugging Face!\")"
964
+ ]
965
+ }
966
+ ],
967
+ "metadata": {
968
+ "colab": {
969
+ "name": "Train your tokenizer",
970
+ "provenance": []
971
+ },
972
+ "kernelspec": {
973
+ "display_name": "Python 3 (ipykernel)",
974
+ "language": "python",
975
+ "name": "python3"
976
+ },
977
+ "language_info": {
978
+ "codemirror_mode": {
979
+ "name": "ipython",
980
+ "version": 3
981
+ },
982
+ "file_extension": ".py",
983
+ "mimetype": "text/x-python",
984
+ "name": "python",
985
+ "nbconvert_exporter": "python",
986
+ "pygments_lexer": "ipython3",
987
+ "version": "3.10.13"
988
+ }
989
+ },
990
+ "nbformat": 4,
991
+ "nbformat_minor": 4
992
+ }