In [213]:
!pip install datasets transformers[sentencepiece]



In [214]:
!nvidia-smi

Sun Jan 26 12:49:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.76                 Driver Version: 550.76         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  |   00000000:E2:00.0 Off |                  Off |
|  0%   31C    P8             19W /  450W |       1MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

If you're opening this notebook locally, make sure your environment has an install from the last version of Datasets and a source install of Transformers.

In [215]:
!pip install huggingface_hub



In [216]:
!git config --global credential.helper store

## Getting a corpus

We will need texts to train our tokenizer. We will use the [ü§ó Datasets](https://github.com/huggingface/datasets) library to download our text data, which can be easily done with the `load_dataset` function:

In [217]:
from datasets import load_dataset

In [218]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [219]:
dataset = load_dataset("openpecha/deduplication_combined_word_seg_data", name="", split="train")

In [220]:
manual_dataset = dataset.filter(lambda x: x["filename"] == "manual_data.json", num_proc=10)

In [221]:
manual_dataset[0]

{'source': '‡Ωë‡ºã‡Ω£‡æü‡Ω¢‡ºã‡Ωñ‡ºã‡Ω°‡Ωº‡Ωë‡ºã‡ΩÖ‡Ω≤‡ºã‡Ωû‡Ω≤‡ΩÇ‡ºã‡Ωò‡Ω∫‡Ωë‡ºç ‡ºç‡ΩÇ‡ΩÑ‡ºã‡ΩÇ‡Ω≤‡ºã‡Ωë‡Ω¥‡Ω¶‡ºã‡ΩÄ‡Ω¥‡Ωì‡ºã‡Ω°‡Ωº‡Ωë‡ºã‡Ωâ‡Ω≤‡Ωë‡ºã‡Ωî‡ºç ‡ºç‡Ωë‡Ω∫‡ºã‡Ω°‡Ω≤‡ºã‡Ωò‡Ω≤‡ºã‡Ω¢‡æü‡ΩÇ‡ºã‡Ωâ‡Ω≤‡Ωë‡ºã‡ΩÇ‡ΩÑ‡ºã‡Ω£‡Ω¶‡ºç ‡ºç ‡Ω†‡Ωë‡Ω¶‡ºã‡Ωî‡ºã‡Ω£‡Ω¶‡ºã‡Ωì‡Ω≤‡ºã‡Ω†‡Ωë‡Ω¶‡ºã‡ΩÇ‡æ±‡Ω¥‡Ω¢‡ºã‡Ωî‡ºç ‡ºç‡ΩÖ‡Ω≤‡ºã‡Ω°‡Ω≤‡ºã‡Ωï‡æ±‡Ω≤‡Ω¢‡ºã‡Ωì‡ºã‡Ω†‡Ωë‡Ω¶‡ºã‡Ωî‡Ω¢‡ºã‡Ω†‡ΩÇ‡æ±‡Ω¥‡Ω¢‡ºç ‡ºç ‡Ω†‡Ωë‡Ω¶‡ºã‡Ωî‡ºã‡Ω£‡Ω¶‡ºã‡Ωì‡Ω≤‡ºã‡Ωò‡ºã‡Ω†‡Ωë‡Ω¶‡ºã‡Ωî‡ºç ‡ºç‡ΩÖ‡Ω≤‡ºã‡Ω°‡Ω≤‡ºã‡Ωï‡æ±‡Ω≤‡Ω¢‡ºã‡Ωì‡ºã‡Ω†‡Ωë‡Ω¶‡ºã‡Ωî‡Ω¢‡ºã‡Ω†‡ΩÇ‡æ±‡Ω¥‡Ω¢‡ºç ‡ºç‡ΩÇ‡Ω£‡ºã‡Ωè‡Ω∫‡ºã‡Ωò‡ºã‡Ω†‡Ωº‡ΩÑ‡Ω¶‡ºã‡Ω¶‡æê‡æ±‡Ω∫‡Ω¶‡ºã‡Ω°‡Ωº‡Ωë‡ºã‡Ωì‡ºç ‡ºç‡Ωá‡Ω≤‡ºã‡Ω£‡æü‡Ω¢‡ºã‡Ωë‡ºã‡Ω£‡æü‡Ω¢‡ºã‡Ωñ‡Ω¢‡ºã‡Ωò‡Ω≤‡ºã‡Ω†‡ΩÇ‡æ±‡Ω¥‡Ω¢‡ºç ‡ºç‡ΩÖ‡Ω≤‡ºã‡Ω¶‡æü‡Ω∫‡ºã‡Ωë‡Ω∫‡ºã‡Ω£‡ºã‡Ω¶‡æê‡æ±‡Ω∫‡ºã‡Ωò‡Ω∫‡Ωë‡ºã‡Ωì‡ºç ‡ºç‡Ωò‡ºã‡Ω†‡Ωº‡ΩÑ‡Ω¶‡ºã‡Ω¢‡æü‡ΩÇ‡ºã‡Ωî‡Ω¢‡ºã‡Ω†‡ΩÇ‡æ±‡Ω¥‡Ω¢‡ºã‡Ω¢‡Ωò‡ºã‡ΩÖ‡Ω≤‡ºç ‡ºç ‡Ω¶‡æê‡æ±‡Ω∫‡ºã‡Ωñ‡ºã‡Ωò‡Ω∫‡Ωë‡ºã‡ΩÄ‡æ±‡ΩÑ‡ºã‡Ω†‡Ωá‡Ω≤‡ΩÇ‡ºã‡Ωî‡ºã‡Ω£‡Ω¶‡ºç ‡ºç‡ΩÇ‡Ω£‡ºã‡Ωè‡Ω∫‡ºã‡Ωò‡ºã‡Ω†‡Ωº‡ΩÑ‡Ω¶‡ºã‡Ωò‡Ω≤‡ºã‡Ω¢‡æü‡ΩÇ‡ºã‡Ωì‡ºç ‡ºç

In [232]:
len(manual_dataset)

20278

In [222]:
remaining_dataset = dataset.filter(lambda x: x["filename"] != "manual_data.json", num_proc=10)

In [223]:
remaining_dataset[9:10]

{'source': ['‡ΩÇ‡Ωô‡Ωº‡ºã‡Ωò‡Ωº‡Ω†‡Ω≤‡ºã‡Ωò‡ΩÇ‡æ≤‡Ω≤‡Ωì‡ºã‡Ωî‡Ω¢‡ºã‡Ω®‡æÉ‡ºã‡Ωë‡Ωò‡Ω¢‡ºã‡Ωî‡Ωº‡ºã‡Ω†‡Ωñ‡Ω¢‡ºã‡Ωñ‡ºã‡Ω£‡ºã‡Ω¶‡Ω∫‡Ωò‡Ω¶‡ºã‡ΩÇ‡Ωü‡Ω¥‡ΩÑ‡ºã‡ºî'],
 'target': ['‡ΩÇ‡Ωô‡Ωº‡ºã ‡Ωò‡Ωº ‡Ω†‡Ω≤‡ºã ‡Ωò‡ΩÇ‡æ≤‡Ω≤‡Ωì‡ºã‡Ωî ‡Ω¢‡ºã ‡Ω®‡æÉ‡ºã ‡Ωë‡Ωò‡Ω¢‡ºã‡Ωî‡Ωº‡ºã ‡Ω†‡Ωñ‡Ω¢‡ºã‡Ωñ‡ºã ‡Ω£‡ºã ‡Ω¶‡Ω∫‡Ωò‡Ω¶‡ºã ‡ΩÇ‡Ωü‡Ω¥‡ΩÑ‡ºã ‡ºî'],
 'filename': ['UT3JT13384-005-0028.txt']}

### Unigram model like Albert

Let's now have a look at how we can create a Unigram tokenizer like the one used for training T5. The first step is to create a `Tokenizer` with an empty `Unigram` model:

In [224]:
!pip install tokenizers icecream



In [225]:
from tokenizers import Tokenizer, decoders
from tokenizers.models import Unigram
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers import trainers
from icecream import ic

tokenizer = Tokenizer(Unigram())

In [None]:
def batch_iterator(dataset):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["target"]

If we want to have a quick look at how it preprocesses the inputs, we can call the `pre_tokenize_str` method:

In [None]:
vocab_count=32000
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [None]:
trainer = trainers.UnigramTrainer(vocab_size=vocab_count, special_tokens=["[CLS]", "[SEP]", "<unk>", "<pad>", "[MASK]"], unk_token="<unk>")
tokenizer.train_from_iterator(batch_iterator(manual_dataset), trainer=trainer)

In [None]:
tokenizer.save(f"./trained_tokenizer_{vocab_count}.json")

In [None]:
# Load the saved tokenizer
tokenizer = Tokenizer.from_file(f"./trained_tokenizer_{vocab_count}.json")


In [None]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")

In [226]:
from tokenizers import processors
from tokenizers import Tokenizer, models, processors, decoders

In [234]:
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)
tokenizer.decoder = decoders.CTC()

In [235]:
tokenizer_8000 = Tokenizer.from_file(f"./trained_tokenizer_8000.json")
tokenizer_16000 = Tokenizer.from_file(f"./trained_tokenizer_16000.json")
tokenizer_32000 = Tokenizer.from_file(f"./trained_tokenizer_32000.json")

In [236]:
from transformers import AlbertTokenizerFast

tokenizer_8000 = AlbertTokenizerFast(tokenizer_object=tokenizer_8000)
tokenizer_16000 = AlbertTokenizerFast(tokenizer_object=tokenizer_16000)
tokenizer_32000 = AlbertTokenizerFast(tokenizer_object=tokenizer_32000)
##tokenizer_64000 = AlbertTokenizerFast(tokenizer_object=tokenizer_64000)

In [237]:
remaining_dataset[10]

{'source': '‡Ω¢‡æ®‡Ω≤‡ºã‡Ω£‡Ωò‡ºã‡Ω°‡Ω≤‡Ωì‡ºã‡Ω¶‡æô‡Ωò‡ºã‡Ωë‡Ω¥‡ºã‡Ωñ‡Ω¶‡Ωò‡ºî',
 'target': '‡Ω¢‡æ®‡Ω≤‡ºã‡Ω£‡Ωò‡ºã ‡Ω°‡Ω≤‡Ωì‡ºã ‡Ω¶‡æô‡Ωò‡ºã ‡Ωë‡Ω¥‡ºã ‡Ωñ‡Ω¶‡Ωò ‡ºî',
 'filename': 'UT3JT13384-005-0028.txt'}

In [231]:
for index in range(10, len(remaining_dataset)):
    data = remaining_dataset[index]
    if index == 12:
        break
    ic(data["source"])
    tokenized_data_8000 = tokenizer_8000.tokenize(data["source"])
    ic(tokenized_data_8000)
    tokenized_data_16000 = tokenizer_16000.tokenize(data["source"])
    ic(tokenized_data_16000)
    tokenized_data_32000 = tokenizer_32000.tokenize(data["source"])
    ic(tokenized_data_32000)
    


ic| data["source"]: '‡Ω¢‡æ®‡Ω≤‡ºã‡Ω£‡Ωò‡ºã‡Ω°‡Ω≤‡Ωì‡ºã‡Ω¶‡æô‡Ωò‡ºã‡Ωë‡Ω¥‡ºã‡Ωñ‡Ω¶‡Ωò‡ºî'
ic| tokenized_data_8000: ['‡Ω¢‡æ®‡Ω≤‡ºã‡Ω£‡Ωò‡ºã', '‡Ω°‡Ω≤‡Ωì‡ºã', '‡Ω¶‡æô‡Ωò‡ºã', '‡Ωë‡Ω¥‡ºã', '‡Ωñ‡Ω¶‡Ωò', '‡ºî']
ic| tokenized_data_16000: ['‡Ω¢‡æ®‡Ω≤‡ºã‡Ω£‡Ωò‡ºã', '‡Ω°‡Ω≤‡Ωì‡ºã', '‡Ω¶‡æô‡Ωò‡ºã', '‡Ωë‡Ω¥‡ºã', '‡Ωñ‡Ω¶‡Ωò', '‡ºî']
ic| tokenized_data_32000: ['‡Ω¢‡æ®‡Ω≤‡ºã‡Ω£‡Ωò‡ºã', '‡Ω°‡Ω≤‡Ωì‡ºã', '‡Ω¶‡æô‡Ωò‡ºã', '‡Ωë‡Ω¥‡ºã', '‡Ωñ‡Ω¶‡Ωò', '‡ºî']
ic| data["source"]: '‡Ωâ‡Ωò‡Ω¶‡ºã‡Ω¶‡æê‡æ±‡Ω∫‡Ω¶‡ºã‡Ωî‡ºã‡Ωì‡ºã‡Ω¢‡æ≥‡Ω¥‡ΩÑ‡ºã‡Ω¶‡Ω∫‡Ωò‡Ω¶‡ºã‡Ωë‡æ≤‡ΩÇ‡ºã‡Ωè‡Ω¥‡ºã‡ΩÇ‡ΩÖ‡Ω¥‡Ωì‡ºî'
ic| tokenized_data_8000: ['‡Ωâ‡Ωò‡Ω¶‡ºã', '‡Ω¶‡æê‡æ±‡Ω∫‡Ω¶‡ºã‡Ωî‡ºã', '‡Ωì', '‡ºã', '‡Ω¢‡æ≥‡Ω¥‡ΩÑ‡ºã', '‡Ω¶‡Ω∫‡Ωò‡Ω¶‡ºã', '‡Ωë‡æ≤‡ΩÇ‡ºã', '‡Ωè‡Ω¥‡ºã', '‡ΩÇ‡ΩÖ‡Ω¥', '‡Ωì', '‡ºî']
ic| tokenized_data_16000: ['‡Ωâ‡Ωò‡Ω¶‡ºã', '‡Ω¶‡æê‡æ±‡Ω∫‡Ω¶‡ºã‡Ωî‡ºã', '‡Ωì', '‡ºã', '‡Ω¢‡æ≥‡Ω¥‡ΩÑ‡ºã', '‡Ω¶‡Ω∫‡Ωò‡Ω¶‡ºã', '‡Ωë‡æ≤‡ΩÇ‡ºã', '‡Ωè‡Ω¥‡ºã', '‡ΩÇ‡ΩÖ‡Ω¥‡Ωì', '‡ºî']
ic| tokenized_data_32000: ['‡Ωâ‡Ωò‡Ω¶‡ºã', '‡Ω¶‡æê‡æ±‡Ω∫‡Ω¶‡ºã‡Ωî‡ºã', '‡Ωì',

In [240]:
data = "‡º∏‡ΩÇ‡Ωº‡ΩÑ‡ºã‡Ω¶‡ºã‡Ωò‡ΩÜ‡Ωº‡ΩÇ‡ºã‡ΩÇ‡Ω≤‡ºã‡Ωñ‡Ωº‡Ωë‡ºã‡Ωë‡Ωº‡Ωì‡ºã‡Ω†‡Ωê‡Ωñ‡ºã‡Ω¢‡æ©‡Ωº‡Ωë‡ºã‡Ωë‡ΩÑ‡ºã‡Ω†‡Ωñ‡æ≤‡Ω∫‡Ω£‡ºã‡Ωñ‡Ω†‡Ω≤‡ºã‡Ωï‡æ±‡ΩÇ‡ºã‡Ωë‡Ω∫‡Ωñ‡ºã‡ΩÇ‡Ω¶‡Ω¢‡ºã‡Ωî‡ºã‡Ωû‡Ω≤‡ΩÇ‡ºã‡Ωï‡æ±‡Ω≤‡ºã‡Ωü‡æ≥‡ºã‡ΩÇ‡Ω¶‡Ω¥‡Ωò‡ºã‡Ωî‡Ω†‡Ω≤‡ºã‡Ωì‡ΩÑ‡ºã‡Ω†‡Ωë‡Ωº‡Ωì‡ºã‡Ω¶‡æ§‡Ω∫‡Ω£‡ºã‡ΩÇ‡Ωì‡ΩÑ‡ºã‡Ω¢‡æí‡æ±‡Ω¥‡ºç"
ic(data) 
tokenized_data_8000 = tokenizer_8000.tokenize(data)
ic(tokenized_data_8000)
tokenized_data_16000 = tokenizer_16000.tokenize(data)
ic(tokenized_data_16000)
tokenized_data_32000 = tokenizer_32000.tokenize(data)
ic(tokenized_data_32000)
ic(tokenizer_8000.encode(data))

ic| data: '‡º∏‡ΩÇ‡Ωº‡ΩÑ‡ºã‡Ω¶‡ºã‡Ωò‡ΩÜ‡Ωº‡ΩÇ‡ºã‡ΩÇ‡Ω≤‡ºã‡Ωñ‡Ωº‡Ωë‡ºã‡Ωë‡Ωº‡Ωì‡ºã‡Ω†‡Ωê‡Ωñ‡ºã‡Ω¢‡æ©‡Ωº‡Ωë‡ºã‡Ωë‡ΩÑ‡ºã‡Ω†‡Ωñ‡æ≤‡Ω∫‡Ω£‡ºã‡Ωñ‡Ω†‡Ω≤‡ºã‡Ωï‡æ±‡ΩÇ‡ºã‡Ωë‡Ω∫‡Ωñ‡ºã‡ΩÇ‡Ω¶‡Ω¢‡ºã‡Ωî‡ºã‡Ωû‡Ω≤‡ΩÇ‡ºã‡Ωï‡æ±‡Ω≤‡ºã‡Ωü‡æ≥‡ºã‡ΩÇ‡Ω¶‡Ω¥‡Ωò‡ºã‡Ωî‡Ω†‡Ω≤‡ºã‡Ωì‡ΩÑ‡ºã‡Ω†‡Ωë‡Ωº‡Ωì‡ºã‡Ω¶‡æ§‡Ω∫‡Ω£‡ºã‡ΩÇ‡Ωì‡ΩÑ‡ºã‡Ω¢‡æí‡æ±‡Ω¥‡ºç'
ic| tokenized_data_8000: ['‡º∏‡ΩÇ‡Ωº‡ΩÑ‡ºã‡Ω¶‡ºã',
                          '‡Ωò‡ΩÜ‡Ωº‡ΩÇ‡ºã',
                          '‡ΩÇ‡Ω≤‡ºã',
                          '‡Ωñ‡Ωº‡Ωë‡ºã',
                          '‡Ωë‡Ωº‡Ωì‡ºã',
                          '‡Ω†‡Ωê‡Ωñ‡ºã‡Ω¢‡æ©‡Ωº‡Ωë‡ºã',
                          '‡Ωë‡ΩÑ‡ºã',
                          '‡Ω†‡Ωñ‡æ≤‡Ω∫‡Ω£‡ºã‡Ωñ',
                          '‡Ω†‡Ω≤‡ºã',
                          '‡Ωï‡æ±‡ΩÇ‡ºã',
                          '‡Ωë‡Ω∫‡Ωñ‡ºã',
                          '‡ΩÇ‡Ω¶‡Ω¢‡ºã‡Ωî‡ºã',
                          '‡Ωû‡Ω≤‡ΩÇ‡ºã',
                          '‡Ωï‡æ±‡Ω≤‡ºã',
                          '‡Ωü‡æ≥‡ºã',
                        

[0,
 2163,
 152,
 25,
 201,
 47,
 3426,
 9,
 662,
 7,
 267,
 1522,
 2426,
 59,
 256,
 636,
 348,
 7,
 85,
 1067,
 1238,
 717,
 246,
 5,
 1]

In [152]:
tokenized_data

['‡Ω¶‡æ£‡ΩÑ‡ºã', '‡ΩÇ‡Ω¶‡Ω¥‡Ωò‡ºã', '‡Ωë‡Ωñ‡æ±‡Ω≤‡ΩÑ‡Ω¶‡ºã', '‡Ω¶‡Ω¥‡ºã', '‡Ωê‡Ω≤‡Ωò‡ºã‡Ωî', '‡ºã', '‡Ω£‡Ω¶', '‡ºî']

In [166]:
tokenized_data_16000

['‡Ω¶‡æ£‡ΩÑ‡ºã', '‡ΩÇ‡Ω¶‡Ω¥‡Ωò‡ºã', '‡Ωë‡Ωñ‡æ±‡Ω≤‡ΩÑ‡Ω¶‡ºã', '‡Ω¶‡Ω¥‡ºã', '‡Ωê‡Ω≤‡Ωò‡ºã‡Ωî', '‡ºã', '‡Ω£‡Ω¶', '‡ºî']

In [None]:
tokenized_data_32000

## Use your new tokenizer to train a language model!

You can either use your new tokenizer in the language modeling from scratch notebook [Link to come] or use the `--tokenizer_name` argument in the [language modeling scripts](https://github.com/huggingface/transformers/tree/master/examples/pytorch/language-modeling) to use it there to train a model from scratch.

In [241]:
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [253]:
from huggingface_hub import login, create_repo, Repository

repo_name = "ta4tsering/NLP-Unigram_language_model_tokenizer"
repo_url = create_repo(repo_name, repo_type="model", private=False)
print(f"Repository created: {repo_url}")

HfHubHTTPError: 409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6796358c-51a27b1e2f84e1b55e1c5564;b3b3b6a1-c6c3-443d-a173-3b62474886bf)

You already created this model repo

In [248]:
!pip install --upgrade huggingface_hub



In [256]:
repo_path = "/home"
repo = Repository(local_dir=repo_path, clone_from=repo_url)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.


OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).

In [None]:
import os
from huggingface_hub import upload_file

# Define the local folder and repo_id
folder_path = "/home/NLP-Unigram_language_model_tokenizer/"  # Local folder path (should match your repo_id)
repo_id = "ta4tsering/NLP-Unigram_language_model_tokenizer"  # Replace with your Hugging Face repo ID

# Iterate through all files in the folder
for root, _, files in os.walk(folder_path):
    for file_name in files:
        local_file_path = os.path.join(root, file_name)
        repo_file_path = os.path.relpath(local_file_path, folder_path)  # Keep folder structure

        # Upload file to the repo
        upload_file(
            path_or_fileobj=local_file_path,
            path_in_repo=repo_file_path,
            repo_id=repo_id,
            repo_type="model",
            commit_message=f"Add {repo_file_path}",
        )
        print(f"Uploaded {repo_file_path}")


In [257]:
!git lfs install

git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log


In [None]:
repo.push_to_hub(commit_message="Initial commit")
print("Files pushed to Hugging Face!")