Spaces:

xu-song
/

tokenizer-arena

Running

File size: 4,650 Bytes
"""



## characters



- alphabetic characters

- numeric characters

- special characters: A special character is a character that is not an alphabetic or numeric character.

    - ASCII control characters

    - punctuation marks

    - accent marks

    - mathematical symbols

    - whitespace:

        - https://en.wikipedia.org/wiki/Whitespace_character

        - https://emptycharacter.com/





https://www.computerhope.com/jargon/s/specchar.htm

"""

import random
from datasets import load_dataset

# Default input text: one English instruction line followed by Spanish,
# Chinese and Japanese sample lines.
# NOTE(review): presumably pre-filled into the UI input field - confirm against the caller.
default_user_input = """\

Replace this text in the input field to see how tokenization works.

Buenos días!

华为发布Mate60手机。

ラグビーワールドカップ2023フランス"""
# Default tokenizer names for slots 1 and 2. Earlier choices for slot 1 are
# kept below as commented-out alternatives.
# default_tokenizer_name_1 = "Meta/llama3"
# default_tokenizer_name_1 = "gradientai/Llama-3-8B-Instruct-Gradient-1048k"
default_tokenizer_name_1 = "deepseek-ai/DeepSeek-R1"
default_tokenizer_name_2 = "openai/gpt-4o"


def get_sample_input():
    """Return the per-language default input texts.

    For every language key, the matching config of the
    "eson/cc100-samples" dataset is loaded (train split) and printed. The
    loaded data is not merged into the result, so the returned mapping is
    always the hard-coded one defined below.

    Returns:
        A dict mapping language code to sample text; only "en" is non-empty.
    """
    default_inputs = {
        "en": "Replace this text in the input field to see how tokenization works.",
        "zh-Hans": "",
        "es": "",
        "de": "",
    }
    # For reproducibility. This reseeds the global RNG, although no random
    # call follows inside this function.
    random.seed(10)
    for lang in default_inputs:
        dataset = load_dataset("eson/cc100-samples", lang, split="train")
        # NOTE(review): both prints look like leftover debug output - confirm
        # before removing, since they are the only use of the loaded dataset.
        print(dataset)
        print(1)
    return default_inputs


# Example rows per language. Each row is [input text, tokenizer name, tokenizer name].
# The text before the first ":" in the input labels the kind of example.
# NOTE(review): the "zh" rows use short tokenizer names (e.g. "llama") while the
# "en" rows use hub-style ids (e.g. "huggyllama/llama-7b") - confirm both resolve.
examples = {
    "en": [
        ["number: (10086 + 98) = 100184", "huggyllama/llama-7b", "bigscience/bloom"],  #
        [
            "whitespace:  2spaces        8spaces\t1tab\t\t2tab\n1newline",
            "huggyllama/llama-7b",
            "google-bert/bert-base-cased",
        ],  # chatglm has blank_n tokens; bert drops the whitespace
        # Additional punctuation characters, kept for reference:
        # ！？｡＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
        [
            'punctuation: ,.:/?+="，。！？；【】〔〕〖〗',
            "google/gemma-7b",
            "huggyllama/llama-7b",
        ],  # the llama vocabulary is rather small
        [
            "symbol: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
            "baichuan-inc/Baichuan-7B",
            "huggyllama/llama-7b",
        ],
        # Disabled example for special tokens:
        # ["special: [PAD] [UNK] [CLS] [SEP] [MASK] <|system|> <|user|> <|assistant|> <|endoftext|>", "", ""],
    ],
    "zh": [
        [
            "空格测试：  2个空格        8个空格",
            "llama",
            "chatglm2_6b",
        ],  # chatglm has blank_n tokens
        ["标点测试：，。！？；", "baichuan_7b", "llama"],
        [
            "符号测试：🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤",
            "baichuan_7b",
            "llama",
        ],
        ["数字测试：(10086 + 98) = 100184", "baichuan_7b", "llama"],
        ["中文简体：宽带，繁体：樂來", "baichuan_7b", "llama"],
    ],
}


# Tokenizer pairs worth comparing, as (tokenizer1, tokenizer2, text, comment).
# Every entry carries all four fields (an empty comment where there is none)
# so the list can be unpacked uniformly.
more_examples = [
    # BERT family
    (
        "google-bert/bert-base-cased",
        "google-bert/bert-base-uncased",
        "",
        "",
    ),  # clue vs. kplug, bert vs. clue
    ("bert-base-cased", "clue", "", "增加了[]()"),
    ("roberta-chinese-clue", "kplug", "", ""),
    # LLaMA family (sentencepiece-based)
    (
        "baichuan",
        "baichuan2",
        "baichuan2支持多空格   ，多个换行\n\n\n，do not add dummy prefix as Baichuan1",
        "",
    ),
    ("llama", "baichuan2", "baichuan2支持多空格   ，多个换行\n\n", ""),
    ("llama", "chinese-llama-2-7b", "", ""),
    ("llama", "llama3", "扩充词典", ""),
    ("chinese-llama-lora-7b", "chinese-llama-2-7b", "", ""),
    # GLM family (sentencepiece-based)
    ("glm", "chatglm1", "", ""),
    ("chatglm1", "chatglm2", "", ""),
    # GPT-2 family
    ("gpt2", "moss", "", ""),
    ("", "", "", ""),
    # OpenAI family (tiktoken)
    ("qwen", "gpt_35_turbo", "", ""),
]

# Language key used to pick rows from ``examples``.
lang = "en"

# Label of each example row: the part of its input text before the first ":".
example_types = [row[0].partition(":")[0] for row in examples[lang]]


def example_fn(example_idx):
    """Return the example row at index ``example_idx`` for the module-level ``lang``."""
    rows_for_lang = examples[lang]
    return rows_for_lang[example_idx]


def get_more_example(example_list=None):
    """Print one tokenizer-arena comparison URL per example entry.

    Args:
        example_list: Iterable of ``(tokenizer1, tokenizer2, text[, comment])``
            tuples. When omitted, the module-level ``more_examples`` is used.

    Entries may leave out the trailing comment field. Only the first three
    fields are used to build the URL, and ``text`` is percent-encoded.
    """
    import urllib.parse

    if example_list is None:
        example_list = more_examples

    url_prefix = "https://huggingface.co/spaces/eson/tokenizer-arena"
    # Star-unpacking accepts both 3-field and 4-field entries; a strict
    # four-way unpack raises ValueError on entries without a comment.
    for tokenizer1, tokenizer2, text, *_comment in example_list:
        full_url = f"{url_prefix}?tokenizer1={tokenizer1}&tokenizer2={tokenizer2}&text={urllib.parse.quote(text)}"
        print(full_url)


# When run as a script, print the comparison URLs built from more_examples.
if __name__ == "__main__":
    get_more_example()