""" |
|
|
|
中文数据:clue superclue |
|
英文数据:glue cnn_dailymail gigaword |
|
代码数据: |
|
数字: |
|
|
|
## 参考 |
|
- https://github.com/baichuan-inc/Baichuan-7B 记录了不同分词器的压缩率 |
|
- 指标:猜测是 n_tokens/n_chars (baichuan小,说明百川token少,压缩率高) |
|
- Baichuan 0.73; llama 1.31; |
|
- https://github.com/QwenLM/Qwen/blob/main/tech_memo.md 记录了不同分词器的压缩率 |
|
- 以 XLM-RoBERTa为基准 (Unsupervised Cross-lingual Representation Learning at Scale ) , |
|
- Qwen-7B 在很多语言上压缩率都较高压缩率 (high compression rate) |
|
- 中文: llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9; qwen7b 0.95 |
|
- 英文: |
|
- 指标:猜测是 n_tokens / n_tokens_xlmR |
|
- https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134 |
|
- 有压缩率的计算方式 |
|
- https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer |
|
- 记录了不同分词器的压缩率 |
|
- 指标: |
|
- https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24 |
|
- 有压缩率的计算方式 = {n_chars} / {n_tokens} |
|
- |
|
- https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354 |
|
- 这个可能不是 |
|
- https://github.com/bojone/bytepiece/blob/main/README_en.md |
|
- "bytes/token": the average number of bytes per token |
|
- Getting the most out of your tokenizer for pre-training and domain adaptation 👍 |
|
- 定义: |
|
- NSL: 两个分词器的编码长度 比例,通常以 llama为基准 |
|
- average number of bytes per token. {n_bytes} / {n_tokens} |
|
- higher compression rate -- |
|
- *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py |
|
- 定义:{Compressed Size}/{Raw Size}, 来自论文 Language modeling is compression. 数值<=1.0,用 % 来表示。也有>1的情况。 |
|
- |
|
- {Compressed Size} 指的是? |
|
- 这里的压缩指的是 模型参数相关的。 |
|
""" |
import json
import os

import pandas as pd
from datasets import load_dataset

from utils.log_util import logger
from vocab import load_tokener

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))


def get_n_bytes_of_string(string_text):
    n_bytes = len(string_text.encode("utf-8"))
    return n_bytes
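
# Note: n_bytes and n_chars differ for non-ASCII text. For example, len("你好") == 2
# while get_n_bytes_of_string("你好") == 6, because each CJK character takes 3 bytes
# in UTF-8; for pure-ASCII text the two counts coincide.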


def unit_convertor(stat, unit):
    n_tokens = stat["n_tokens"]
    n_chars = stat["n_chars"]
    n_bytes = stat["n_bytes"]

    n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
    n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
    n_bytes_in_mb = n_bytes / (1024 * 1024)
    n_bytes_in_gb = n_bytes_in_mb / 1024
    n_bytes_in_tb = n_bytes_in_gb / 1024

    if unit == "n_tokens/n_bytes":
        value = n_tokens / n_bytes
    elif unit == "n_chars/n_tokens":
        value = n_chars / n_tokens
    elif unit == "n_tokens/n_chars":
        value = n_tokens / n_chars
    elif unit == "g_bytes/b_tokens":
        value = n_bytes_in_gb / n_tokens_in_billion
    elif unit == "t_bytes/t_tokens":
        value = n_bytes_in_tb / n_tokens_in_trillion
    elif unit == "b_tokens/g_bytes":
        value = n_tokens_in_billion / n_bytes_in_gb
    else:
        raise ValueError(f"unsupported unit: {unit}")
    return round(value, 2)

all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"]
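

# Minimal usage sketch of unit_convertor (illustrative numbers only, not measured on any
# real corpus): a 1 GiB corpus that tokenizes into 0.3 B tokens gives
# g_bytes/b_tokens == 3.33 (rounded) and b_tokens/g_bytes == 0.3.
def _demo_unit_convertor():
    stat = {
        "n_tokens": 300 * 1000 * 1000,      # 0.3 B tokens
        "n_chars": 500 * 1000 * 1000,       # hypothetical character count
        "n_bytes": 1 * 1024 * 1024 * 1024,  # 1 GiB
    }
    return {unit: unit_convertor(stat, unit) for unit in all_units}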


def pprint(stats):
    table = []
    for tokenizer_name, stat in stats.items():
        columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
        for unit in all_units:
            # Use a precomputed value if the stat already contains it, otherwise derive it.
            if unit in stat:
                columns[unit] = stat[unit]
            else:
                columns[unit] = unit_convertor(stat, unit)
        table.append(columns)
    df = pd.DataFrame(table)
    logger.info(df.to_markdown(index=False))
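
# pprint() logs one markdown table row per entry in `stats`, with columns
# "tokenizer", "vocab_size" and the three units in `all_units`.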

cache = {}


def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
    """
    Tokenize one cc100 subset and collect token/char/byte counts.
    Results get their own cache (in memory and on disk) because tokenization is slow.
    :param tokenizer:
    :param lang:
    :param cache_dir:
    :return:
    """

    def _tokenize(tokenizer, dataset):
        n_tokens = 0
        n_chars = 0
        n_bytes = 0
        for item in dataset:
            text = item["text"]
            n_bytes += get_n_bytes_of_string(text)
            n_chars += len(text)
            encodings = tokenizer.encode(text)
            n_tokens += len(encodings)
        stat = {
            "vocab_size": tokenizer.vocab_size,
            "n_bytes": n_bytes,
            "n_tokens": n_tokens,
            "n_chars": n_chars,
        }
        return stat

    tokenizer_name = tokenizer.alias
    lang = lang.replace("cc100-", "")
    cache_id = f"{tokenizer_name}.{lang}"

    if cache_id in cache:
        logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]

    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"{cache_id}.json")
    if os.path.exists(cache_path):
        logger.info(f"loading {cache_id} from file cache")
        with open(cache_path, "r", encoding="utf-8") as f:
            stat = json.load(f)
        cache[cache_id] = stat
        return stat

    dataset = load_dataset("eson/cc100-samples", lang, split="train")
    stat = _tokenize(tokenizer, dataset)
    logger.info(f"saving {cache_id} to {cache_path}")
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(stat, f)
    logger.info(f"saving {cache_id} to in-memory cache")
    cache[cache_id] = stat
    return stat
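

# The references above also mention NSL (normalized sequence length): the ratio between the
# number of tokens two tokenizers produce on the same corpus, usually with llama as the
# baseline. This is only a sketch (not part of the metrics reported by main() below); it
# reuses tokenize_corpus and assumes both arguments are tokenizer objects as returned by
# vocab.load_tokener.
def compute_nsl(tokenizer, baseline_tokenizer, lang):
    stat = tokenize_corpus(tokenizer, lang)
    baseline_stat = tokenize_corpus(baseline_tokenizer, lang)
    return round(stat["n_tokens"] / baseline_stat["n_tokens"], 4)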


def main():
    from vocab import all_tokenizers
    stats = {}
    for lang in ["en", "zh-Hans"]:
        print("###" * 10 + lang)
        for tokenizer_name in ["llama", "llama2", "llama3"]:
            tokenizer = load_tokener(tokenizer_name)
            stat = tokenize_corpus(tokenizer, lang)
            # Key by tokenizer and language, so stats of the first language are not
            # overwritten before the final table is printed.
            stats[f"{tokenizer_name}.{lang}"] = stat

    pprint(stats)


if __name__ == "__main__":
    main()