fix: remove `print` debug statements
Browse files
tokenization_arcade100k.py
CHANGED
@@ -78,10 +78,6 @@ def _arcade100k(vocab_file: str):
|
|
78 |
for i, t in enumerate(SPECIAL_TOKENS_NAMES)
|
79 |
}
|
80 |
|
81 |
-
print(len(mergeable_ranks))
|
82 |
-
print(len(SPECIAL_TOKENS))
|
83 |
-
print(len(mergeable_ranks) + len(SPECIAL_TOKENS))
|
84 |
-
|
85 |
return {
|
86 |
"name": NAME,
|
87 |
"pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
|
|
|
78 |
for i, t in enumerate(SPECIAL_TOKENS_NAMES)
|
79 |
}
|
80 |
|
|
|
|
|
|
|
|
|
81 |
return {
|
82 |
"name": NAME,
|
83 |
"pat_str": r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
|