Upload 16 files
- minbpe/__init__.py +3 -0
- minbpe/__pycache__/__init__.cpython-310.pyc +0 -0
- minbpe/__pycache__/__init__.cpython-312.pyc +0 -0
- minbpe/__pycache__/base.cpython-310.pyc +0 -0
- minbpe/__pycache__/base.cpython-312.pyc +0 -0
- minbpe/__pycache__/basic.cpython-310.pyc +0 -0
- minbpe/__pycache__/basic.cpython-312.pyc +0 -0
- minbpe/__pycache__/regex.cpython-310.pyc +0 -0
- minbpe/__pycache__/regex.cpython-312.pyc +0 -0
- minbpe/base.py +169 -0
- minbpe/basic.py +75 -0
- minbpe/regex.py +166 -0
- models/basic.model +3 -0
- models/basic.vocab +0 -0
- models/regex.model +3 -0
- models/regex.vocab +512 -0
minbpe/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .base import Tokenizer
+from .basic import BasicTokenizer
+from .regex import RegexTokenizer
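The package root re-exports the three classes, so downstream code can import the tokenizers directly; a minimal sketch (not part of the upload):

from minbpe import BasicTokenizer, RegexTokenizer, Tokenizer  # re-exported by minbpe/__init__.py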
minbpe/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (296 Bytes)
minbpe/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (305 Bytes)
minbpe/__pycache__/base.cpython-310.pyc
ADDED
Binary file (5.2 kB)
minbpe/__pycache__/base.cpython-312.pyc
ADDED
Binary file (7.67 kB)
minbpe/__pycache__/basic.cpython-310.pyc
ADDED
Binary file (2.6 kB)
minbpe/__pycache__/basic.cpython-312.pyc
ADDED
Binary file (3.43 kB)
minbpe/__pycache__/regex.cpython-310.pyc
ADDED
Binary file (5.7 kB)
minbpe/__pycache__/regex.cpython-312.pyc
ADDED
Binary file (7.65 kB)
minbpe/base.py
ADDED
@@ -0,0 +1,169 @@
+"""
+Contains the base Tokenizer class and a few common helper functions.
+The base class also contains the (common) save/load functionality.
+It would be possible to be a lot more strict about the interface and
+e.g. isolating all regex/pattern parts to the RegexTokenizer, but
+some concessions are made for simplicity.
+"""
+import unicodedata
+
+# -----------------------------------------------------------------------------
+# a few helper functions useful for both BasicTokenizer and RegexTokenizer
+
+def get_stats(ids, counts=None):
+    """
+    Given a list of integers, return a dictionary of counts of consecutive pairs
+    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+    Optionally allows updating an existing dictionary of counts
+    """
+    counts = {} if counts is None else counts
+    for pair in zip(ids, ids[1:]): # iterate consecutive elements
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+
+
+def merge(ids, pair, idx):
+    """
+    In the list of integers (ids), replace all consecutive occurrences
+    of pair with the new integer token idx
+    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
+    """
+    newids = []
+    i = 0
+    while i < len(ids):
+        # if not at the very last position AND the pair matches, replace it
+        if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]:
+            newids.append(idx)
+            i += 2
+        else:
+            newids.append(ids[i])
+            i += 1
+    return newids
+
+def get_compression_ratio(text, tokenizer):
+    tokens = tokenizer.encode(text)
+    return len(tokens) / len(text) # tokens produced per character of input text
+
+# two helper functions used by save() below to pretty-print tokens
+def replace_control_characters(s: str) -> str:
+    # we don't want to print control characters
+    # which distort the output (e.g. \n or much worse)
+    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
+    # http://www.unicode.org/reports/tr44/#GC_Values_Table
+    chars = []
+    for ch in s:
+        if unicodedata.category(ch)[0] != "C":
+            chars.append(ch) # this character is ok
+        else:
+            chars.append(f"\\u{ord(ch):04x}") # escape
+    return "".join(chars)
+
+def render_token(t: bytes) -> str:
+    # pretty print a token, escaping control characters
+    s = t.decode('utf-8', errors='replace')
+    s = replace_control_characters(s)
+    return s
+
+# -----------------------------------------------------------------------------
+# the base Tokenizer class
+
+class Tokenizer:
+    """Base class for Tokenizers"""
+
+    def __init__(self):
+        # default: vocab size of 256 (all bytes), no merges, no patterns
+        self.merges = {} # (int, int) -> int
+        self.pattern = "" # str
+        self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
+        self.vocab = self._build_vocab() # int -> bytes
+
+    def train(self, text, vocab_size, verbose=False):
+        # Tokenizer can train a vocabulary of size vocab_size from text
+        raise NotImplementedError
+
+    def encode(self, text):
+        # Tokenizer can encode a string into a list of integers
+        raise NotImplementedError
+
+    def decode(self, ids):
+        # Tokenizer can decode a list of integers into a string
+        raise NotImplementedError
+
+    def _build_vocab(self):
+        # vocab is simply and deterministically derived from merges
+        vocab = {idx: bytes([idx]) for idx in range(256)}
+        for (p0, p1), idx in self.merges.items():
+            vocab[idx] = vocab[p0] + vocab[p1]
+        for special, idx in self.special_tokens.items():
+            vocab[idx] = special.encode("utf-8")
+        return vocab
+
+    def save(self, file_prefix):
+        """
+        Saves two files: file_prefix.vocab and file_prefix.model
+        This is inspired by (but not equivalent to) sentencepiece's model saving:
+        - model file is the critical one, intended for load()
+        - vocab file is just a pretty printed version for human inspection only
+        """
+        # write the model: to be used in load() later
+        model_file = file_prefix + ".model"
+        with open(model_file, 'w') as f:
+            # write the version, pattern and merges, that's all that's needed
+            f.write("minbpe v1\n")
+            f.write(f"{self.pattern}\n")
+            # write the special tokens, first the number of them, then each one
+            f.write(f"{len(self.special_tokens)}\n")
+            for special, idx in self.special_tokens.items():
+                f.write(f"{special} {idx}\n")
+            # the merges dict
+            for idx1, idx2 in self.merges:
+                f.write(f"{idx1} {idx2}\n")
+        # write the vocab: for the human to look at
+        vocab_file = file_prefix + ".vocab"
+        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            for idx, token in self.vocab.items():
+                # note: many tokens may be partial utf-8 sequences
+                # and cannot be decoded into valid strings. Here we're using
+                # errors='replace' to replace them with the replacement char �.
+                # this also means that we couldn't possibly use .vocab in load()
+                # because decoding in this way is a lossy operation!
+                s = render_token(token)
+                # find the children of this token, if any
+                if idx in inverted_merges:
+                    # if this token has children, render it nicely as a merge
+                    idx0, idx1 = inverted_merges[idx]
+                    s0 = render_token(self.vocab[idx0])
+                    s1 = render_token(self.vocab[idx1])
+                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
+                else:
+                    # otherwise this is a leaf token, just print it
+                    # (this should just be the first 256 tokens, the bytes)
+                    f.write(f"[{s}] {idx}\n")
+
+    def load(self, model_file):
+        """Inverse of save() but only for the model file"""
+        assert model_file.endswith(".model")
+        # read the model file
+        merges = {}
+        special_tokens = {}
+        idx = 256
+        with open(model_file, 'r', encoding="utf-8") as f:
+            # read the version
+            version = f.readline().strip()
+            assert version == "minbpe v1"
+            # read the pattern
+            self.pattern = f.readline().strip()
+            # read the special tokens
+            num_special = int(f.readline().strip())
+            for _ in range(num_special):
+                special, special_idx = f.readline().strip().split()
+                special_tokens[special] = int(special_idx)
+            # read the merges
+            for line in f:
+                idx1, idx2 = map(int, line.split())
+                merges[(idx1, idx2)] = idx
+                idx += 1
+        self.merges = merges
+        self.special_tokens = special_tokens
+        self.vocab = self._build_vocab()
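get_stats and merge above are the two primitives of one BPE training step: count consecutive pairs, pick the most frequent pair, and collapse it into a newly minted token id. A minimal sketch of a single step on a toy byte sequence (illustrative only, not part of the upload):

from minbpe.base import get_stats, merge

ids = list("aaabdaaabac".encode("utf-8"))  # raw bytes as integers 0..255
stats = get_stats(ids)                     # counts of consecutive pairs, e.g. (97, 97) -> 4
pair = max(stats, key=stats.get)           # most frequent pair, here (97, 97), i.e. "aa"
ids = merge(ids, pair, 256)                # replace every occurrence with the new token 256
print(pair, ids)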
minbpe/basic.py
ADDED
@@ -0,0 +1,75 @@
+"""
+Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+Algorithmically follows along the GPT tokenizer:
+https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+But:
+- Does not handle the regular expression splitting pattern.
+- Does not handle any special tokens.
+"""
+
+from .base import Tokenizer, get_stats, merge, get_compression_ratio
+
+
+class BasicTokenizer(Tokenizer):
+
+    def __init__(self):
+        super().__init__()
+
+    def train(self, text, vocab_size, verbose=False):
+        assert vocab_size >= 256
+        num_merges = vocab_size - 256
+        tokens = text.encode("utf-8") # raw bytes, kept to report the compression ratio below
+        # input text preprocessing
+        text_bytes = text.encode("utf-8") # raw bytes
+        ids = list(text_bytes) # list of integers in range 0..255
+
+        # iteratively merge the most common pairs to create new tokens
+        merges = {} # (int, int) -> int
+        vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
+        for i in range(num_merges):
+            # count up the number of times every consecutive pair appears
+            stats = get_stats(ids)
+            # find the pair with the highest count
+            pair = max(stats, key=stats.get)
+            # mint a new token: assign it the next available id
+            idx = 256 + i
+            # replace all occurrences of pair in ids with idx
+            ids = merge(ids, pair, idx)
+            # save the merge
+            merges[pair] = idx
+            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+            # prints
+            if verbose:
+                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+                print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
+
+        # save class variables
+        self.merges = merges # used in encode()
+        self.vocab = vocab # used in decode()
+
+    def decode(self, ids):
+        # given ids (list of integers), return Python string
+        text_bytes = b"".join(self.vocab[idx] for idx in ids)
+        text = text_bytes.decode("utf-8", errors="replace")
+        return text
+
+    def encode(self, text):
+        # given a string text, return the token ids
+        text_bytes = text.encode("utf-8") # raw bytes
+        ids = list(text_bytes) # list of integers in range 0..255
+        while len(ids) >= 2:
+            # find the pair with the lowest merge index
+            stats = get_stats(ids)
+            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+            # subtle: if there are no more merges available, the key will
+            # result in an inf for every single pair, and the min will be
+            # just the first pair in the list, arbitrarily
+            # we can detect this terminating case by a membership check
+            if pair not in self.merges:
+                break # nothing else can be merged anymore
+            # otherwise let's merge the best pair (lowest merge index)
+            idx = self.merges[pair]
+            ids = merge(ids, pair, idx)
+        return ids
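BasicTokenizer runs exactly this merge loop over the raw byte stream, with no chunking and no special tokens. A short end-to-end sketch of training, round-tripping and saving (illustrative only; the text and vocab_size are arbitrary):

from minbpe import BasicTokenizer

tok = BasicTokenizer()
text = "aaabdaaabac"
tok.train(text, vocab_size=259, verbose=True)  # 259 - 256 = 3 merges
ids = tok.encode(text)
assert tok.decode(ids) == text                 # byte-level BPE round-trips losslessly
tok.save("toy")                                # writes toy.model and toy.vocab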
minbpe/regex.py
ADDED
@@ -0,0 +1,166 @@
+"""
+Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+Algorithmically follows along the GPT tokenizer:
+https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+Unlike BasicTokenizer:
+- RegexTokenizer handles an optional regex splitting pattern.
+- RegexTokenizer handles optional special tokens.
+"""
+
+import regex as re
+from .base import Tokenizer, get_stats, merge
+
+
+# the main GPT text split patterns, see
+# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
+GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+
+
+class RegexTokenizer(Tokenizer):
+
+    def __init__(self, pattern=None):
+        """
+        - pattern: optional string to override the default (GPT-4 split pattern)
+        - special_tokens: str -> int dictionary of special tokens, registered later via register_special_tokens()
+          example: {'<|endoftext|>': 100257}
+        """
+        super().__init__()
+        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+        self.compiled_pattern = re.compile(self.pattern)
+        self.special_tokens = {}
+        self.inverse_special_tokens = {}
+
+    def train(self, text, vocab_size, verbose=False):
+        assert vocab_size >= 256
+        num_merges = vocab_size - 256
+        tokens = text.encode("utf-8") # raw bytes, kept to report the compression ratio below
+
+        # split the text up into text chunks
+        text_chunks = re.findall(self.compiled_pattern, text)
+
+        # input text preprocessing
+        ids = [list(ch.encode("utf-8")) for ch in text_chunks]
+
+        # iteratively merge the most common pairs to create new tokens
+        merges = {} # (int, int) -> int
+        vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
+        for i in range(num_merges):
+            # count the number of times every consecutive pair appears
+            stats = {}
+            for chunk_ids in ids:
+                # passing in stats will update it in place, adding up counts
+                get_stats(chunk_ids, stats)
+            # find the pair with the highest count
+            pair = max(stats, key=stats.get)
+            # mint a new token: assign it the next available id
+            idx = 256 + i
+            # replace all occurrences of pair in ids with idx
+            ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
+            # save the merge
+            merges[pair] = idx
+            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+            # prints
+            if verbose:
+                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+                print(f"compression ratio: {len(tokens) / sum(len(chunk_ids) for chunk_ids in ids):.2f}X")
+
+        # save class variables
+        self.merges = merges # used in encode()
+        self.vocab = vocab # used in decode()
+
+    def register_special_tokens(self, special_tokens):
+        # special_tokens is a dictionary of str -> int
+        # example: {"<|endoftext|>": 100257}
+        self.special_tokens = special_tokens
+        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+
+    def decode(self, ids):
+        # given ids (list of integers), return Python string
+        part_bytes = []
+        for idx in ids:
+            if idx in self.vocab:
+                part_bytes.append(self.vocab[idx])
+            elif idx in self.inverse_special_tokens:
+                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
+            else:
+                raise ValueError(f"invalid token id: {idx}")
+        text_bytes = b"".join(part_bytes)
+        text = text_bytes.decode("utf-8", errors="replace")
+        return text
+
+    def _encode_chunk(self, text_bytes):
+        # return the token ids
+        # let's begin. first, convert all bytes to integers in range 0..255
+        ids = list(text_bytes)
+        while len(ids) >= 2:
+            # find the pair with the lowest merge index
+            stats = get_stats(ids)
+            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+            # subtle: if there are no more merges available, the key will
+            # result in an inf for every single pair, and the min will be
+            # just the first pair in the list, arbitrarily
+            # we can detect this terminating case by a membership check
+            if pair not in self.merges:
+                break # nothing else can be merged anymore
+            # otherwise let's merge the best pair (lowest merge index)
+            idx = self.merges[pair]
+            ids = merge(ids, pair, idx)
+        return ids
+
+    def encode_ordinary(self, text):
+        """Encoding that ignores any special tokens."""
+        # split text into chunks of text by categories defined in regex pattern
+        text_chunks = re.findall(self.compiled_pattern, text)
+        # all chunks of text are encoded separately, then results are joined
+        ids = []
+        for chunk in text_chunks:
+            chunk_bytes = chunk.encode("utf-8") # raw bytes
+            chunk_ids = self._encode_chunk(chunk_bytes)
+            ids.extend(chunk_ids)
+        return ids
+
+    def encode(self, text, allowed_special="none_raise"):
+        """
+        Unlike encode_ordinary, this function handles special tokens.
+        allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
+        if none_raise, then an error is raised if any special token is encountered in text
+        this is the default tiktoken behavior right now as well
+        any other behavior is either annoying, or a major footgun
+        """
+        # decode the user desire w.r.t. handling of special tokens
+        special = None
+        if allowed_special == "all":
+            special = self.special_tokens
+        elif allowed_special == "none":
+            special = {}
+        elif allowed_special == "none_raise":
+            special = {}
+            assert all(token not in text for token in self.special_tokens)
+        elif isinstance(allowed_special, set):
+            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
+        else:
+            raise ValueError(f"allowed_special={allowed_special} not understood")
+        if not special:
+            # shortcut: if no special tokens, just use the ordinary encoding
+            return self.encode_ordinary(text)
+        # otherwise, we have to be careful with potential special tokens in text
+        # we handle special tokens by splitting the text
+        # based on the occurrence of any exact match with any of the special tokens
+        # we can use re.split for this. note that surrounding the pattern with ()
+        # makes it into a capturing group, so the special tokens will be included
+        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
+        special_chunks = re.split(special_pattern, text)
+        # now all the special characters are separated from the rest of the text
+        # all chunks of text are encoded separately, then results are joined
+        ids = []
+        for part in special_chunks:
+            if part in special:
+                # this is a special token, encode it separately as a special case
+                ids.append(special[part])
+            else:
+                # this is an ordinary sequence, encode it normally
+                ids.extend(self.encode_ordinary(part))
+        return ids
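RegexTokenizer first splits the text into chunks with the GPT-4 pattern, learns merges only within chunks, and layers special tokens on top via register_special_tokens. A short sketch of the intended flow (illustrative only; the training text, vocab_size and special-token id are arbitrary):

from minbpe import RegexTokenizer

tok = RegexTokenizer()                                               # defaults to GPT4_SPLIT_PATTERN
tok.train("hello world, hello there, hello again", vocab_size=260)   # 260 - 256 = 4 merges
tok.register_special_tokens({"<|endoftext|>": 260})                  # next free id after the merges
ids = tok.encode("hello world<|endoftext|>", allowed_special="all")
assert tok.decode(ids) == "hello world<|endoftext|>"                 # special token survives the round trip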
models/basic.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:193b5a8dc3085e8a380536c20e22960d73f9457fb1b7e41f4ccd51d2edc88f20
+size 39636
models/basic.vocab
ADDED
The diff for this file is too large to render.
models/regex.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53d1f1770d291d75edddc74c10f51140e469b2961aeb4089a6a888e5ec13b6f2
+size 2155
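Both .model files are stored as Git LFS pointers; the actual payload is the plain-text format written by Tokenizer.save() (version line, split pattern, special-token count, then one merge pair per line). Assuming the LFS objects have been pulled, reloading the trained regex model would look roughly like this (illustrative sketch, not part of the upload):

from minbpe import RegexTokenizer

tok = RegexTokenizer()
tok.load("models/regex.model")   # restores the split pattern, special tokens and merges
text = "नमस्ते दुनिया"                # the merges below suggest Devanagari/Hindi training text
assert tok.decode(tok.encode(text)) == text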
models/regex.vocab
ADDED
@@ -0,0 +1,512 @@
+[\u0000] 0
+[\u0001] 1
+[\u0002] 2
+[\u0003] 3
+[\u0004] 4
+[\u0005] 5
+[\u0006] 6
+[\u0007] 7
+[\u0008] 8
+[\u0009] 9
+[\u000a] 10
+[\u000b] 11
+[\u000c] 12
+[\u000d] 13
+[\u000e] 14
+[\u000f] 15
+[\u0010] 16
+[\u0011] 17
+[\u0012] 18
+[\u0013] 19
+[\u0014] 20
+[\u0015] 21
+[\u0016] 22
+[\u0017] 23
+[\u0018] 24
+[\u0019] 25
+[\u001a] 26
+[\u001b] 27
+[\u001c] 28
+[\u001d] 29
+[\u001e] 30
+[\u001f] 31
+[ ] 32
+[!] 33
+["] 34
+[#] 35
+[$] 36
+[%] 37
+[&] 38
+['] 39
+[(] 40
+[)] 41
+[*] 42
+[+] 43
+[,] 44
+[-] 45
+[.] 46
+[/] 47
+[0] 48
+[1] 49
+[2] 50
+[3] 51
+[4] 52
+[5] 53
+[6] 54
+[7] 55
+[8] 56
+[9] 57
+[:] 58
+[;] 59
+[<] 60
+[=] 61
+[>] 62
+[?] 63
+[@] 64
+[A] 65
+[B] 66
+[C] 67
+[D] 68
+[E] 69
+[F] 70
+[G] 71
+[H] 72
+[I] 73
+[J] 74
+[K] 75
+[L] 76
+[M] 77
+[N] 78
+[O] 79
+[P] 80
+[Q] 81
+[R] 82
+[S] 83
+[T] 84
+[U] 85
+[V] 86
+[W] 87
+[X] 88
+[Y] 89
+[Z] 90
+[[] 91
+[\] 92
+[]] 93
+[^] 94
+[_] 95
+[`] 96
+[a] 97
+[b] 98
+[c] 99
+[d] 100
+[e] 101
+[f] 102
+[g] 103
+[h] 104
+[i] 105
+[j] 106
+[k] 107
+[l] 108
+[m] 109
+[n] 110
+[o] 111
+[p] 112
+[q] 113
+[r] 114
+[s] 115
+[t] 116
+[u] 117
+[v] 118
+[w] 119
+[x] 120
+[y] 121
+[z] 122
+[{] 123
+[|] 124
+[}] 125
+[~] 126
+[\u007f] 127
+[�] 128
+[�] 129
+[�] 130
+[�] 131
+[�] 132
+[�] 133
+[�] 134
+[�] 135
+[�] 136
+[�] 137
+[�] 138
+[�] 139
+[�] 140
+[�] 141
+[�] 142
+[�] 143
+[�] 144
+[�] 145
+[�] 146
+[�] 147
+[�] 148
+[�] 149
+[�] 150
+[�] 151
+[�] 152
+[�] 153
+[�] 154
+[�] 155
+[�] 156
+[�] 157
+[�] 158
+[�] 159
+[�] 160
+[�] 161
+[�] 162
+[�] 163
+[�] 164
+[�] 165
+[�] 166
+[�] 167
+[�] 168
+[�] 169
+[�] 170
+[�] 171
+[�] 172
+[�] 173
+[�] 174
+[�] 175
+[�] 176
+[�] 177
+[�] 178
+[�] 179
+[�] 180
+[�] 181
+[�] 182
+[�] 183
+[�] 184
+[�] 185
+[�] 186
+[�] 187
+[�] 188
+[�] 189
+[�] 190
+[�] 191
+[�] 192
+[�] 193
+[�] 194
+[�] 195
+[�] 196
+[�] 197
+[�] 198
+[�] 199
+[�] 200
+[�] 201
+[�] 202
+[�] 203
+[�] 204
+[�] 205
+[�] 206
+[�] 207
+[�] 208
+[�] 209
+[�] 210
+[�] 211
+[�] 212
+[�] 213
+[�] 214
+[�] 215
+[�] 216
+[�] 217
+[�] 218
+[�] 219
+[�] 220
+[�] 221
+[�] 222
+[�] 223
+[�] 224
+[�] 225
+[�] 226
+[�] 227
+[�] 228
+[�] 229
+[�] 230
+[�] 231
+[�] 232
+[�] 233
+[�] 234
+[�] 235
+[�] 236
+[�] 237
+[�] 238
+[�] 239
+[�] 240
+[�] 241
+[�] 242
+[�] 243
+[�] 244
+[�] 245
+[�] 246
+[�] 247
+[�] 248
+[�] 249
+[�] 250
+[�] 251
+[�] 252
+[�] 253
+[�] 254
+[�] 255
+[�][�] -> [�] 256
+[ ][�] -> [ �] 257
+[�][�] -> [�] 258
+[�][�] -> [ा] 259
+[�][�] -> [े] 260
+[�][�] -> [र] 261
+[ �][�] -> [ क] 262
+[�][�] -> [्] 263
+[�][�] -> [न] 264
+[�][�] -> [ि] 265
+[�][�] -> [ो] 266
+[्][�] -> [्�] 267
+[�][�] -> [ं] 268
+[ा][�] -> [ा�] 269
+[�][�] -> [ी] 270
+[�][�] -> [ु] 271
+[�][�] -> [स] 272
+[�][�] -> [ह] 273
+[ �][�] -> [ ह] 274
+[�][�] -> [क] 275
+[�][�] -> [त] 276
+[ �][�] -> [ प] 277
+[ �][�] -> [ स] 278
+[ �][�] -> [ म] 279
+[�][�] -> [म] 280
+[�][�] -> [ै] 281
+[ि][�] -> [ि�] 282
+[ �][�] -> [ उ] 283
+[�][र] -> [�र] 284
+[ �][�] -> [ ज] 285
+[ �][�] -> [ त] 286
+[�][�] -> [।] 287
+[ �][�] -> [ न] 288
+[ �][�र] -> [ और] 289
+[े][ं] -> [ें] 290
+[ो][ं] -> [ों] 291
+[ �][�] -> [ व] 292
+[ �][�] -> [ द] 293
+[ु][�] -> [ु�] 294
+[ा][र] -> [ार] 295
+[ �][�] -> [ ब] 296
+[।][\u000a] -> [।\u000a] 297
+[्�][�] -> [्य] 298
+[े][�] -> [े�] 299
+[�][�] -> [ू] 300
+[ उ][स] -> [ उस] 301
+[्][र] -> [्र] 302
+[�][�] -> [ग] 303
+[�][�] -> [ल] 304
+[�][�] -> [�] 305
+[ �][�] -> [ ल] 306
+[ �][�] -> [ अ] 307
+[ा][,] -> [ा,] 308
+[ प][र] -> [ पर] 309
+[�][�] -> [प] 310
+[ि�][�] -> [िय] 311
+[ �][�] -> [ य] 312
+[्�][�] -> [्व] 313
+[�][�] -> [ब] 314
+[ �][�] -> [ भ] 315
+[्�][�] -> [्त] 316
+[�][�] -> [य] 317
+[ क][र] -> [ कर] 318
+[ �][�] -> [ आ] 319
+[ा][न] -> [ान] 320
+[ै][ं] -> [ैं] 321
+[�][�] -> [़] 322
+[�][�] -> [व] 323
+[ �][�] -> [ र] 324
+[�][�] -> [द] 325
+[ु][म] -> [ुम] 326
+[ा�][�] -> [ात] 327
+[ क][ह] -> [ कह] 328
+[ �][�] -> [ ग] 329
+[ �][�] -> [ च] 330
+[�][�] -> [ँ] 331
+[ उ][न] -> [ उन] 332
+[ व][ह] -> [ वह] 333
+[ �][�] -> [ थ] 334
+[्�][�] -> [्ह] 335
+[�][�] -> [ड] 336
+[क][र] -> [कर] 337
+[�][�] -> [“] 338
+[ी][ं] -> [ीं] 339
+[ा�][�] -> [ास] 340
+[�][�] -> [च] 341
+[�][�] -> [ज] 342
+[ै][,] -> [ै,] 343
+[�][�] -> [श] 344
+[्व][र] -> [्वर] 345
+[�][�] -> [”] 346
+[ उस][क] -> [ उसक] 347
+[प][न] -> [पन] 348
+[े�][�] -> [ेश] 349
+[ा�][�] -> [ाल] 350
+[ ][“] -> [ “] 351
+[े][र] -> [ेर] 352
+[ अ][पन] -> [ अपन] 353
+[ि][त] -> [ित] 354
+[ न][ह] -> [ नह] 355
+[े][,] -> [े,] 356
+[�][�र] -> [और] 357
+[ा][।\u000a] -> [ा।\u000a] 358
+[�][स] -> [�स] 359
+[ य][ह] -> [ यह] 360
+[ि][स] -> [िस] 361
+[ि][न] -> [िन] 362
+[ि][र] -> [िर] 363
+[ ह][म] -> [ हम] 364
+[”][\u000a] -> [”\u000a] 365
+[ू][ँ] -> [ूँ] 366
+[ा�][�] -> [ाँ] 367
+[ र][ह] -> [ रह] 368
+[ा�][�] -> [ाम] 369
+[ पर][म] -> [ परम] 370
+[ा�][�] -> [ाए] 371
+[ो][�] -> [ो�] 372
+[ा�][�] -> [ाह] 373
+[ �][�] -> [ ए] 374
+[�][�] -> [भ] 375
+[ो][ग] -> [ोग] 376
+[ ए][क] -> [ एक] 377
+[्�][�] -> [्म] 378
+[।][”\u000a] -> [।”\u000a] 379
+[ म][न] -> [ मन] 380
+[�][�] -> [ट] 381
+[ि][क] -> [िक] 382
+[ स][म] -> [ सम] 383
+[ �][�स] -> [ इस] 384
+[ा�][�] -> [ाय] 385
+[्�][�] -> [्थ] 386
+[ा�][�] -> [ाथ] 387
+[ु�][�] -> [ुझ] 388
+[ �][�] -> [ फ] 389
+[ �][�] -> [ ख] 390
+[ी][श] -> [ीश] 391
+[ �][�] -> [ श] 392
+[�][�] -> [ख] 393
+[े�][�] -> [ेव] 394
+[ स][ब] -> [ सब] 395
+[ कर][त] -> [ करत] 396
+[ो][,] -> [ो,] 397
+[ि�][�] -> [िए] 398
+[ु][त] -> [ुत] 399
+[ु][स] -> [ुस] 400
+[�][�] -> [ध] 401
+[र][न] -> [रन] 402
+[े�][�] -> [ेख] 403
+[ै][।\u000a] -> [ै।\u000a] 404
+[ु�][�] -> [ुष] 405
+[�][�] -> [ण] 406
+[ा�][�] -> [ाक] 407
+[�][�] -> [ठ] 408
+[ु][न] -> [ुन] 409
+[्�][�] -> [्द] 410
+[�][�] -> [ौ] 411
+[ ग][य] -> [ गय] 412
+[ प][ह] -> [ पह] 413
+[ी][,] -> [ी,] 414
+[ि�][�] -> [िल] 415
+[ि�][�] -> [िश] 416
+[ अ][न] -> [ अन] 417
+[�][�] -> [ए] 418
+[े][।\u000a] -> [े।\u000a] 419
+[ उस][न] -> [ उसन] 420
+[ै][स] -> [ैस] 421
+[ ल][ग] -> [ लग] 422
+[ैं][,] -> [ैं,] 423
+[ �][�] -> [ ध] 424
+[़][ा] -> [़ा] 425
+[्र][भ] -> [्रभ] 426
+[ कर][न] -> [ करन] 427
+[ उन][क] -> [ उनक] 428
+[ा�][�] -> [ाज] 429
+[ पर][न] -> [ परन] 430
+[�][�] -> [ृ] 431
+[ो�][�] -> [ोई] 432
+[�][�] -> [उ] 433
+[ �][�] -> [ छ] 434
+[ा�][�] -> [ाई] 435
+[ �][�] -> [ घ] 436
+[ु�][�] -> [ुछ] 437
+[ी][ह] -> [ीह] 438
+[ू][र] -> [ूर] 439
+[ु�][�] -> [ुआ] 440
+[ ब][ह] -> [ बह] 441
+[ म][स] -> [ मस] 442
+[ र][ख] -> [ रख] 443
+[्][न] -> [्न] 444
+[ि�][�] -> [िष] 445
+[ उस][स] -> [ उसस] 446
+[ �][�] -> [ ड] 447
+[ ज][ब] -> [ जब] 448
+[ु�][�] -> [ुए] 449
+[ूँ][,] -> [ूँ,] 450
+[ा][;] -> [ा;] 451
+[�][�स] -> [इस] 452
+[ �][�] -> [ इ] 453
+[ कह][त] -> [ कहत] 454
+[ा�][�] -> [ाओ] 455
+[ो][त] -> [ोत] 456
+[ा�][�] -> [ाप] 457
+[�][�] -> [ढ] 458
+[्�][�] -> [्ग] 459
+[ आ][त] -> [ आत] 460
+[े�][�] -> [ेग] 461
+[े][त] -> [ेत] 462
+[ उ][त] -> [ उत] 463
+[्�][�] -> [्ध] 464
+[ �][�] -> [ ठ] 465
+[ त][क] -> [ तक] 466
+[ू][स] -> [ूस] 467
+[ाक][र] -> [ाकर] 468
+[ो][न] -> [ोन] 469
+[ि�][�] -> [िख] 470
+[े][म] -> [ेम] 471
+[ान][त] -> [ानत] 472
+[�][स] -> [�स] 473
+[ु][र] -> [ुर] 474
+[़][े] -> [़े] 475
+[त][ब] -> [तब] 476
+[ ब][न] -> [ बन] 477
+[�][�] -> [ई] 478
+[ �][�स] -> [ ऐस] 479
+[ो][ड] -> [ोड] 480
+[प][रन] -> [परन] 481
+[ह][र] -> [हर] 482
+[े�][�] -> [ेल] 483
+[्�][�] -> [्ञ] 484
+[उ][स] -> [उस] 485
+[ा�][�] -> [ाद] 486
+[ य][द] -> [ यद] 487
+[ी][न] -> [ीन] 488
+[ ग][ए] -> [ गए] 489
+[ च][ल] -> [ चल] 490
+[ उ][ठ] -> [ उठ] 491
+[ प][त] -> [ पत] 492
+[�][�] -> [छ] 493
+[ ब][ड] -> [ बड] 494
+[्�][�] -> [्ष] 495
+[�][�] -> [अ] 496
+[प][र] -> [पर] 497
+[ उन][स] -> [ उनस] 498
+[ीं][,] -> [ीं,] 499
+[ो][।\u000a] -> [ो।\u000a] 500
+[ �][�] -> [ ओ] 501
+[ी][त] -> [ीत] 502
+[ार][ण] -> [ारण] 503
+[ प][व] -> [ पव] 504
+[ु�][�] -> [ुँ] 505
+[ों][,] -> [ों,] 506
+[ै][;] -> [ै;] 507
+[ अ][ध] -> [ अध] 508
+[ स][क] -> [ सक] 509
+[ आ][प] -> [ आप] 510
+[ज][ब] -> [जब] 511