Upload 16 files
- minbpe/__init__.py +3 -0
- minbpe/__pycache__/__init__.cpython-310.pyc +0 -0
- minbpe/__pycache__/__init__.cpython-312.pyc +0 -0
- minbpe/__pycache__/base.cpython-310.pyc +0 -0
- minbpe/__pycache__/base.cpython-312.pyc +0 -0
- minbpe/__pycache__/basic.cpython-310.pyc +0 -0
- minbpe/__pycache__/basic.cpython-312.pyc +0 -0
- minbpe/__pycache__/regex.cpython-310.pyc +0 -0
- minbpe/__pycache__/regex.cpython-312.pyc +0 -0
- minbpe/base.py +169 -0
- minbpe/basic.py +75 -0
- minbpe/regex.py +166 -0
- models/basic.model +3 -0
- models/basic.vocab +0 -0
- models/regex.model +3 -0
- models/regex.vocab +512 -0
minbpe/__init__.py
ADDED
@@ -0,0 +1,3 @@
+from .base import Tokenizer
+from .basic import BasicTokenizer
+from .regex import RegexTokenizer
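The package root re-exports the three classes, so downstream code can import the tokenizers directly; a minimal sketch (not part of the upload):

from minbpe import BasicTokenizer, RegexTokenizer, Tokenizer  # re-exported by minbpe/__init__.py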
minbpe/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (296 Bytes)
minbpe/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (305 Bytes)
minbpe/__pycache__/base.cpython-310.pyc
ADDED
Binary file (5.2 kB)
minbpe/__pycache__/base.cpython-312.pyc
ADDED
Binary file (7.67 kB)
minbpe/__pycache__/basic.cpython-310.pyc
ADDED
Binary file (2.6 kB)
minbpe/__pycache__/basic.cpython-312.pyc
ADDED
Binary file (3.43 kB)
minbpe/__pycache__/regex.cpython-310.pyc
ADDED
Binary file (5.7 kB)
minbpe/__pycache__/regex.cpython-312.pyc
ADDED
Binary file (7.65 kB)
minbpe/base.py
ADDED
@@ -0,0 +1,169 @@
+"""
+Contains the base Tokenizer class and a few common helper functions.
+The base class also contains the (common) save/load functionality.
+It would be possible to be a lot more strict about the interface and
+e.g. isolating all regex/pattern parts to the RegexTokenizer, but
+some concessions are made for simplicity.
+"""
+import unicodedata
+
+# -----------------------------------------------------------------------------
+# a few helper functions useful for both BasicTokenizer and RegexTokenizer
+
+def get_stats(ids, counts=None):
+    """
+    Given a list of integers, return a dictionary of counts of consecutive pairs
+    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+    Optionally allows updating an existing dictionary of counts
+    """
+    counts = {} if counts is None else counts
+    for pair in zip(ids, ids[1:]): # iterate consecutive elements
+        counts[pair] = counts.get(pair, 0) + 1
+    return counts
+
+
+def merge(ids, pair, idx):
+    """
+    In the list of integers (ids), replace all consecutive occurrences
+    of pair with the new integer token idx
+    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
+    """
+    newids = []
+    i = 0
+    while i < len(ids):
+        # if not at the very last position AND the pair matches, replace it
+        if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]:
+            newids.append(idx)
+            i += 2
+        else:
+            newids.append(ids[i])
+            i += 1
+    return newids
+
+def get_compression_ratio(text, tokenizer):
+    tokens = tokenizer.encode(text)
+    return len(tokens) / len(text) # tokens produced per character of input text
+
+# two helper functions used by save() below to pretty-print tokens
+def replace_control_characters(s: str) -> str:
+    # we don't want to print control characters
+    # which distort the output (e.g. \n or much worse)
+    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
+    # http://www.unicode.org/reports/tr44/#GC_Values_Table
+    chars = []
+    for ch in s:
+        if unicodedata.category(ch)[0] != "C":
+            chars.append(ch) # this character is ok
+        else:
+            chars.append(f"\\u{ord(ch):04x}") # escape
+    return "".join(chars)
+
+def render_token(t: bytes) -> str:
+    # pretty print a token, escaping control characters
+    s = t.decode('utf-8', errors='replace')
+    s = replace_control_characters(s)
+    return s
+
+# -----------------------------------------------------------------------------
+# the base Tokenizer class
+
+class Tokenizer:
+    """Base class for Tokenizers"""
+
+    def __init__(self):
+        # default: vocab size of 256 (all bytes), no merges, no patterns
+        self.merges = {} # (int, int) -> int
+        self.pattern = "" # str
+        self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
+        self.vocab = self._build_vocab() # int -> bytes
+
+    def train(self, text, vocab_size, verbose=False):
+        # Tokenizer can train a vocabulary of size vocab_size from text
+        raise NotImplementedError
+
+    def encode(self, text):
+        # Tokenizer can encode a string into a list of integers
+        raise NotImplementedError
+
+    def decode(self, ids):
+        # Tokenizer can decode a list of integers into a string
+        raise NotImplementedError
+
+    def _build_vocab(self):
+        # vocab is simply and deterministically derived from merges
+        vocab = {idx: bytes([idx]) for idx in range(256)}
+        for (p0, p1), idx in self.merges.items():
+            vocab[idx] = vocab[p0] + vocab[p1]
+        for special, idx in self.special_tokens.items():
+            vocab[idx] = special.encode("utf-8")
+        return vocab
+
+    def save(self, file_prefix):
+        """
+        Saves two files: file_prefix.vocab and file_prefix.model
+        This is inspired by (but not equivalent to) sentencepiece's model saving:
+        - model file is the critical one, intended for load()
+        - vocab file is just a pretty printed version for human inspection only
+        """
+        # write the model: to be used in load() later
+        model_file = file_prefix + ".model"
+        with open(model_file, 'w') as f:
+            # write the version, pattern and merges, that's all that's needed
+            f.write("minbpe v1\n")
+            f.write(f"{self.pattern}\n")
+            # write the special tokens, first the number of them, then each one
+            f.write(f"{len(self.special_tokens)}\n")
+            for special, idx in self.special_tokens.items():
+                f.write(f"{special} {idx}\n")
+            # the merges dict
+            for idx1, idx2 in self.merges:
+                f.write(f"{idx1} {idx2}\n")
+        # write the vocab: for the human to look at
+        vocab_file = file_prefix + ".vocab"
+        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            for idx, token in self.vocab.items():
+                # note: many tokens may be partial utf-8 sequences
+                # and cannot be decoded into valid strings. Here we're using
+                # errors='replace' to replace them with the replacement char �.
+                # this also means that we couldn't possibly use .vocab in load()
+                # because decoding in this way is a lossy operation!
+                s = render_token(token)
+                # find the children of this token, if any
+                if idx in inverted_merges:
+                    # if this token has children, render it nicely as a merge
+                    idx0, idx1 = inverted_merges[idx]
+                    s0 = render_token(self.vocab[idx0])
+                    s1 = render_token(self.vocab[idx1])
+                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
+                else:
+                    # otherwise this is a leaf token, just print it
+                    # (this should just be the first 256 tokens, the bytes)
+                    f.write(f"[{s}] {idx}\n")
+
+    def load(self, model_file):
+        """Inverse of save() but only for the model file"""
+        assert model_file.endswith(".model")
+        # read the model file
+        merges = {}
+        special_tokens = {}
+        idx = 256
+        with open(model_file, 'r', encoding="utf-8") as f:
+            # read the version
+            version = f.readline().strip()
+            assert version == "minbpe v1"
+            # read the pattern
+            self.pattern = f.readline().strip()
+            # read the special tokens
+            num_special = int(f.readline().strip())
+            for _ in range(num_special):
+                special, special_idx = f.readline().strip().split()
+                special_tokens[special] = int(special_idx)
+            # read the merges
+            for line in f:
+                idx1, idx2 = map(int, line.split())
+                merges[(idx1, idx2)] = idx
+                idx += 1
+        self.merges = merges
+        self.special_tokens = special_tokens
+        self.vocab = self._build_vocab()
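get_stats and merge above are the two primitives of one BPE training step: count consecutive pairs, pick the most frequent pair, and collapse it into a newly minted token id. A minimal sketch of a single step on a toy byte sequence (illustrative only, not part of the upload):

from minbpe.base import get_stats, merge

ids = list("aaabdaaabac".encode("utf-8"))  # raw bytes as integers 0..255
stats = get_stats(ids)                     # counts of consecutive pairs, e.g. (97, 97) -> 4
pair = max(stats, key=stats.get)           # most frequent pair, here (97, 97), i.e. "aa"
ids = merge(ids, pair, 256)                # replace every occurrence with the new token 256
print(pair, ids)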
minbpe/basic.py
ADDED
@@ -0,0 +1,75 @@
+"""
+Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+Algorithmically follows along the GPT tokenizer:
+https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+But:
+- Does not handle the regular expression splitting pattern.
+- Does not handle any special tokens.
+"""
+
+from .base import Tokenizer, get_stats, merge, get_compression_ratio
+
+
+class BasicTokenizer(Tokenizer):
+
+    def __init__(self):
+        super().__init__()
+
+    def train(self, text, vocab_size, verbose=False):
+        assert vocab_size >= 256
+        num_merges = vocab_size - 256
+        tokens = text.encode("utf-8") # raw bytes, kept to report the compression ratio below
+        # input text preprocessing
+        text_bytes = text.encode("utf-8") # raw bytes
+        ids = list(text_bytes) # list of integers in range 0..255
+
+        # iteratively merge the most common pairs to create new tokens
+        merges = {} # (int, int) -> int
+        vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
+        for i in range(num_merges):
+            # count up the number of times every consecutive pair appears
+            stats = get_stats(ids)
+            # find the pair with the highest count
+            pair = max(stats, key=stats.get)
+            # mint a new token: assign it the next available id
+            idx = 256 + i
+            # replace all occurrences of pair in ids with idx
+            ids = merge(ids, pair, idx)
+            # save the merge
+            merges[pair] = idx
+            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+            # prints
+            if verbose:
+                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+                print(f"compression ratio: {len(tokens) / len(ids):.2f}X")
+
+        # save class variables
+        self.merges = merges # used in encode()
+        self.vocab = vocab # used in decode()
+
+    def decode(self, ids):
+        # given ids (list of integers), return Python string
+        text_bytes = b"".join(self.vocab[idx] for idx in ids)
+        text = text_bytes.decode("utf-8", errors="replace")
+        return text
+
+    def encode(self, text):
+        # given a string text, return the token ids
+        text_bytes = text.encode("utf-8") # raw bytes
+        ids = list(text_bytes) # list of integers in range 0..255
+        while len(ids) >= 2:
+            # find the pair with the lowest merge index
+            stats = get_stats(ids)
+            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+            # subtle: if there are no more merges available, the key will
+            # result in an inf for every single pair, and the min will be
+            # just the first pair in the list, arbitrarily
+            # we can detect this terminating case by a membership check
+            if pair not in self.merges:
+                break # nothing else can be merged anymore
+            # otherwise let's merge the best pair (lowest merge index)
+            idx = self.merges[pair]
+            ids = merge(ids, pair, idx)
+        return ids
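BasicTokenizer runs exactly this merge loop over the raw byte stream, with no chunking and no special tokens. A short end-to-end sketch of training, round-tripping and saving (illustrative only; the text and vocab_size are arbitrary):

from minbpe import BasicTokenizer

tok = BasicTokenizer()
text = "aaabdaaabac"
tok.train(text, vocab_size=259, verbose=True)  # 259 - 256 = 3 merges
ids = tok.encode(text)
assert tok.decode(ids) == text                 # byte-level BPE round-trips losslessly
tok.save("toy")                                # writes toy.model and toy.vocab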
minbpe/regex.py
ADDED
@@ -0,0 +1,166 @@
+"""
+Minimal (byte-level) Byte Pair Encoding tokenizer.
+
+Algorithmically follows along the GPT tokenizer:
+https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+Unlike BasicTokenizer:
+- RegexTokenizer handles an optional regex splitting pattern.
+- RegexTokenizer handles optional special tokens.
+"""
+
+import regex as re
+from .base import Tokenizer, get_stats, merge
+
+
+# the main GPT text split patterns, see
+# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
+GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+
+
+class RegexTokenizer(Tokenizer):
+
+    def __init__(self, pattern=None):
+        """
+        - pattern: optional string to override the default (GPT-4 split pattern)
+        - special_tokens: str -> int dictionary of special tokens, registered later via register_special_tokens()
+          example: {'<|endoftext|>': 100257}
+        """
+        super().__init__()
+        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
+        self.compiled_pattern = re.compile(self.pattern)
+        self.special_tokens = {}
+        self.inverse_special_tokens = {}
+
+    def train(self, text, vocab_size, verbose=False):
+        assert vocab_size >= 256
+        num_merges = vocab_size - 256
+        tokens = text.encode("utf-8") # raw bytes, kept to report the compression ratio below
+
+        # split the text up into text chunks
+        text_chunks = re.findall(self.compiled_pattern, text)
+
+        # input text preprocessing
+        ids = [list(ch.encode("utf-8")) for ch in text_chunks]
+
+        # iteratively merge the most common pairs to create new tokens
+        merges = {} # (int, int) -> int
+        vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
+        for i in range(num_merges):
+            # count the number of times every consecutive pair appears
+            stats = {}
+            for chunk_ids in ids:
+                # passing in stats will update it in place, adding up counts
+                get_stats(chunk_ids, stats)
+            # find the pair with the highest count
+            pair = max(stats, key=stats.get)
+            # mint a new token: assign it the next available id
+            idx = 256 + i
+            # replace all occurrences of pair in ids with idx
+            ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
+            # save the merge
+            merges[pair] = idx
+            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
+            # prints
+            if verbose:
+                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
+                print(f"compression ratio: {len(tokens) / sum(len(chunk_ids) for chunk_ids in ids):.2f}X")
+
+        # save class variables
+        self.merges = merges # used in encode()
+        self.vocab = vocab # used in decode()
+
+    def register_special_tokens(self, special_tokens):
+        # special_tokens is a dictionary of str -> int
+        # example: {"<|endoftext|>": 100257}
+        self.special_tokens = special_tokens
+        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
+
+    def decode(self, ids):
+        # given ids (list of integers), return Python string
+        part_bytes = []
+        for idx in ids:
+            if idx in self.vocab:
+                part_bytes.append(self.vocab[idx])
+            elif idx in self.inverse_special_tokens:
+                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
+            else:
+                raise ValueError(f"invalid token id: {idx}")
+        text_bytes = b"".join(part_bytes)
+        text = text_bytes.decode("utf-8", errors="replace")
+        return text
+
+    def _encode_chunk(self, text_bytes):
+        # return the token ids
+        # let's begin. first, convert all bytes to integers in range 0..255
+        ids = list(text_bytes)
+        while len(ids) >= 2:
+            # find the pair with the lowest merge index
+            stats = get_stats(ids)
+            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
+            # subtle: if there are no more merges available, the key will
+            # result in an inf for every single pair, and the min will be
+            # just the first pair in the list, arbitrarily
+            # we can detect this terminating case by a membership check
+            if pair not in self.merges:
+                break # nothing else can be merged anymore
+            # otherwise let's merge the best pair (lowest merge index)
+            idx = self.merges[pair]
+            ids = merge(ids, pair, idx)
+        return ids
+
+    def encode_ordinary(self, text):
+        """Encoding that ignores any special tokens."""
+        # split text into chunks of text by categories defined in regex pattern
+        text_chunks = re.findall(self.compiled_pattern, text)
+        # all chunks of text are encoded separately, then results are joined
+        ids = []
+        for chunk in text_chunks:
+            chunk_bytes = chunk.encode("utf-8") # raw bytes
+            chunk_ids = self._encode_chunk(chunk_bytes)
+            ids.extend(chunk_ids)
+        return ids
+
+    def encode(self, text, allowed_special="none_raise"):
+        """
+        Unlike encode_ordinary, this function handles special tokens.
+        allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
+        if none_raise, then an error is raised if any special token is encountered in text
+        this is the default tiktoken behavior right now as well
+        any other behavior is either annoying, or a major footgun
+        """
+        # decode the user desire w.r.t. handling of special tokens
+        special = None
+        if allowed_special == "all":
+            special = self.special_tokens
+        elif allowed_special == "none":
+            special = {}
+        elif allowed_special == "none_raise":
+            special = {}
+            assert all(token not in text for token in self.special_tokens)
+        elif isinstance(allowed_special, set):
+            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
+        else:
+            raise ValueError(f"allowed_special={allowed_special} not understood")
+        if not special:
+            # shortcut: if no special tokens, just use the ordinary encoding
+            return self.encode_ordinary(text)
+        # otherwise, we have to be careful with potential special tokens in text
+        # we handle special tokens by splitting the text
+        # based on the occurrence of any exact match with any of the special tokens
+        # we can use re.split for this. note that surrounding the pattern with ()
+        # makes it into a capturing group, so the special tokens will be included
+        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
+        special_chunks = re.split(special_pattern, text)
+        # now all the special characters are separated from the rest of the text
+        # all chunks of text are encoded separately, then results are joined
+        ids = []
+        for part in special_chunks:
+            if part in special:
+                # this is a special token, encode it separately as a special case
+                ids.append(special[part])
+            else:
+                # this is an ordinary sequence, encode it normally
+                ids.extend(self.encode_ordinary(part))
+        return ids
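RegexTokenizer first splits the text into chunks with the GPT-4 pattern, learns merges only within chunks, and layers special tokens on top via register_special_tokens. A short sketch of the intended flow (illustrative only; the training text, vocab_size and special-token id are arbitrary):

from minbpe import RegexTokenizer

tok = RegexTokenizer()                                               # defaults to GPT4_SPLIT_PATTERN
tok.train("hello world, hello there, hello again", vocab_size=260)   # 260 - 256 = 4 merges
tok.register_special_tokens({"<|endoftext|>": 260})                  # next free id after the merges
ids = tok.encode("hello world<|endoftext|>", allowed_special="all")
assert tok.decode(ids) == "hello world<|endoftext|>"                 # special token survives the round trip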
models/basic.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:193b5a8dc3085e8a380536c20e22960d73f9457fb1b7e41f4ccd51d2edc88f20
+size 39636
models/basic.vocab
ADDED
The diff for this file is too large to render.
models/regex.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:53d1f1770d291d75edddc74c10f51140e469b2961aeb4089a6a888e5ec13b6f2
+size 2155
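Both .model files are stored as Git LFS pointers; the actual payload is the plain-text format written by Tokenizer.save() (version line, split pattern, special-token count, then one merge pair per line). Assuming the LFS objects have been pulled, reloading the trained regex model would look roughly like this (illustrative sketch, not part of the upload):

from minbpe import RegexTokenizer

tok = RegexTokenizer()
tok.load("models/regex.model")   # restores the split pattern, special tokens and merges
text = "नमस्ते दुनिया"                # the merges below suggest Devanagari/Hindi training text
assert tok.decode(tok.encode(text)) == text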
models/regex.vocab
ADDED
@@ -0,0 +1,512 @@
+[\u0000] 0
+[\u0001] 1
+[\u0002] 2
+[\u0003] 3
+[\u0004] 4
+[\u0005] 5
+[\u0006] 6
+[\u0007] 7
+[\u0008] 8
+[\u0009] 9
+[\u000a] 10
+[\u000b] 11
+[\u000c] 12
+[\u000d] 13
+[\u000e] 14
+[\u000f] 15
+[\u0010] 16
+[\u0011] 17
+[\u0012] 18
+[\u0013] 19
+[\u0014] 20
+[\u0015] 21
+[\u0016] 22
+[\u0017] 23
+[\u0018] 24
+[\u0019] 25
+[\u001a] 26
+[\u001b] 27
+[\u001c] 28
+[\u001d] 29
+[\u001e] 30
+[\u001f] 31
+[ ] 32
+[!] 33
+["] 34
+[#] 35
+[$] 36
+[%] 37
+[&] 38
+['] 39
+[(] 40
+[)] 41
+[*] 42
+[+] 43
+[,] 44
+[-] 45
+[.] 46
+[/] 47
+[0] 48
+[1] 49
+[2] 50
+[3] 51
+[4] 52
+[5] 53
+[6] 54
+[7] 55
+[8] 56
+[9] 57
+[:] 58
+[;] 59
+[<] 60
+[=] 61
+[>] 62
+[?] 63
+[@] 64
+[A] 65
+[B] 66
+[C] 67
+[D] 68
+[E] 69
+[F] 70
+[G] 71
+[H] 72
+[I] 73
+[J] 74
+[K] 75
+[L] 76
+[M] 77
+[N] 78
+[O] 79
+[P] 80
+[Q] 81
+[R] 82
+[S] 83
+[T] 84
+[U] 85
+[V] 86
+[W] 87
+[X] 88
+[Y] 89
+[Z] 90
+[[] 91
+[\] 92
+[]] 93
+[^] 94
+[_] 95
+[`] 96
+[a] 97
+[b] 98
+[c] 99
+[d] 100
+[e] 101
+[f] 102
+[g] 103
+[h] 104
+[i] 105
+[j] 106
+[k] 107
+[l] 108
+[m] 109
+[n] 110
+[o] 111
+[p] 112
+[q] 113
+[r] 114
+[s] 115
+[t] 116
+[u] 117
+[v] 118
+[w] 119
+[x] 120
+[y] 121
+[z] 122
+[{] 123
+[|] 124
+[}] 125
+[~] 126
+[\u007f] 127
+[�] 128
+[�] 129
+[�] 130
+[�] 131
+[�] 132
+[�] 133
+[�] 134
+[�] 135
+[�] 136
+[�] 137
+[�] 138
+[�] 139
+[�] 140
+[�] 141
+[�] 142
+[�] 143
+[�] 144
+[�] 145
+[�] 146
+[�] 147
+[�] 148
+[�] 149
+[�] 150
+[�] 151
+[�] 152
+[�] 153
+[�] 154
+[�] 155
+[�] 156
+[�] 157
+[�] 158
+[�] 159
+[�] 160
+[�] 161
+[�] 162
+[�] 163
+[�] 164
+[�] 165
+[�] 166
+[�] 167
+[�] 168
+[�] 169
+[�] 170
+[�] 171
+[�] 172
+[�] 173
+[�] 174
+[�] 175
+[�] 176
+[�] 177
+[�] 178
+[�] 179
+[�] 180
+[�] 181
+[�] 182
+[�] 183
+[�] 184
+[�] 185
+[�] 186
+[�] 187
+[�] 188
+[�] 189
+[�] 190
+[�] 191
+[�] 192
+[�] 193
+[�] 194
+[�] 195
+[�] 196
+[�] 197
+[�] 198
+[�] 199
+[�] 200
+[�] 201
+[�] 202
+[�] 203
+[�] 204
+[�] 205
+[�] 206
+[�] 207
+[�] 208
+[�] 209
+[�] 210
+[�] 211
+[�] 212
+[�] 213
+[�] 214
+[�] 215
+[�] 216
+[�] 217
+[�] 218
+[�] 219
+[�] 220
+[�] 221
+[�] 222
+[�] 223
+[�] 224
+[�] 225
+[�] 226
+[�] 227
+[�] 228
+[�] 229
+[�] 230
+[�] 231
+[�] 232
+[�] 233
+[�] 234
+[�] 235
+[�] 236
+[�] 237
+[�] 238
+[�] 239
+[�] 240
+[�] 241
+[�] 242
+[�] 243
+[�] 244
+[�] 245
+[�] 246
+[�] 247
+[�] 248
+[�] 249
+[�] 250
+[�] 251
+[�] 252
+[�] 253
+[�] 254
+[�] 255
+[�][�] -> [�] 256
+[ ][�] -> [ �] 257
+[�][�] -> [�] 258
+[�][�] -> [ा] 259
+[�][�] -> [े] 260
+[�][�] -> [र] 261
+[ �][�] -> [ क] 262
+[�][�] -> [्] 263
+[�][�] -> [न] 264
+[�][�] -> [ि] 265
+[�][�] -> [ो] 266
+[्][�] -> [्�] 267
+[�][�] -> [ं] 268
+[ा][�] -> [ा�] 269
+[�][�] -> [ी] 270
+[�][�] -> [ु] 271
+[�][�] -> [स] 272
+[�][�] -> [ह] 273
+[ �][�] -> [ ह] 274
+[�][�] -> [क] 275
+[�][�] -> [त] 276
+[ �][�] -> [ प] 277
+[ �][�] -> [ स] 278
+[ �][�] -> [ म] 279
+[�][�] -> [म] 280
+[�][�] -> [ै] 281
+[ि][�] -> [ि�] 282
+[ �][�] -> [ उ] 283
+[�][र] -> [�र] 284
+[ �][�] -> [ ज] 285
+[ �][�] -> [ त] 286
+[�][�] -> [।] 287
+[ �][�] -> [ न] 288
+[ �][�र] -> [ और] 289
+[े][ं] -> [ें] 290
+[ो][ं] -> [ों] 291
+[ �][�] -> [ व] 292
+[ �][�] -> [ द] 293
+[ु][�] -> [ु�] 294
+[ा][र] -> [ार] 295
+[ �][�] -> [ ब] 296
+[।][\u000a] -> [।\u000a] 297
+[्�][�] -> [्य] 298
+[े][�] -> [े�] 299
+[�][�] -> [ू] 300
+[ उ][स] -> [ उस] 301
+[्][र] -> [्र] 302
+[�][�] -> [ग] 303
+[�][�] -> [ल] 304
+[�][�] -> [�] 305
+[ �][�] -> [ ल] 306
+[ �][�] -> [ अ] 307
+[ा][,] -> [ा,] 308
+[ प][र] -> [ पर] 309
+[�][�] -> [प] 310
+[ि�][�] -> [िय] 311
+[ �][�] -> [ य] 312
+[्�][�] -> [्व] 313
+[�][�] -> [ब] 314
+[ �][�] -> [ भ] 315
+[्�][�] -> [्त] 316
+[�][�] -> [य] 317
+[ क][र] -> [ कर] 318
+[ �][�] -> [ आ] 319
+[ा][न] -> [ान] 320
+[ै][ं] -> [ैं] 321
+[�][�] -> [़] 322
+[�][�] -> [व] 323
+[ �][�] -> [ र] 324
+[�][�] -> [द] 325
+[ु][म] -> [ुम] 326
+[ा�][�] -> [ात] 327
+[ क][ह] -> [ कह] 328
+[ �][�] -> [ ग] 329
+[ �][�] -> [ च] 330
+[�][�] -> [ँ] 331
+[ उ][न] -> [ उन] 332
+[ व][ह] -> [ वह] 333
+[ �][�] -> [ थ] 334
+[्�][�] -> [्ह] 335
+[�][�] -> [ड] 336
+[क][र] -> [कर] 337
+[�][�] -> [“] 338
+[ी][ं] -> [ीं] 339
+[ा�][�] -> [ास] 340
+[�][�] -> [च] 341
+[�][�] -> [ज] 342
+[ै][,] -> [ै,] 343
+[�][�] -> [श] 344
+[्व][र] -> [्वर] 345
+[�][�] -> [”] 346
+[ उस][क] -> [ उसक] 347
+[प][न] -> [पन] 348
+[े�][�] -> [ेश] 349
+[ा�][�] -> [ाल] 350
+[ ][“] -> [ “] 351
+[े][र] -> [ेर] 352
+[ अ][पन] -> [ अपन] 353
+[ि][त] -> [ित] 354
+[ न][ह] -> [ नह] 355
+[े][,] -> [े,] 356
+[�][�र] -> [और] 357
+[ा][।\u000a] -> [ा।\u000a] 358
+[�][स] -> [�स] 359
+[ य][ह] -> [ यह] 360
+[ि][स] -> [िस] 361
+[ि][न] -> [िन] 362
+[ि][र] -> [िर] 363
+[ ह][म] -> [ हम] 364
+[”][\u000a] -> [”\u000a] 365
+[ू][ँ] -> [ूँ] 366
+[ा�][�] -> [ाँ] 367
+[ र][ह] -> [ रह] 368
+[ा�][�] -> [ाम] 369
+[ पर][म] -> [ परम] 370
+[ा�][�] -> [ाए] 371
+[ो][�] -> [ो�] 372
+[ा�][�] -> [ाह] 373
+[ �][�] -> [ ए] 374
+[�][�] -> [भ] 375
+[ो][ग] -> [ोग] 376
+[ ए][क] -> [ एक] 377
+[्�][�] -> [्म] 378
+[।][”\u000a] -> [।”\u000a] 379
+[ म][न] -> [ मन] 380
+[�][�] -> [ट] 381
+[ि][क] -> [िक] 382
+[ स][म] -> [ सम] 383
+[ �][�स] -> [ इस] 384
+[ा�][�] -> [ाय] 385
+[्�][�] -> [्थ] 386
+[ा�][�] -> [ाथ] 387
+[ु�][�] -> [ुझ] 388
+[ �][�] -> [ फ] 389
+[ �][�] -> [ ख] 390
+[ी][श] -> [ीश] 391
+[ �][�] -> [ श] 392
+[�][�] -> [ख] 393
+[े�][�] -> [ेव] 394
+[ स][ब] -> [ सब] 395
+[ कर][त] -> [ करत] 396
+[ो][,] -> [ो,] 397
+[ि�][�] -> [िए] 398
+[ु][त] -> [ुत] 399
+[ु][स] -> [ुस] 400
+[�][�] -> [ध] 401
+[र][न] -> [रन] 402
+[े�][�] -> [ेख] 403
+[ै][।\u000a] -> [ै।\u000a] 404
+[ु�][�] -> [ुष] 405
+[�][�] -> [ण] 406
+[ा�][�] -> [ाक] 407
+[�][�] -> [ठ] 408
+[ु][न] -> [ुन] 409
+[्�][�] -> [्द] 410
+[�][�] -> [ौ] 411
+[ ग][य] -> [ गय] 412
+[ प][ह] -> [ पह] 413
+[ी][,] -> [ी,] 414
+[ि�][�] -> [िल] 415
+[ि�][�] -> [िश] 416
+[ अ][न] -> [ अन] 417
+[�][�] -> [ए] 418
+[े][।\u000a] -> [े।\u000a] 419
+[ उस][न] -> [ उसन] 420
+[ै][स] -> [ैस] 421
+[ ल][ग] -> [ लग] 422
+[ैं][,] -> [ैं,] 423
+[ �][�] -> [ ध] 424
+[़][ा] -> [़ा] 425
+[्र][भ] -> [्रभ] 426
+[ कर][न] -> [ करन] 427
+[ उन][क] -> [ उनक] 428
+[ा�][�] -> [ाज] 429
+[ पर][न] -> [ परन] 430
+[�][�] -> [ृ] 431
+[ो�][�] -> [ोई] 432
+[�][�] -> [उ] 433
+[ �][�] -> [ छ] 434
+[ा�][�] -> [ाई] 435
+[ �][�] -> [ घ] 436
+[ु�][�] -> [ुछ] 437
+[ी][ह] -> [ीह] 438
+[ू][र] -> [ूर] 439
+[ु�][�] -> [ुआ] 440
+[ ब][ह] -> [ बह] 441
+[ म][स] -> [ मस] 442
+[ र][ख] -> [ रख] 443
+[्][न] -> [्न] 444
+[ि�][�] -> [िष] 445
+[ उस][स] -> [ उसस] 446
+[ �][�] -> [ ड] 447
+[ ज][ब] -> [ जब] 448
+[ु�][�] -> [ुए] 449
+[ूँ][,] -> [ूँ,] 450
+[ा][;] -> [ा;] 451
+[�][�स] -> [इस] 452
+[ �][�] -> [ इ] 453
+[ कह][त] -> [ कहत] 454
+[ा�][�] -> [ाओ] 455
+[ो][त] -> [ोत] 456
+[ा�][�] -> [ाप] 457
+[�][�] -> [ढ] 458
+[्�][�] -> [्ग] 459
+[ आ][त] -> [ आत] 460
+[े�][�] -> [ेग] 461
+[े][त] -> [ेत] 462
+[ उ][त] -> [ उत] 463
+[्�][�] -> [्ध] 464
+[ �][�] -> [ ठ] 465
+[ त][क] -> [ तक] 466
+[ू][स] -> [ूस] 467
+[ाक][र] -> [ाकर] 468
+[ो][न] -> [ोन] 469
+[ि�][�] -> [िख] 470
+[े][म] -> [ेम] 471
+[ान][त] -> [ानत] 472
+[�][स] -> [�स] 473
+[ु][र] -> [ुर] 474
+[़][े] -> [़े] 475
+[त][ब] -> [तब] 476
+[ ब][न] -> [ बन] 477
+[�][�] -> [ई] 478
+[ �][�स] -> [ ऐस] 479
+[ो][ड] -> [ोड] 480
+[प][रन] -> [परन] 481
+[ह][र] -> [हर] 482
+[े�][�] -> [ेल] 483
+[्�][�] -> [्ञ] 484
+[उ][स] -> [उस] 485
+[ा�][�] -> [ाद] 486
+[ य][द] -> [ यद] 487
+[ी][न] -> [ीन] 488
+[ ग][ए] -> [ गए] 489
+[ च][ल] -> [ चल] 490
+[ उ][ठ] -> [ उठ] 491
+[ प][त] -> [ पत] 492
+[�][�] -> [छ] 493
+[ ब][ड] -> [ बड] 494
+[्�][�] -> [्ष] 495
+[�][�] -> [अ] 496
+[प][र] -> [पर] 497
+[ उन][स] -> [ उनस] 498
+[ीं][,] -> [ीं,] 499
+[ो][।\u000a] -> [ो।\u000a] 500
+[ �][�] -> [ ओ] 501
+[ी][त] -> [ीत] 502
+[ार][ण] -> [ारण] 503
+[ प][व] -> [ पव] 504
+[ु�][�] -> [ुँ] 505
+[ों][,] -> [ों,] 506
+[ै][;] -> [ै;] 507
+[ अ][ध] -> [ अध] 508
+[ स][क] -> [ सक] 509
+[ आ][प] -> [ आप] 510
+[ज][ब] -> [जब] 511