nishantb06 committed on
Commit
a7907a7
·
verified ·
1 Parent(s): 9781c4b

Upload 16 files

Browse files
minbpe/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .base import Tokenizer
2
+ from .basic import BasicTokenizer
3
+ from .regex import RegexTokenizer
minbpe/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (296 Bytes). View file
 
minbpe/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (305 Bytes). View file
 
minbpe/__pycache__/base.cpython-310.pyc ADDED
Binary file (5.2 kB). View file
 
minbpe/__pycache__/base.cpython-312.pyc ADDED
Binary file (7.67 kB). View file
 
minbpe/__pycache__/basic.cpython-310.pyc ADDED
Binary file (2.6 kB). View file
 
minbpe/__pycache__/basic.cpython-312.pyc ADDED
Binary file (3.43 kB). View file
 
minbpe/__pycache__/regex.cpython-310.pyc ADDED
Binary file (5.7 kB). View file
 
minbpe/__pycache__/regex.cpython-312.pyc ADDED
Binary file (7.65 kB). View file
 
minbpe/base.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains the base Tokenizer class and a few common helper functions.
3
+ The base class also contains the (common) save/load functionality.
4
+ It would be possible to be a lot more strict about the interface and
5
+ e.g. isolating all regex/pattern parts to the RegexTokenizer, but
6
+ some concessions are made for simplicity.
7
+ """
8
+ import unicodedata
9
+
10
+ # -----------------------------------------------------------------------------
11
+ # a few helper functions useful for both BasicTokenizer and RegexTokenizer
12
+
13
def get_stats(ids, counts=None):
    """
    Count consecutive pairs in a list of integer token ids.
    Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
    If an existing counts dict is passed in, it is updated in place
    (and also returned), which lets callers accumulate counts across
    several id lists.
    """
    pair_counts = {} if counts is None else counts
    # walk adjacent positions; range is empty for lists of length < 2
    for i in range(len(ids) - 1):
        key = (ids[i], ids[i + 1])
        pair_counts[key] = pair_counts.get(key, 0) + 1
    return pair_counts
23
+
24
+
25
def merge(ids, pair, idx):
    """
    Replace every consecutive occurrence of `pair` in `ids` with the single
    new token `idx`, scanning left to right (so overlapping occurrences are
    consumed greedily).
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    first, second = pair
    merged = []
    i = 0
    n = len(ids)
    while i < n:
        # only a match when there is room for a pair starting at i
        if i + 1 < n and ids[i] == first and ids[i + 1] == second:
            merged.append(idx)
            i += 2  # consume both elements of the pair
        else:
            merged.append(ids[i])
            i += 1
    return merged
42
+
43
def get_compression_ratio(text, tokenizer):
    """Return tokens-per-character for `text` under `tokenizer` (lower is better).

    Note: despite the name, this is len(tokens)/len(text), i.e. the inverse
    of the usual "N.NNX compression" figure.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids) / len(text)
46
+
47
+ # first two helper functions...
48
def replace_control_characters(s: str) -> str:
    # Control characters (Unicode General_Category "C*") distort printed
    # output (e.g. \n or much worse), so escape them as \uXXXX instead.
    # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
    # http://www.unicode.org/reports/tr44/#GC_Values_Table
    parts = []
    for ch in s:
        if unicodedata.category(ch).startswith("C"):
            parts.append(f"\\u{ord(ch):04x}")  # escape control character
        else:
            parts.append(ch)  # printable character, keep as-is
    return "".join(parts)

def render_token(t: bytes) -> str:
    # Pretty-print a token: decode lossily (partial UTF-8 sequences become
    # the replacement char), then escape any control characters.
    decoded = t.decode('utf-8', errors='replace')
    return replace_control_characters(decoded)
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # the base Tokenizer class
69
+
70
class Tokenizer:
    """Base class for Tokenizers.

    Holds the state shared by all tokenizers (merges, split pattern,
    special tokens, vocab) and implements the common save()/load()
    model-file format. Subclasses implement train/encode/decode.
    """

    def __init__(self):
        # default: vocab size of 256 (all bytes), no merges, no patterns
        self.merges = {} # (int, int) -> int
        self.pattern = "" # str
        self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
        self.vocab = self._build_vocab() # int -> bytes

    def train(self, text, vocab_size, verbose=False):
        """Train a vocabulary of size vocab_size from text. Subclass responsibility."""
        # Tokenizer can train a vocabulary of size vocab_size from text
        raise NotImplementedError

    def encode(self, text):
        """Encode a string into a list of integer token ids. Subclass responsibility."""
        # Tokenizer can encode a string into a list of integers
        raise NotImplementedError

    def decode(self, ids):
        """Decode a list of integer token ids into a string. Subclass responsibility."""
        # Tokenizer can decode a list of integers into a string
        raise NotImplementedError

    def _build_vocab(self):
        """Derive the int -> bytes vocab deterministically from merges and special tokens."""
        # vocab is simply and deterministically derived from merges
        vocab = {idx: bytes([idx]) for idx in range(256)}
        # NOTE: this relies on self.merges iterating in insertion order with
        # each pair's operands already present (ids assigned sequentially),
        # which is how train()/load() construct it.
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]
        for special, idx in self.special_tokens.items():
            vocab[idx] = special.encode("utf-8")
        return vocab

    def save(self, file_prefix):
        """
        Saves two files: file_prefix.vocab and file_prefix.model
        This is inspired (but not equivalent to!) sentencepiece's model saving:
        - model file is the critical one, intended for load()
        - vocab file is just a pretty printed version for human inspection only
        """
        # write the model: to be used in load() later
        model_file = file_prefix + ".model"
        with open(model_file, 'w') as f:
            # write the version, pattern and merges, that's all that's needed
            f.write("minbpe v1\n")
            f.write(f"{self.pattern}\n")
            # write the special tokens, first the number of them, then each one
            # NOTE(review): a special token containing whitespace would corrupt
            # this space-separated format — confirm tokens are whitespace-free
            f.write(f"{len(self.special_tokens)}\n")
            for special, idx in self.special_tokens.items():
                f.write(f"{special} {idx}\n")
            # the merges dict: only the pairs are written; load() re-assigns
            # ids sequentially from 256, so insertion order must match id order
            for idx1, idx2 in self.merges:
                f.write(f"{idx1} {idx2}\n")
        # write the vocab: for the human to look at
        vocab_file = file_prefix + ".vocab"
        inverted_merges = {idx: pair for pair, idx in self.merges.items()}
        with open(vocab_file, "w", encoding="utf-8") as f:
            for idx, token in self.vocab.items():
                # note: many tokens may be partial utf-8 sequences
                # and cannot be decoded into valid strings. Here we're using
                # errors='replace' to replace them with the replacement char �.
                # this also means that we couldn't possibly use .vocab in load()
                # because decoding in this way is a lossy operation!
                s = render_token(token)
                # find the children of this token, if any
                if idx in inverted_merges:
                    # if this token has children, render it nicely as a merge
                    idx0, idx1 = inverted_merges[idx]
                    s0 = render_token(self.vocab[idx0])
                    s1 = render_token(self.vocab[idx1])
                    f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
                else:
                    # otherwise this is leaf token, just print it
                    # (this should just be the first 256 tokens, the bytes)
                    f.write(f"[{s}] {idx}\n")

    def load(self, model_file):
        """Inverse of save() but only for the model file"""
        assert model_file.endswith(".model")
        # read the model file
        merges = {}
        special_tokens = {}
        idx = 256  # merge ids are re-assigned sequentially, mirroring save()
        with open(model_file, 'r', encoding="utf-8") as f:
            # read the version
            version = f.readline().strip()
            assert version == "minbpe v1"
            # read the pattern
            self.pattern = f.readline().strip()
            # read the special tokens
            num_special = int(f.readline().strip())
            for _ in range(num_special):
                special, special_idx = f.readline().strip().split()
                special_tokens[special] = int(special_idx)
            # read the merges: one "<id1> <id2>" pair per remaining line
            for line in f:
                idx1, idx2 = map(int, line.split())
                merges[(idx1, idx2)] = idx
                idx += 1
        self.merges = merges
        self.special_tokens = special_tokens
        self.vocab = self._build_vocab()
minbpe/basic.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
3
+
4
+ Algorithmically follows along the GPT tokenizer:
5
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
6
+
7
+ But:
8
+ - Does not handle the regular expression splitting pattern.
9
+ - Does not handle any special tokens.
10
+ """
11
+
12
+ from .base import Tokenizer, get_stats, merge, get_compression_ratio
13
+
14
+
15
class BasicTokenizer(Tokenizer):
    """Simplest byte-level BPE tokenizer: no regex splitting, no special tokens."""

    def __init__(self):
        super().__init__()

    def train(self, text, vocab_size, verbose=False):
        """Learn up to vocab_size - 256 merges from text, most frequent pair first.

        Fix vs. original: the text was UTF-8-encoded twice (once into `tokens`,
        once into `text_bytes`); it is now encoded once. Also guards against
        running out of pairs on very short inputs, where max() over empty
        stats would raise.
        """
        assert vocab_size >= 256
        num_merges = vocab_size - 256
        # input text preprocessing
        text_bytes = text.encode("utf-8") # raw bytes
        ids = list(text_bytes) # list of integers in range 0..255

        # iteratively merge the most common pairs to create new tokens
        merges = {} # (int, int) -> int
        vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
        for i in range(num_merges):
            # count up the number of times every consecutive pair appears
            stats = get_stats(ids)
            if not stats:
                break # fewer than 2 tokens remain; nothing left to merge
            # find the pair with the highest count
            pair = max(stats, key=stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            # replace all occurrences of pair in ids with idx
            ids = merge(ids, pair, idx)
            # save the merge
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            # prints
            if verbose:
                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
                print(f"compression ratio: {len(text_bytes) / len(ids):.2f}X")

        # save class variables
        self.merges = merges # used in encode()
        self.vocab = vocab # used in decode()

    def decode(self, ids):
        """Given ids (list of integers), return the corresponding Python string."""
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        # partial UTF-8 sequences become the replacement char rather than raising
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def encode(self, text):
        """Given a string, return the list of token ids after applying all merges."""
        text_bytes = text.encode("utf-8") # raw bytes
        ids = list(text_bytes) # list of integers in range 0..255
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # subtle: if there are no more merges available, the key will
            # result in an inf for every single pair, and the min will be
            # just the first pair in the list, arbitrarily
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids
minbpe/regex.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
3
+
4
+ Algorithmically follows along the GPT tokenizer:
5
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
6
+
7
+ Unlike BasicTokenizer:
8
+ - RegexTokenizer handles an optional regex splitting pattern.
9
+ - RegexTokenizer handles optional special tokens.
10
+ """
11
+
12
+ import regex as re
13
+ from .base import Tokenizer, get_stats, merge
14
+
15
+
16
# the main GPT text split patterns, see
# https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
# GPT-2: contractions, letter runs, digit runs, punctuation runs, whitespace
GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
# GPT-4: case-insensitive contractions, digits capped at 3, possessive
# quantifiers (?+ / ++) — these require the third-party `regex` module, not `re`
GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
20
+
21
+
22
class RegexTokenizer(Tokenizer):
    """Byte-level BPE tokenizer with a regex split pattern and optional special tokens."""

    def __init__(self, pattern=None):
        """
        - pattern: optional string to override the default (GPT-4 split pattern)
        - special_tokens: str -> int dictionary of special tokens
          example: {'<|endoftext|>': 100257}
        """
        super().__init__()
        self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
        self.compiled_pattern = re.compile(self.pattern)
        self.special_tokens = {}
        self.inverse_special_tokens = {}

    def train(self, text, vocab_size, verbose=False):
        """Learn up to vocab_size - 256 merges; merges never cross regex-chunk boundaries.

        Fix vs. original: the verbose compression-ratio print divided by
        len(ids), which here is the number of regex chunks (a constant), not
        the number of tokens. It now divides by the total token count. Also
        guards against max() over empty stats on degenerate inputs.
        """
        assert vocab_size >= 256
        num_merges = vocab_size - 256
        text_bytes = text.encode("utf-8") # raw bytes, for the ratio print only

        # split the text up into text chunks
        text_chunks = re.findall(self.compiled_pattern, text)

        # input text preprocessing: ids is a list of per-chunk id lists
        ids = [list(ch.encode("utf-8")) for ch in text_chunks]

        # iteratively merge the most common pairs to create new tokens
        merges = {} # (int, int) -> int
        vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
        for i in range(num_merges):
            # count the number of times every consecutive pair appears
            stats = {}
            for chunk_ids in ids:
                # passing in stats will update it in place, adding up counts
                get_stats(chunk_ids, stats)
            if not stats:
                break # no chunk has 2+ tokens left; nothing to merge
            # find the pair with the highest count
            pair = max(stats, key=stats.get)
            # mint a new token: assign it the next available id
            idx = 256 + i
            # replace all occurrences of pair in ids with idx
            ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
            # save the merge
            merges[pair] = idx
            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
            # prints
            if verbose:
                # bug fix: divide by the total token count, not len(ids)
                # (the number of chunks, which never changes during training)
                num_tokens = sum(len(chunk_ids) for chunk_ids in ids)
                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
                print(f"compression ratio: {len(text_bytes) / num_tokens:.2f}X")

        # save class variables
        self.merges = merges # used in encode()
        self.vocab = vocab # used in decode()

    def register_special_tokens(self, special_tokens):
        """Register a str -> int dict of special tokens, e.g. {"<|endoftext|>": 100257}."""
        self.special_tokens = special_tokens
        self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}

    def decode(self, ids):
        """Given ids (list of integers), return the corresponding Python string.

        Raises ValueError on an id that is neither in the vocab nor a
        registered special token.
        """
        part_bytes = []
        for idx in ids:
            if idx in self.vocab:
                part_bytes.append(self.vocab[idx])
            elif idx in self.inverse_special_tokens:
                part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
            else:
                raise ValueError(f"invalid token id: {idx}")
        text_bytes = b"".join(part_bytes)
        text = text_bytes.decode("utf-8", errors="replace")
        return text

    def _encode_chunk(self, text_bytes):
        """Apply merges to the raw bytes of a single regex chunk; return token ids."""
        # convert all bytes to integers in range 0..255
        ids = list(text_bytes)
        while len(ids) >= 2:
            # find the pair with the lowest merge index
            stats = get_stats(ids)
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            # subtle: if there are no more merges available, the key will
            # result in an inf for every single pair, and the min will be
            # just the first pair in the list, arbitrarily
            # we can detect this terminating case by a membership check
            if pair not in self.merges:
                break # nothing else can be merged anymore
            # otherwise let's merge the best pair (lowest merge index)
            idx = self.merges[pair]
            ids = merge(ids, pair, idx)
        return ids

    def encode_ordinary(self, text):
        """Encoding that ignores any special tokens."""
        # split text into chunks of text by categories defined in regex pattern
        text_chunks = re.findall(self.compiled_pattern, text)
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for chunk in text_chunks:
            chunk_bytes = chunk.encode("utf-8") # raw bytes
            ids.extend(self._encode_chunk(chunk_bytes))
        return ids

    def encode(self, text, allowed_special="none_raise"):
        """
        Unlike encode_ordinary, this function handles special tokens.
        allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
        if none_raise, then an error is raised if any special token is encountered in text
        this is the default tiktoken behavior right now as well
        any other behavior is either annoying, or a major footgun
        """
        # decode the user desire w.r.t. handling of special tokens
        special = None
        if allowed_special == "all":
            special = self.special_tokens
        elif allowed_special == "none":
            special = {}
        elif allowed_special == "none_raise":
            special = {}
            assert all(token not in text for token in self.special_tokens)
        elif isinstance(allowed_special, set):
            special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
        else:
            raise ValueError(f"allowed_special={allowed_special} not understood")
        if not special:
            # shortcut: if no special tokens, just use the ordinary encoding
            return self.encode_ordinary(text)
        # otherwise, we have to be careful with potential special tokens in text
        # we handle special tokens by splitting the text
        # based on the occurrence of any exact match with any of the special tokens
        # we can use re.split for this. note that surrounding the pattern with ()
        # makes it into a capturing group, so the special tokens will be included
        special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
        special_chunks = re.split(special_pattern, text)
        # now all the special characters are separated from the rest of the text
        # all chunks of text are encoded separately, then results are joined
        ids = []
        for part in special_chunks:
            if part in special:
                # this is a special token, encode it separately as a special case
                ids.append(special[part])
            else:
                # this is an ordinary sequence, encode it normally
                ids.extend(self.encode_ordinary(part))
        return ids
models/basic.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:193b5a8dc3085e8a380536c20e22960d73f9457fb1b7e41f4ccd51d2edc88f20
3
+ size 39636
models/basic.vocab ADDED
The diff for this file is too large to render. See raw diff
 
models/regex.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53d1f1770d291d75edddc74c10f51140e469b2961aeb4089a6a888e5ec13b6f2
3
+ size 2155
models/regex.vocab ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [\u0000] 0
2
+ [\u0001] 1
3
+ [\u0002] 2
4
+ [\u0003] 3
5
+ [\u0004] 4
6
+ [\u0005] 5
7
+ [\u0006] 6
8
+ [\u0007] 7
9
+ [\u0008] 8
10
+ [\u0009] 9
11
+ [\u000a] 10
12
+ [\u000b] 11
13
+ [\u000c] 12
14
+ [\u000d] 13
15
+ [\u000e] 14
16
+ [\u000f] 15
17
+ [\u0010] 16
18
+ [\u0011] 17
19
+ [\u0012] 18
20
+ [\u0013] 19
21
+ [\u0014] 20
22
+ [\u0015] 21
23
+ [\u0016] 22
24
+ [\u0017] 23
25
+ [\u0018] 24
26
+ [\u0019] 25
27
+ [\u001a] 26
28
+ [\u001b] 27
29
+ [\u001c] 28
30
+ [\u001d] 29
31
+ [\u001e] 30
32
+ [\u001f] 31
33
+ [ ] 32
34
+ [!] 33
35
+ ["] 34
36
+ [#] 35
37
+ [$] 36
38
+ [%] 37
39
+ [&] 38
40
+ ['] 39
41
+ [(] 40
42
+ [)] 41
43
+ [*] 42
44
+ [+] 43
45
+ [,] 44
46
+ [-] 45
47
+ [.] 46
48
+ [/] 47
49
+ [0] 48
50
+ [1] 49
51
+ [2] 50
52
+ [3] 51
53
+ [4] 52
54
+ [5] 53
55
+ [6] 54
56
+ [7] 55
57
+ [8] 56
58
+ [9] 57
59
+ [:] 58
60
+ [;] 59
61
+ [<] 60
62
+ [=] 61
63
+ [>] 62
64
+ [?] 63
65
+ [@] 64
66
+ [A] 65
67
+ [B] 66
68
+ [C] 67
69
+ [D] 68
70
+ [E] 69
71
+ [F] 70
72
+ [G] 71
73
+ [H] 72
74
+ [I] 73
75
+ [J] 74
76
+ [K] 75
77
+ [L] 76
78
+ [M] 77
79
+ [N] 78
80
+ [O] 79
81
+ [P] 80
82
+ [Q] 81
83
+ [R] 82
84
+ [S] 83
85
+ [T] 84
86
+ [U] 85
87
+ [V] 86
88
+ [W] 87
89
+ [X] 88
90
+ [Y] 89
91
+ [Z] 90
92
+ [[] 91
93
+ [\] 92
94
+ []] 93
95
+ [^] 94
96
+ [_] 95
97
+ [`] 96
98
+ [a] 97
99
+ [b] 98
100
+ [c] 99
101
+ [d] 100
102
+ [e] 101
103
+ [f] 102
104
+ [g] 103
105
+ [h] 104
106
+ [i] 105
107
+ [j] 106
108
+ [k] 107
109
+ [l] 108
110
+ [m] 109
111
+ [n] 110
112
+ [o] 111
113
+ [p] 112
114
+ [q] 113
115
+ [r] 114
116
+ [s] 115
117
+ [t] 116
118
+ [u] 117
119
+ [v] 118
120
+ [w] 119
121
+ [x] 120
122
+ [y] 121
123
+ [z] 122
124
+ [{] 123
125
+ [|] 124
126
+ [}] 125
127
+ [~] 126
128
+ [\u007f] 127
129
+ [�] 128
130
+ [�] 129
131
+ [�] 130
132
+ [�] 131
133
+ [�] 132
134
+ [�] 133
135
+ [�] 134
136
+ [�] 135
137
+ [�] 136
138
+ [�] 137
139
+ [�] 138
140
+ [�] 139
141
+ [�] 140
142
+ [�] 141
143
+ [�] 142
144
+ [�] 143
145
+ [�] 144
146
+ [�] 145
147
+ [�] 146
148
+ [�] 147
149
+ [�] 148
150
+ [�] 149
151
+ [�] 150
152
+ [�] 151
153
+ [�] 152
154
+ [�] 153
155
+ [�] 154
156
+ [�] 155
157
+ [�] 156
158
+ [�] 157
159
+ [�] 158
160
+ [�] 159
161
+ [�] 160
162
+ [�] 161
163
+ [�] 162
164
+ [�] 163
165
+ [�] 164
166
+ [�] 165
167
+ [�] 166
168
+ [�] 167
169
+ [�] 168
170
+ [�] 169
171
+ [�] 170
172
+ [�] 171
173
+ [�] 172
174
+ [�] 173
175
+ [�] 174
176
+ [�] 175
177
+ [�] 176
178
+ [�] 177
179
+ [�] 178
180
+ [�] 179
181
+ [�] 180
182
+ [�] 181
183
+ [�] 182
184
+ [�] 183
185
+ [�] 184
186
+ [�] 185
187
+ [�] 186
188
+ [�] 187
189
+ [�] 188
190
+ [�] 189
191
+ [�] 190
192
+ [�] 191
193
+ [�] 192
194
+ [�] 193
195
+ [�] 194
196
+ [�] 195
197
+ [�] 196
198
+ [�] 197
199
+ [�] 198
200
+ [�] 199
201
+ [�] 200
202
+ [�] 201
203
+ [�] 202
204
+ [�] 203
205
+ [�] 204
206
+ [�] 205
207
+ [�] 206
208
+ [�] 207
209
+ [�] 208
210
+ [�] 209
211
+ [�] 210
212
+ [�] 211
213
+ [�] 212
214
+ [�] 213
215
+ [�] 214
216
+ [�] 215
217
+ [�] 216
218
+ [�] 217
219
+ [�] 218
220
+ [�] 219
221
+ [�] 220
222
+ [�] 221
223
+ [�] 222
224
+ [�] 223
225
+ [�] 224
226
+ [�] 225
227
+ [�] 226
228
+ [�] 227
229
+ [�] 228
230
+ [�] 229
231
+ [�] 230
232
+ [�] 231
233
+ [�] 232
234
+ [�] 233
235
+ [�] 234
236
+ [�] 235
237
+ [�] 236
238
+ [�] 237
239
+ [�] 238
240
+ [�] 239
241
+ [�] 240
242
+ [�] 241
243
+ [�] 242
244
+ [�] 243
245
+ [�] 244
246
+ [�] 245
247
+ [�] 246
248
+ [�] 247
249
+ [�] 248
250
+ [�] 249
251
+ [�] 250
252
+ [�] 251
253
+ [�] 252
254
+ [�] 253
255
+ [�] 254
256
+ [�] 255
257
+ [�][�] -> [�] 256
258
+ [ ][�] -> [ �] 257
259
+ [�][�] -> [�] 258
260
+ [�][�] -> [ा] 259
261
+ [�][�] -> [े] 260
262
+ [�][�] -> [र] 261
263
+ [ �][�] -> [ क] 262
264
+ [�][�] -> [्] 263
265
+ [�][�] -> [न] 264
266
+ [�][�] -> [ि] 265
267
+ [�][�] -> [ो] 266
268
+ [्][�] -> [्�] 267
269
+ [�][�] -> [ं] 268
270
+ [ा][�] -> [ा�] 269
271
+ [�][�] -> [ी] 270
272
+ [�][�] -> [ु] 271
273
+ [�][�] -> [स] 272
274
+ [�][�] -> [ह] 273
275
+ [ �][�] -> [ ह] 274
276
+ [�][�] -> [क] 275
277
+ [�][�] -> [त] 276
278
+ [ �][�] -> [ प] 277
279
+ [ �][�] -> [ स] 278
280
+ [ �][�] -> [ म] 279
281
+ [�][�] -> [म] 280
282
+ [�][�] -> [ै] 281
283
+ [ि][�] -> [ि�] 282
284
+ [ �][�] -> [ उ] 283
285
+ [�][र] -> [�र] 284
286
+ [ �][�] -> [ ज] 285
287
+ [ �][�] -> [ त] 286
288
+ [�][�] -> [।] 287
289
+ [ �][�] -> [ न] 288
290
+ [ �][�र] -> [ और] 289
291
+ [े][ं] -> [ें] 290
292
+ [ो][ं] -> [ों] 291
293
+ [ �][�] -> [ व] 292
294
+ [ �][�] -> [ द] 293
295
+ [ु][�] -> [ु�] 294
296
+ [ा][र] -> [ार] 295
297
+ [ �][�] -> [ ब] 296
298
+ [।][\u000a] -> [।\u000a] 297
299
+ [्�][�] -> [्य] 298
300
+ [े][�] -> [े�] 299
301
+ [�][�] -> [ू] 300
302
+ [ उ][स] -> [ उस] 301
303
+ [्][र] -> [्र] 302
304
+ [�][�] -> [ग] 303
305
+ [�][�] -> [ल] 304
306
+ [�][�] -> [�] 305
307
+ [ �][�] -> [ ल] 306
308
+ [ �][�] -> [ अ] 307
309
+ [ा][,] -> [ा,] 308
310
+ [ प][र] -> [ पर] 309
311
+ [�][�] -> [प] 310
312
+ [ि�][�] -> [िय] 311
313
+ [ �][�] -> [ य] 312
314
+ [्�][�] -> [्व] 313
315
+ [�][�] -> [ब] 314
316
+ [ �][�] -> [ भ] 315
317
+ [्�][�] -> [्त] 316
318
+ [�][�] -> [य] 317
319
+ [ क][र] -> [ कर] 318
320
+ [ �][�] -> [ आ] 319
321
+ [ा][न] -> [ान] 320
322
+ [ै][ं] -> [ैं] 321
323
+ [�][�] -> [़] 322
324
+ [�][�] -> [व] 323
325
+ [ �][�] -> [ र] 324
326
+ [�][�] -> [द] 325
327
+ [ु][म] -> [ुम] 326
328
+ [ा�][�] -> [ात] 327
329
+ [ क][ह] -> [ कह] 328
330
+ [ �][�] -> [ ग] 329
331
+ [ �][�] -> [ च] 330
332
+ [�][�] -> [ँ] 331
333
+ [ उ][न] -> [ उन] 332
334
+ [ व][ह] -> [ वह] 333
335
+ [ �][�] -> [ थ] 334
336
+ [्�][�] -> [्ह] 335
337
+ [�][�] -> [ड] 336
338
+ [क][र] -> [कर] 337
339
+ [�][�] -> [“] 338
340
+ [ी][ं] -> [ीं] 339
341
+ [ा�][�] -> [ास] 340
342
+ [�][�] -> [च] 341
343
+ [�][�] -> [ज] 342
344
+ [ै][,] -> [ै,] 343
345
+ [�][�] -> [श] 344
346
+ [्व][र] -> [्वर] 345
347
+ [�][�] -> [”] 346
348
+ [ उस][क] -> [ उसक] 347
349
+ [प][न] -> [पन] 348
350
+ [े�][�] -> [ेश] 349
351
+ [ा�][�] -> [ाल] 350
352
+ [ ][“] -> [ “] 351
353
+ [े][र] -> [ेर] 352
354
+ [ अ][पन] -> [ अपन] 353
355
+ [ि][त] -> [ित] 354
356
+ [ न][ह] -> [ नह] 355
357
+ [े][,] -> [े,] 356
358
+ [�][�र] -> [और] 357
359
+ [ा][।\u000a] -> [ा।\u000a] 358
360
+ [�][स] -> [�स] 359
361
+ [ य][ह] -> [ यह] 360
362
+ [ि][स] -> [िस] 361
363
+ [ि][न] -> [िन] 362
364
+ [ि][र] -> [िर] 363
365
+ [ ह][म] -> [ हम] 364
366
+ [”][\u000a] -> [”\u000a] 365
367
+ [ू][ँ] -> [ूँ] 366
368
+ [ा�][�] -> [ाँ] 367
369
+ [ र][ह] -> [ रह] 368
370
+ [ा�][�] -> [ाम] 369
371
+ [ पर][म] -> [ परम] 370
372
+ [ा�][�] -> [ाए] 371
373
+ [ो][�] -> [ो�] 372
374
+ [ा�][�] -> [ाह] 373
375
+ [ �][�] -> [ ए] 374
376
+ [�][�] -> [भ] 375
377
+ [ो][ग] -> [ोग] 376
378
+ [ ए][क] -> [ एक] 377
379
+ [्�][�] -> [्म] 378
380
+ [।][”\u000a] -> [।”\u000a] 379
381
+ [ म][न] -> [ मन] 380
382
+ [�][�] -> [ट] 381
383
+ [ि][क] -> [िक] 382
384
+ [ स][म] -> [ सम] 383
385
+ [ �][�स] -> [ इस] 384
386
+ [ा�][�] -> [ाय] 385
387
+ [्�][�] -> [्थ] 386
388
+ [ा�][�] -> [ाथ] 387
389
+ [ु�][�] -> [ुझ] 388
390
+ [ �][�] -> [ फ] 389
391
+ [ �][�] -> [ ख] 390
392
+ [ी][श] -> [ीश] 391
393
+ [ �][�] -> [ श] 392
394
+ [�][�] -> [ख] 393
395
+ [े�][�] -> [ेव] 394
396
+ [ स][ब] -> [ सब] 395
397
+ [ कर][त] -> [ करत] 396
398
+ [ो][,] -> [ो,] 397
399
+ [ि�][�] -> [िए] 398
400
+ [ु][त] -> [ुत] 399
401
+ [ु][स] -> [ुस] 400
402
+ [�][�] -> [ध] 401
403
+ [र][न] -> [रन] 402
404
+ [े�][�] -> [ेख] 403
405
+ [ै][।\u000a] -> [ै।\u000a] 404
406
+ [ु�][�] -> [ुष] 405
407
+ [�][�] -> [ण] 406
408
+ [ा�][�] -> [ाक] 407
409
+ [�][�] -> [ठ] 408
410
+ [ु][न] -> [ुन] 409
411
+ [्�][�] -> [्द] 410
412
+ [�][�] -> [ौ] 411
413
+ [ ग][य] -> [ गय] 412
414
+ [ प][ह] -> [ पह] 413
415
+ [ी][,] -> [ी,] 414
416
+ [ि�][�] -> [िल] 415
417
+ [ि�][�] -> [िश] 416
418
+ [ अ][न] -> [ अन] 417
419
+ [�][�] -> [ए] 418
420
+ [े][।\u000a] -> [े।\u000a] 419
421
+ [ उस][न] -> [ उसन] 420
422
+ [ै][स] -> [ैस] 421
423
+ [ ल][ग] -> [ लग] 422
424
+ [ैं][,] -> [ैं,] 423
425
+ [ �][�] -> [ ध] 424
426
+ [़][ा] -> [़ा] 425
427
+ [्र][भ] -> [्रभ] 426
428
+ [ कर][न] -> [ करन] 427
429
+ [ उन][क] -> [ उनक] 428
430
+ [ा�][�] -> [ाज] 429
431
+ [ पर][न] -> [ परन] 430
432
+ [�][�] -> [ृ] 431
433
+ [ो�][�] -> [ोई] 432
434
+ [�][�] -> [उ] 433
435
+ [ �][�] -> [ छ] 434
436
+ [ा�][�] -> [ाई] 435
437
+ [ �][�] -> [ घ] 436
438
+ [ु�][�] -> [ुछ] 437
439
+ [ी][ह] -> [ीह] 438
440
+ [ू][र] -> [ूर] 439
441
+ [ु�][�] -> [ुआ] 440
442
+ [ ब][ह] -> [ बह] 441
443
+ [ म][स] -> [ मस] 442
444
+ [ र][ख] -> [ रख] 443
445
+ [्][न] -> [्न] 444
446
+ [ि�][�] -> [िष] 445
447
+ [ उस][स] -> [ उसस] 446
448
+ [ �][�] -> [ ड] 447
449
+ [ ज][ब] -> [ जब] 448
450
+ [ु�][�] -> [ुए] 449
451
+ [ूँ][,] -> [ूँ,] 450
452
+ [ा][;] -> [ा;] 451
453
+ [�][�स] -> [इस] 452
454
+ [ �][�] -> [ इ] 453
455
+ [ कह][त] -> [ कहत] 454
456
+ [ा�][�] -> [ाओ] 455
457
+ [ो][त] -> [ोत] 456
458
+ [ा�][�] -> [ाप] 457
459
+ [�][�] -> [ढ] 458
460
+ [्�][�] -> [्ग] 459
461
+ [ आ][त] -> [ आत] 460
462
+ [े�][�] -> [ेग] 461
463
+ [े][त] -> [ेत] 462
464
+ [ उ][त] -> [ उत] 463
465
+ [्�][�] -> [्ध] 464
466
+ [ �][�] -> [ ठ] 465
467
+ [ त][क] -> [ तक] 466
468
+ [ू][स] -> [ूस] 467
469
+ [ाक][र] -> [ाकर] 468
470
+ [ो][न] -> [ोन] 469
471
+ [ि�][�] -> [िख] 470
472
+ [े][म] -> [ेम] 471
473
+ [ान][त] -> [ानत] 472
474
+ [�][स] -> [�स] 473
475
+ [ु][र] -> [ुर] 474
476
+ [़][े] -> [़े] 475
477
+ [त][ब] -> [तब] 476
478
+ [ ब][न] -> [ बन] 477
479
+ [�][�] -> [ई] 478
480
+ [ �][�स] -> [ ऐस] 479
481
+ [ो][ड] -> [ोड] 480
482
+ [प][रन] -> [परन] 481
483
+ [ह][र] -> [हर] 482
484
+ [े�][�] -> [ेल] 483
485
+ [्�][�] -> [्ञ] 484
486
+ [उ][स] -> [उस] 485
487
+ [ा�][�] -> [ाद] 486
488
+ [ य][द] -> [ यद] 487
489
+ [ी][न] -> [ीन] 488
490
+ [ ग][ए] -> [ गए] 489
491
+ [ च][ल] -> [ चल] 490
492
+ [ उ][ठ] -> [ उठ] 491
493
+ [ प][त] -> [ पत] 492
494
+ [�][�] -> [छ] 493
495
+ [ ब][ड] -> [ बड] 494
496
+ [्�][�] -> [्ष] 495
497
+ [�][�] -> [अ] 496
498
+ [प][र] -> [पर] 497
499
+ [ उन][स] -> [ उनस] 498
500
+ [ीं][,] -> [ीं,] 499
501
+ [ो][।\u000a] -> [ो।\u000a] 500
502
+ [ �][�] -> [ ओ] 501
503
+ [ी][त] -> [ीत] 502
504
+ [ार][ण] -> [ारण] 503
505
+ [ प][व] -> [ पव] 504
506
+ [ु�][�] -> [ुँ] 505
507
+ [ों][,] -> [ों,] 506
508
+ [ै][;] -> [ै;] 507
509
+ [ अ][ध] -> [ अध] 508
510
+ [ स][क] -> [ सक] 509
511
+ [ आ][प] -> [ आप] 510
512
+ [ज][ब] -> [जब] 511