Commit c9f3e85
Parent(s): 850b586

Hindi regex cheat code applied

Files changed:
- BPE.py (+2 -36)
- decoded_output.txt (+3 -3)
- encode_decode.py (+15 -39)
BPE.py
CHANGED
@@ -7,41 +7,7 @@ with open('text_file.txt', 'r', encoding='utf-8') as file:
     text = file.read()
 
 # Hindi-focused pattern
-gpt2pat = re.compile(r"""
-    # Simpler syllable-based grouping
-    (?:[\p{Devanagari}&&[क-ह]][ा-ौ\u093C\u0901-\u0903]?)  # Consonant + modifiers
-    # This part matches:
-    # - Any consonant [क-ह]
-    # - Optionally followed by:
-    # - maatras [ा-ौ] (like ा ि ी ु ू े ै ो ौ)
-    # - OR nukta (\u093C = ़)
-    # - OR chandrabindu (\u0901 = ँ)
-    # - OR anusvara (\u0902 = ं)
-    # - OR visarga (\u0903 = ः)
-
-    |[\u0905-\u0914]  # Independent vowels
-    # Matches standalone vowels like अ आ इ ई उ ऊ ए ऐ ओ औ
-
-    |[क-ह]्[क-ह]  # Basic conjuncts
-    # Matches basic consonant conjuncts:
-    # - First consonant + halant (्) + second consonant
-    # - Examples: क्क, न्न, त्त
-
-    |\p{N}+  # Numbers
-    # Matches one or more digits
-
-    |\s+  # Whitespace
-    # Matches spaces, tabs, newlines
-
-    |[।॥]  # Punctuation
-    # Matches Hindi punctuation marks
-
-    |[^\s\p{Devanagari}\p{N}]+  # Other characters
-    # Matches any sequence of characters that aren't:
-    # - whitespace
-    # - Devanagari script
-    # - numbers
-""", re.VERBOSE)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
 # Apply the regex pattern to the raw text to tokenize it
 tokens = re.findall(gpt2pat, text)
@@ -84,7 +50,7 @@ def merge(token_list, pair, idx):
     return newids
 
 def perform_bpe():
-    vocab_size =
+    vocab_size = 4000  # the desired final vocabulary size
     num_merges = vocab_size - 256
     token_list = list(tokens)  # copy so we don't destroy the original list
 
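For context: the commit swaps the hand-written Devanagari syllable pattern for a single GPT-2-style pattern whose Devanagari alternative groups a base letter with any trailing combining marks, so whole orthographic words become pre-tokens. Below is a minimal sketch of that behaviour, not the Space's code, with the character classes trimmed to the core Devanagari block for readability (the full pattern above also covers Vedic extensions). Note the \p{..} classes only work if re is the third-party regex package (e.g. import regex as re); the stdlib re rejects them.

# A minimal sketch, not the Space's code: simplified new pattern,
# character classes trimmed to the core Devanagari block.
# Requires `pip install regex`; the stdlib `re` rejects \p{..} classes.
import regex

pat = regex.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+"""
    r"""| ?(?:[\u0904-\u0939\u093d\u0950\u0958-\u0961\u0970-\u097f]"""  # base letters
    r"""[\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957]*)+"""    # combining marks
    r"""| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

print(pat.findall("नमस्ते दुनिया 123!"))
# ['नमस्ते', ' दुनिया', ' 123', '!'] : each word keeps its matras and
# viramas, so BPE merges see whole words rather than broken syllables.

With vocab_size = 4000 in perform_bpe(), num_merges works out to 4000 - 256 = 3744 merges over these pre-tokens.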
decoded_output.txt
CHANGED
@@ -1,3 +1,3 @@
-
-
-
+अम ज द के परिवार की तीन पी ढ़ ियां चांद नी चौ क निर्� � ाचन क्षेत्र में ह वे ली आज ़ म ख ां के नाम से पहच ाने जाने वाले एक दम स ट कर बने घ रों के झ ुण ्ड में रहती हैं . यह इलाक ा दिल्ली की ऐ त िहास िक ज ामा मस्जिद से पै दल की द ूरी पर है , और इस परिवार के 23 सदस्य मतदान केंद्र 10 पर प ंज ीक ृत मत द ाता हैं . लेकिन पिछले साल लोकसभा चुनावों के दौरान अम ज द को पता चला कि वह अपने परिवार के उन 20 लोगों में से एक हैं , जिन का नाम मत द ाता सू ची से इस वजह से काट दिया गया कि उन्होंने अपना घर बदल लिया है .
+
+5 5 वर्षीय अम ज द ने न्यू ज़ ल ॉन ्ड ्री को बताया , " हम ारे सामने ये पहली बार हुआ है . लेकिन नाम कट ने के बारे में सबसे ज्यादा निर ाश ाज न क बात ये थी कि इसका पता मतदान के दिन ही चला . जब हम पहली बार बू थ 10 पर गए तो उन्होंने हमें बताया कि उन्हें मत द ाता सू ची में हमारा नाम नहीं मिला . इसलिए हमें ज ामा मस्जिद में किसी दूसरे बू थ पर जाकर देखना चाहिए . वहां से हमें दूसरे बू थ पर भेज दिया गया . इस तरह हमने पांच से छह बू थ ों का दौर ा किया . और फिर अंत में हमें जो कारण बताया गया , वो यह था कि शायद घर - घ र जाकर सर्वे क्षण के दौरान बी एल ओ ( ब ू थ ले वल ऑफिस र ) को हम घर पर नहीं मिले इसलिए उन्होंने हमारे नाम काट दिए .”
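(For reference, the decoded text is a Hindi news passage about a Chandni Chowk family whose names were dropped from the voter roll.) The stray "� �" inside निर्वाचन is expected for byte-level BPE: a token boundary can fall inside the multi-byte UTF-8 encoding of a character, and decoding with errors="replace" turns the orphaned bytes into replacement characters. A small sketch with a hypothetical split point:

# Sketch: why "निर्� �ाचन" can appear in decoded output. A BPE token
# boundary may split the 3-byte UTF-8 encoding of 'व' (U+0935).
raw = "निर्वाचन".encode("utf-8")      # 8 code points -> 24 bytes
left, right = raw[:13], raw[13:]      # hypothetical boundary inside 'व'
print(left.decode("utf-8", errors="replace"))    # 'निर्�'
print(right.decode("utf-8", errors="replace"))   # '��ाचन'
print(raw.decode("utf-8"))            # 'निर्वाचन': intact when joined first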
encode_decode.py
CHANGED
@@ -7,47 +7,16 @@ with open('bpe_results.pkl', 'rb') as f:
     merges, ids, num_merges = pickle.load(f)
 
 # Define the GPT-2 regex pattern (same as in BPE.py)
-gpt2pat = re.compile(r"""
-    # Simpler syllable-based grouping
-    (?:[\p{Devanagari}&&[क-ह]][ा-ौ\u093C\u0901-\u0903]?)  # Consonant + modifiers
-    |[\u0905-\u0914]  # Independent vowels
-    |[क-ह]्[क-ह]  # Basic conjuncts
-    |\p{N}+  # Numbers
-    |\s+  # Whitespace
-    |[।॥]  # Punctuation
-    |[^\s\p{Devanagari}\p{N}]+  # Other characters
-""", re.VERBOSE)
+gpt2pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{N}+| ?(?:[\u0904-\u0939\u093d-\u093d\u0950-\u0950\u0958-\u0961\u0970-\u097f\ua8f2-\ua8fe\U00011b00-\U00011b09\u1cd3-\u1cd3\u1ce9-\u1cec\u1cee-\u1cf3\u1cf5-\u1cf6\u1cfa-\u1cfa][\u0900-\u0903\u093a-\u093c\u093e-\u094f\u0951-\u0957\u0962-\u0963\ua8e0-\ua8f1\ua8ff-\ua8ff\u1cd0-\u1cd2\u1cd4-\u1ce8\u1ced-\u1ced\u1cf4-\u1cf4\u1cf7-\u1cf9]*)+| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
 vocab = {idx: bytes([idx]) for idx in range(256)}
 for (p0, p1), idx in merges.items():
     vocab[idx] = vocab[p0] + vocab[p1]
 
 def decode(ids):
-    #
-
-
-    try:
-        char = byte_seq.decode('utf-8')
-        print(f"ID {idx}: bytes {list(byte_seq)} -> '{char}'")
-    except UnicodeDecodeError:
-        print(f"ID {idx}: bytes {list(byte_seq)} -> [INVALID UTF-8]")
-
-    print("\nDecoding sequence:")
-    tokens = []
-    for idx in ids:
-        if idx in vocab:
-            token_bytes = vocab[idx]
-            try:
-                char = token_bytes.decode('utf-8')
-                print(f"ID {idx} -> '{char}'")
-            except UnicodeDecodeError:
-                print(f"ID {idx} -> [INVALID UTF-8] {list(token_bytes)}")
-            tokens.append(token_bytes)
-        else:
-            print(f"Missing ID: {idx}")
-
-    # Original decoding logic
-    text = b''.join(tokens).decode('utf-8', errors='replace')
+    # given ids (list of integers), return Python string
+    tokens = [vocab[idx].decode("utf-8", errors="replace") for idx in ids]
+    text = '\t'.join(tokens)  # Join tokens with tabs
 
     # Write the decoded text to a new file
     with open('decoded_output.txt', 'w', encoding='utf-8') as f:
@@ -56,7 +25,7 @@ def decode(ids):
     return text
 
 # Example: Decode a list of IDs
-set_of_ids = [
+set_of_ids = [2342, 307, 295, 286, 1413, 302, 839, 644, 574, 982, 3877, 405, 1086, 272, 978, 181, 3927, 1171, 294, 274, 964, 438, 767, 337, 284, 361, 332, 286, 776, 315, 2331, 429, 841, 631, 385, 1694, 273, 310, 418, 1607, 445, 935, 286, 962, 1244, 698, 294, 3069, 347, 46, 450, 1462, 259, 646, 302, 554, 276, 2252, 334, 292, 2835, 2500, 315, 1006, 3367, 302, 296, 1299, 330, 289, 44, 327, 345, 1413, 286, 2911, 1906, 2592, 1322, 888, 330, 279, 711, 1474, 997, 1068, 295, 1236, 347, 46, 513, 1067, 579, 1194, 2596, 286, 847, 732, 307, 295, 309, 1423, 1953, 340, 555, 563, 1413, 286, 376, 466, 596, 294, 315, 385, 347, 44, 1001, 478, 776, 1068, 295, 1236, 919, 1216, 315, 345, 1115, 315, 3189, 481, 437, 340, 557, 1125, 1135, 1501, 857, 289, 46, 10, 10, 53, 53, 2794, 732, 307, 295, 317, 2705, 2246, 280, 1308, 698, 486, 309, 739, 44, 32, 34, 808, 830, 1015, 516, 1315, 544, 667, 289, 46, 513, 776, 1914, 311, 286, 948, 294, 856, 915, 2438, 658, 367, 271, 272, 564, 516, 472, 340, 1571, 1423, 2592, 286, 638, 416, 1953, 46, 586, 462, 1315, 544, 3075, 583, 888, 330, 588, 444, 557, 1448, 739, 340, 737, 1068, 295, 1236, 919, 1216, 294, 3253, 776, 391, 1410, 46, 1496, 1448, 292, 2835, 2500, 294, 738, 1374, 3075, 583, 330, 2660, 3252, 904, 46, 1441, 315, 1448, 1374, 3075, 583, 330, 1473, 481, 437, 46, 345, 778, 1758, 1307, 315, 2210, 3075, 583, 299, 333, 751, 259, 420, 46, 327, 766, 1200, 294, 1448, 499, 1394, 739, 437, 44, 707, 450, 413, 340, 3602, 1135, 45, 864, 261, 2660, 2749, 1930, 286, 847, 447, 1782, 1633, 510, 308, 306, 583, 399, 1508, 2632, 261, 41, 309, 462, 1135, 330, 391, 1193, 1496, 557, 1574, 776, 3189, 1340, 3435]
 decoded_text = decode(set_of_ids)  # Pass the list of IDs
 print(decoded_text)
 
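The rewritten decode drops the old per-ID debug printing and simply decodes each token, joining with tabs so token boundaries stay visible in decoded_output.txt. A self-contained sketch of that path with a toy vocabulary (the id-256 merge is made up for illustration):

# Standalone sketch of the new decode path, using a toy vocab.
vocab = {idx: bytes([idx]) for idx in range(256)}
vocab[256] = "नम".encode("utf-8")            # hypothetical merged token

ids = [256] + list("स्ते".encode("utf-8"))    # one merge plus raw bytes
tokens = [vocab[i].decode("utf-8", errors="replace") for i in ids]
print("\t".join(tokens))
# id 256 decodes cleanly; each lone byte of "स्ते" decodes to "�",
# the same artifact visible in decoded_output.txt.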
@@ -72,6 +41,9 @@ def encode():
     byte_tokens = [token.encode('utf-8') for token in tokens]
     token_list = [list(token) for token in byte_tokens]
 
+    # Calculate total bytes before compression
+    total_bytes_before = sum(len(token) for token in token_list)
+
     # Process each token
     final_tokens = []
     for token in token_list:
@@ -87,8 +59,12 @@ def encode():
         current_token = merge([current_token], pair, idx)[0]
     final_tokens.extend(current_token)
 
-
+    # Calculate compression ratio
+    compression_ratio = total_bytes_before / len(final_tokens)
+    print(f"Compression ratio: {compression_ratio:.2f}X")
+
+    return final_tokens, compression_ratio
 
 # Example: Encode text from a file
-
-
+encoded_tokens, ratio = encode()
+print(f"Encoded tokens: {encoded_tokens}")
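The additions to encode() track bytes in versus tokens out and report their ratio. A compact sketch of the same metric with hypothetical counts (real values depend on the trained merges loaded from bpe_results.pkl):

# Sketch of the compression metric the commit adds to encode().
text_bytes = "अमजद के परिवार".encode("utf-8")
total_bytes_before = len(text_bytes)          # 38 bytes into BPE
final_tokens = [2342, 307, 295, 286]          # hypothetical output ids
compression_ratio = total_bytes_before / len(final_tokens)
print(f"Compression ratio: {compression_ratio:.2f}X")   # 9.50X

A ratio of N means each BPE token covers N raw UTF-8 bytes of the corpus on average, so higher is better for a fixed vocabulary size.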