Update README.md
README.md CHANGED
@@ -26,4 +26,180 @@ text = "" #Text with typos here!
inputs = tokenizer(cipher_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
outputs = model.generate(inputs["input_ids"], max_length=256)
corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
```

Full Pipeline Usage:
```py
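# The alphabet model predicts a substitution key; the correction model then fixes residual errors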
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from string import ascii_lowercase
import Levenshtein
import random

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
alphabet_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device)
correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device)
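
# Levenshtein-based similarity between two strings, as a percentage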
def similarity_percentage(s1, s2):
    distance = Levenshtein.distance(s1, s2)
    max_len = max(len(s1), len(s2))
    similarity = (1 - distance / max_len) * 100
    return similarity
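
# Decode cipher text with a 26-letter key, preserving case and non-letter characters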
def decode(cipher_text, key):
    decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[:26])}
    decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[:26])})
    ans = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text))
    return ans
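
# Run one model pass: tokenize, generate, and decode back to a string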
def model_pass(model, text, max_length=256):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
    outputs = model.generate(inputs["input_ids"], max_length=max_length)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result
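
# Batch-style variant of decode(): expects the key and the cipher text wrapped in lists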
def decipher(cipher_text, key) -> str:
    decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[0])}
    decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[0])})

    result = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text[0]))

    return result
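
# Helper for testing: encrypt plain text with a random substitution alphabet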
def cipher(plain_text) -> tuple[str, dict]:
    alphabet_map = list(ascii_lowercase)
    random.shuffle(alphabet_map)
    alphabet_map = {i: j for i, j in zip(ascii_lowercase, alphabet_map)}
    alphabet_map.update({i.upper(): j.upper() for i, j in alphabet_map.items()})

    cipher_text = ''.join(map(lambda x: alphabet_map[x] if x in alphabet_map else x, plain_text))
    return cipher_text, alphabet_map
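
# Compare the cipher text with the correction model's output word by word and
# vote on the most likely plaintext letter for each cipher letter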
def correct_text(cipher_text, model_output):
    cipher_text = cipher_text.split(' ')
    model_output = model_output.split(' ')

    letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase}

    # Levenshtein distance over word lengths: words count as equal when their lengths match
    n = len(cipher_text)
    m = len(model_output)

    dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if len(cipher_text[i - 1]) == len(model_output[j - 1]):
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
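
    # Trace back through the DP table to align words of matching length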
    i = n
    j = m
    while i > 0 and j > 0:
        before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1])
        match before[0]:
            case 0:
                if dp[i - 1][j - 1] == dp[i][j]:
                    # Same length: count the aligned letter pairs in the letter map
                    cipher_word = cipher_text[i - 1]
                    model_word = model_output[j - 1]

                    for c_letter, m_letter in zip(cipher_word.lower(), model_word.lower()):
                        if c_letter in letter_map and m_letter in letter_map[c_letter]:
                            letter_map[c_letter][m_letter] += 1

                i -= 1
                j -= 1
            case 1:
                i -= 1
            case 2:
                j -= 1
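
    # Normalize counts to per-letter frequencies, sorted by likelihood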
    for letter in ascii_lowercase:
        letter_sum = sum(letter_map[letter].values())
        if letter_sum == 0:
            # That letter wasn't in the text
            letter_map[letter] = None
            continue

        # Sorted from most occurring to least
        letter_map[letter] = [(k, v / letter_sum) for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)]
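
    # Assign each plaintext letter to the cipher letter that maps to it most strongly, best ranks first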
    change_map = {i: None for i in ascii_lowercase}

    for i in range(len(ascii_lowercase)):
        for letter in ascii_lowercase:
            if letter_map[letter] is None:
                continue  # That letter wasn't in the text

            # A change_map entry is None if nothing has claimed that letter yet
            map_letter = letter_map[letter][i][0]
            if (letter_map[letter][i][1] > 0 and (change_map[map_letter] is None
                    or (change_map[map_letter][2] < letter_map[letter][i][1] and change_map[map_letter][1] >= i))):
                change_map[map_letter] = (letter, i, letter_map[letter][i][1])  # (letter, iteration, percentage)

    change_map = {i[1][0]: i[0] for i in change_map.items() if i[1] is not None}
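
    # Cipher letters with no match fall back to '.'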
    for letter in ascii_lowercase:
        if letter not in change_map:
            change_map[letter] = '.'

    # Add uppercases
    change_map.update({i[0].upper(): i[1].upper() for i in change_map.items()})

    new_text = []
    for cipher_word in cipher_text:
        new_word = ""
        for c_letter in cipher_word:
            if c_letter in change_map:
                new_word += change_map[c_letter]
            else:
                new_word += c_letter

        new_text.append(new_word)

    return ' '.join(new_text)
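
# Full pipeline: predict the key, decode, then refine with two correction passes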
def crack_sub(cipher_text):
    output = model_pass(alphabet_model, cipher_text, 26)
    decoded = decode(cipher_text, output)
    second_pass = model_pass(correction_model, decoded, len(decoded))
    second_text = correct_text(cipher_text, second_pass)
    third_pass = model_pass(correction_model, second_text, len(decoded))

    return third_pass

"""
Use the crack_sub() function to solve monoalphabetic substitution ciphers!
"""
```
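
A quick way to sanity-check the pipeline is to encrypt a known sentence with the `cipher()` helper and score the cracked output against the original. The snippet below is a minimal sketch using only the functions defined above; the sample sentence is illustrative and not part of the original card:

```py
# Hypothetical smoke test: encrypt a sample sentence with a random key,
# crack it, then score the result against the original plain text.
plain_text = "the quick brown fox jumps over the lazy dog"
cipher_text, key_map = cipher(plain_text)  # random substitution key
cracked = crack_sub(cipher_text)

print(f"Cipher text: {cipher_text}")
print(f"Cracked:     {cracked}")
print(f"Similarity:  {similarity_percentage(cracked, plain_text):.1f}%")
```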