suayptalha committed on
Commit
048f254
·
verified ·
1 Parent(s): 0596bc8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +176 -0
README.md CHANGED
@@ -26,4 +26,180 @@ text = "" #Text with typos here!
26
  inputs = tokenizer(cipher_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
27
  outputs = model.generate(inputs["input_ids"], max_length=256)
28
  corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  ```
 
26
  inputs = tokenizer(cipher_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
27
  outputs = model.generate(inputs["input_ids"], max_length=256)
28
  corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
29
+ ```
30
+
31
+
32
+ Full Pipeline Usage:
33
+ ```py
34
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
35
+ import torch
36
+ from string import ascii_lowercase
37
+ import Levenshtein
38
+ import random
39
+
40
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
41
+
42
+ tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng")
43
+ alphabet_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device)
44
+ correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device)
45
+
46
+ def similarity_percentage(s1, s2):
47
+ distance = Levenshtein.distance(s1, s2)
48
+
49
+ max_len = max(len(s1), len(s2))
50
+
51
+ similarity = (1 - distance / max_len) * 100
52
+
53
+ return similarity
54
+
55
+ def decode(cipher_text, key):
56
+ decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[:26])}
57
+ decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[:26])})
58
+ ans = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text))
59
+ return ans
60
+
61
+ def model_pass(model, input, max_length=256):
62
+ inputs = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
63
+ outputs = model.generate(inputs["input_ids"], max_length=max_length)
64
+ result = tokenizer.decode(outputs[0], skip_special_tokens=True)
65
+ return result
66
+
67
+ def decipher(cipher_text, key) -> str:
68
+ decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[0])}
69
+ decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[0])})
70
+
71
+ result = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text[0]))
72
+
73
+ return result
74
+
75
+ def cipher(plain_text) -> tuple[str, list]:
76
+ alphabet_map = list(ascii_lowercase)
77
+ random.shuffle(alphabet_map)
78
+ alphabet_map = {i : j for i, j in zip(ascii_lowercase, alphabet_map)}
79
+
80
+ alphabet_map.update({i.upper() : j.upper() for i, j in alphabet_map.items()})
81
+
82
+ cipher_text = ''.join(map(lambda x: alphabet_map[x] if x in alphabet_map else x, plain_text))
83
+ return cipher_text, alphabet_map
84
+
85
+ def correct_text(cipher_text, model_output):
86
+ cipher_text = cipher_text.split(' ')
87
+ model_output = model_output.split(' ')
88
+
89
+ letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase}
90
+
91
+
92
+ # Levenshtein-style edit distance over word lengths, used to align cipher words with model-output words
93
+ n = len(cipher_text)
94
+ m = len(model_output)
95
+
96
+ i = 0
97
+ j = 0
98
+ dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)]
99
+
100
+ for i in range(n + 1):
101
+ dp[i][0] = i
102
+
103
+
104
+ for j in range(m + 1):
105
+ dp[0][j] = j
106
+
107
+ for i in range(1, n + 1):
108
+ for j in range(1, m + 1):
109
+ if len(cipher_text[i - 1]) == len(model_output[j - 1]):
110
+ dp[i][j] = dp[i - 1][j - 1]
111
+
112
+ else:
113
+ dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1
114
+
115
+ i = n
116
+ j = m
117
+ while i > 0 and j > 0:
118
+
119
+ before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1])
120
+ match before[0]:
121
+ case 0:
122
+ if dp[i - 1][j - 1] == dp[i][j]:
123
+ # Diagonal step with unchanged cost: treat the two words as aligned and count their letter pairs in letter_map
124
+ cipher = cipher_text[i-1]
125
+ model_o = model_output[j-1]
126
+
127
+ for c_letter, m_letter in zip(cipher.lower(), model_o.lower()):
128
+ if c_letter in letter_map and m_letter in letter_map[c_letter]:
129
+ letter_map[c_letter][m_letter] += 1
130
+
131
+ i = i - 1
132
+ j = j - 1
133
+ case 1:
134
+ i = i - 1
135
+ case 2:
136
+ j = j - 1
137
+
138
+ for letter in ascii_lowercase:
139
+ letter_sum = sum(letter_map[letter].values())
140
+ if letter_sum == 0:
141
+ # That letter wasn't in the text
142
+ letter_map[letter] = None
143
+ continue
144
+
145
+ # Sorted from most frequently occurring to least
146
+ letter_map[letter] = [(k, v / letter_sum) for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)]
147
+
148
+ change_map = {
149
+ i : None for i in ascii_lowercase
150
+ }
151
+
152
+ for i in range(len(ascii_lowercase)):
153
+ for letter in ascii_lowercase:
154
+ if letter_map[letter] is None:
155
+ continue # That letter wasn't in the text
156
+
157
+ # change_map[map_letter] is None when no cipher letter has been assigned to that output letter yet
158
+ map_letter = letter_map[letter][i][0]
159
+ if (letter_map[letter][i][1] > 0 and (change_map[map_letter] is None
160
+ or (change_map[map_letter][2] < letter_map[letter][i][1] and change_map[map_letter][1] >= i))):
161
+ change_map[map_letter] = (letter, i, letter_map[letter][i][1])
162
+ # Letter, iteration, percentage
163
+
164
+ change_map = {i[1][0]: i[0] for i in change_map.items() if i[1] is not None}
165
+
166
+ for letter in ascii_lowercase:
167
+ if letter not in change_map:
168
+ change_map[letter] = '.'
169
+
170
+
171
+ # Add uppercases
172
+ change_map.update(
173
+ {
174
+ i[0].upper() : i[1].upper() for i in change_map.items()
175
+ }
176
+ )
177
+
178
+ new_text = []
179
+ for cipher in cipher_text:
180
+ new_word = ""
181
+ for c_letter in cipher:
182
+ if c_letter in change_map:
183
+ new_word += change_map[c_letter]
184
+
185
+ else:
186
+ new_word += c_letter
187
+
188
+
189
+ new_text.append(new_word)
190
+
191
+ return ' '.join(new_text)
192
+
193
+ def crack_sub(cipher_text):
194
+ output = model_pass(alphabet_model, cipher_text, 26)
195
+ decoded = decode(cipher_text, output)
196
+ second_pass = model_pass(correction_model, decoded, len(decoded))
197
+ second_text = correct_text(cipher_text, second_pass)
198
+ third_pass = model_pass(correction_model, second_text, len(decoded))
199
+
200
+ return third_pass
201
+
202
+ """
203
+ Use the crack_sub() function to solve monoalphabetic substitution ciphers!
204
+ """
205
  ```