|
--- |
|
license: apache-2.0 |
|
datasets: |
|
- agentlans/high-quality-english-sentences |
|
language: |
|
- en |
|
base_model: |
|
- google-t5/t5-base |
|
pipeline_tag: text2text-generation |
|
library_name: transformers |
|
--- |
|
|
|
This model corrects typos in input text and outputs the corrected text.
|
|
|
Example: |
|
|
|
Text with Typos: **Whathvhr wh call owr carhaivhrs - doctors, nwrsh practitionhrs, clinicians, - wh nhhd thhm not only to carh, wh nhhd thhm to uh aulh to providh thh riaht valwh.** |
|
|
|
Corrected Text: **Whatever we call our caregivers - doctors, nurse practitioners, clinicians, - we need them not only to care, we need them to be able to provide the right value.** |
|
|
|
|
|
Example Usage: |
|
```py |
|
#Load the model and tokenizer |
|
text = "" #Text with typos here! |
|
inputs = tokenizer(cipher_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device) |
|
outputs = model.generate(inputs["input_ids"], max_length=256) |
|
corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True) |
|
``` |
|
|
|
|
|
Full Pipeline Usage: |
|
```py |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
import torch |
|
from string import ascii_lowercase |
|
import Levenshtein |
|
import random |
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
|
tokenizer = AutoTokenizer.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng") |
|
alphabet_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/Substitution-Cipher-Alphabet-Eng").to(device) |
|
correction_model = AutoModelForSeq2SeqLM.from_pretrained("Cipher-AI/AutoCorrect-EN-v2").to(device) |
|
|
|
def similarity_percentage(s1, s2): |
|
distance = Levenshtein.distance(s1, s2) |
|
|
|
max_len = max(len(s1), len(s2)) |
|
|
|
similarity = (1 - distance / max_len) * 100 |
|
|
|
return similarity |
|
|
|
def decode(cipher_text, key): |
|
decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[:26])} |
|
decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[:26])}) |
|
ans = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text)) |
|
return ans |
|
|
|
def model_pass(model, input, max_length=256): |
|
inputs = tokenizer(input, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device) |
|
outputs = model.generate(inputs["input_ids"], max_length=max_length) |
|
result = tokenizer.decode(outputs[0], skip_special_tokens=True) |
|
return result |
|
|
|
def decipher(cipher_text, key) -> str: |
|
decipher_map = {ascii_lowercase[i]: j for i, j in enumerate(key[0])} |
|
decipher_map.update({ascii_lowercase[i].upper(): j.upper() for i, j in enumerate(key[0])}) |
|
|
|
result = ''.join(map(lambda x: decipher_map[x] if x in decipher_map else x, cipher_text[0])) |
|
|
|
return result |
|
|
|
def cipher(plain_text) -> tuple[str, list]: |
|
alphabet_map = list(ascii_lowercase) |
|
random.shuffle(alphabet_map) |
|
alphabet_map = {i : j for i, j in zip(ascii_lowercase, alphabet_map)} |
|
|
|
alphabet_map.update({i.upper() : j.upper() for i, j in alphabet_map.items()}) |
|
|
|
cipher_text = ''.join(map(lambda x: alphabet_map[x] if x in alphabet_map else x, plain_text)) |
|
return cipher_text, alphabet_map |
|
|
|
def correct_text(cipher_text, model_output): |
|
cipher_text = cipher_text.split(' ') |
|
model_output = model_output.split(' ') |
|
|
|
letter_map = {i: {j: 0 for j in ascii_lowercase} for i in ascii_lowercase} |
|
|
|
|
|
# Levenstein distance for lenghts of words |
|
n = len(cipher_text) |
|
m = len(model_output) |
|
|
|
i = 0 |
|
j = 0 |
|
dp = [[0 for _ in range(m + 1)] for _ in range(n + 1)] |
|
|
|
for i in range(n + 1): |
|
dp[i][0] = i |
|
|
|
|
|
for j in range(m + 1): |
|
dp[0][j] = j |
|
|
|
for i in range(1, n + 1): |
|
for j in range(1, m + 1): |
|
if len(cipher_text[i - 1]) == len(model_output[j - 1]): |
|
dp[i][j] = dp[i - 1][j - 1] |
|
|
|
else: |
|
dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1 |
|
|
|
i = n |
|
j = m |
|
while i > 0 and j > 0: |
|
|
|
before = min([(0, dp[i - 1][j - 1]), (1, dp[i - 1][j]), (2, dp[i][j - 1])], key=lambda x: x[1]) |
|
match before[0]: |
|
case 0: |
|
if dp[i - 1][j - 1] == dp[i][j]: |
|
# If the same we add them to letter map |
|
cipher = cipher_text[i-1] |
|
model_o = model_output[j-1] |
|
|
|
for c_letter, m_letter in zip(cipher.lower(), model_o.lower()): |
|
if c_letter in letter_map and m_letter in letter_map[c_letter]: |
|
letter_map[c_letter][m_letter] += 1 |
|
|
|
i = i - 1 |
|
j = j - 1 |
|
case 1: |
|
i = i - 1 |
|
case 2: |
|
j = j - 1 |
|
|
|
for letter in ascii_lowercase: |
|
letter_sum = sum(letter_map[letter].values()) |
|
if letter_sum == 0: |
|
# That letter wasn't in the text |
|
letter_map[letter] = None |
|
continue |
|
|
|
# Sorted from most accuring to least |
|
letter_map[letter] = [(k, v / letter_sum) for k, v in sorted(letter_map[letter].items(), key=lambda item: item[1], reverse=True)] |
|
|
|
change_map = { |
|
i : None for i in ascii_lowercase |
|
} |
|
|
|
for i in range(len(ascii_lowercase)): |
|
for letter in ascii_lowercase: |
|
if letter_map[letter] is None: |
|
continue # That letter wasn't in the text |
|
|
|
# If None then it didn't get substituted earlier |
|
map_letter = letter_map[letter][i][0] |
|
if (letter_map[letter][i][1] > 0 and (change_map[map_letter] is None |
|
or (change_map[map_letter][2] < letter_map[letter][i][1] and change_map[map_letter][1] >= i))): |
|
change_map[map_letter] = (letter, i, letter_map[letter][i][1]) |
|
# Letter, iteration, percentage |
|
|
|
change_map = {i[1][0]: i[0] for i in change_map.items() if i[1] is not None} |
|
|
|
for letter in ascii_lowercase: |
|
if letter not in change_map: |
|
change_map[letter] = '.' |
|
|
|
|
|
# Add uppercases |
|
change_map.update( |
|
{ |
|
i[0].upper() : i[1].upper() for i in change_map.items() |
|
} |
|
) |
|
|
|
new_text = [] |
|
for cipher in cipher_text: |
|
new_word = "" |
|
for c_letter in cipher: |
|
if c_letter in change_map: |
|
new_word += change_map[c_letter] |
|
|
|
else: |
|
new_word += c_letter |
|
|
|
|
|
new_text.append(new_word) |
|
|
|
return ' '.join(new_text) |
|
|
|
def crack_sub(cipher_text): |
|
output = model_pass(alphabet_model, cipher_text, 26) |
|
decoded = decode(cipher_text, output) |
|
second_pass = model_pass(correction_model, decoded, len(decoded)) |
|
second_text = correct_text(cipher_text, second_pass) |
|
third_pass = model_pass(correction_model, second_text, len(decoded)) |
|
|
|
return third_pass |
|
|
|
""" |
|
Use crack_sub() function to solve monoalphabetic substitution ciphers! |
|
""" |
|
``` |