# mohamedabdullah's picture
# history blame
# 11 kB
import gradio as gr
from datasets import load_dataset
import re
import numpy as np
# Build the vocabulary once at import time; every lookup below is a set
# membership test against `vocab`.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")
# Keep runs of 2-25 word characters other than ASCII letters and digits.
# Raw string: "\s" and "\W" are regex escapes, not Python string escapes.
# NOTE(review): only the first row of the 'text' column is scanned — confirm
# the vocabulary file is loaded as a single row.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])
vocab = set(word_l)
def delete_letter(word):
    """Return every string obtained by deleting exactly one character of *word*.

    The result has ``len(word)`` entries ordered by the deleted position.
    Duplicates are kept (``"aa"`` yields ``["a", "a"]``); an empty word
    yields an empty list.
    """
    return [word[:i] + word[i + 1:] for i in range(len(word))]
def switch_letter(word):
    """Return every string obtained by swapping two adjacent characters of *word*.

    The result has ``len(word) - 1`` entries ordered by the position of the
    swapped pair; words shorter than two characters yield an empty list.
    Plain slicing is used so that every character of *word* is preserved,
    whatever its Unicode category.
    """
    return [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]
def replace_letter(word):
    """Return the sorted, de-duplicated strings that differ from *word* by one replaced letter.

    Each position of *word* is replaced in turn by every letter of the
    alphabet below. The unchanged word itself is never part of the result.
    """
    letters = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'
    replace_set = {
        word[:i] + letter + word[i + 1:]
        for i in range(len(word))
        for letter in letters
    }
    # Replacing a letter by itself reproduces the input; that is not an edit.
    replace_set.discard(word)
    return sorted(replace_set)
def insert_letter(word):
    """Return every string obtained by inserting one letter into *word*.

    Every letter of the alphabet below is inserted at every position,
    including before the first and after the last character, so the result
    has ``(len(word) + 1) * len(letters)`` entries. Duplicates are kept.
    """
    letters = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'
    return [
        word[:i] + letter + word[i:]
        for i in range(len(word) + 1)
        for letter in letters
    ]
def edit_one_letter(word, allow_switches=True):
    """Return the set of strings one edit away from *word*.

    An edit is a single deletion, insertion or replacement; adjacent-letter
    swaps are included as well unless *allow_switches* is false.
    """
    edits = set(delete_letter(word))
    edits.update(insert_letter(word))
    edits.update(replace_letter(word))
    if allow_switches:
        edits.update(switch_letter(word))
    return edits
def edit_two_letters(word, allow_switches=True):
    """Return the set of strings reachable from *word* in one or two edits.

    *allow_switches* is forwarded to ``edit_one_letter`` for both rounds, so
    passing ``False`` excludes adjacent-letter swaps at every step.
    """
    edit_one_set = edit_one_letter(word, allow_switches)
    # Start from the one-edit strings so they are part of the result too.
    edit_two_set = set(edit_one_set)
    for edit in edit_one_set:
        edit_two_set |= edit_one_letter(edit, allow_switches)
    return edit_two_set
def get_corrections(word, vocab):
    """Return the candidate corrections for *word* found in *vocab*.

    Candidates are taken from the first non-empty tier:
    the word itself, then strings one edit away, then strings up to two
    edits away. An empty set means no tier produced a vocabulary word.
    """
    if word in vocab:
        return {word}

    one_edit = {item for item in edit_one_letter(word) if item in vocab}
    if one_edit:
        return one_edit

    # The two-edit neighbourhood is by far the largest; only build it when
    # the cheaper tiers found nothing.
    return {item for item in edit_two_letters(word) if item in vocab}
def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
    """Return the minimum cost of turning *source* into *target*.

    Uses a dynamic-programming table ``D`` where ``D[i, j]`` is the cheapest
    way to turn the first ``i`` characters of *source* into the first ``j``
    characters of *target*.

    Args:
        source: starting string.
        target: string to reach.
        ins_cost: cost of inserting one character.
        del_cost: cost of deleting one character.
        rep_cost: cost of replacing one character by a different one
            (matching characters cost nothing).

    Returns:
        The value of the bottom-right table cell, as a NumPy integer.
    """
    m = len(source)
    n = len(target)
    D = np.zeros((m+1, n+1), dtype=int)

    # First column: delete every character of the source prefix.
    for row in range(1, m+1):
        D[row, 0] = D[row-1, 0] + del_cost
    # First row: insert every character of the target prefix.
    for col in range(1, n+1):
        D[0, col] = D[0, col-1] + ins_cost

    for row in range(1, m+1):
        for col in range(1, n+1):
            r_cost = 0 if source[row-1] == target[col-1] else rep_cost
            D[row, col] = min(
                D[row-1, col] + del_cost,
                D[row, col-1] + ins_cost,
                D[row-1, col-1] + r_cost,
            )

    return D[m, n]
def get_suggestions(corrections, word):
    """Return *corrections* as a list ordered from closest to farthest from *word*.

    Closeness is ``min_edit_distance(word, correction)`` with its default
    costs. Candidates at the same distance are ordered alphabetically so the
    result does not depend on the iteration order of *corrections*.
    """
    ranked = sorted(
        (min_edit_distance(word, correction), correction)
        for correction in corrections
    )
    return [correction for _, correction in ranked]
def ar_spelling_checker(text):
    """Spell-check *text* and return the report as an HTML fragment.

    Every run of three or more word characters that is not in the
    module-level ``vocab`` is looked up with ``get_corrections``; words with
    at least one candidate are listed together with their suggestions,
    ranked by ``get_suggestions``. Words with no candidate are left out of
    the report. When nothing is reported, a single "no spelling errors"
    message is rendered instead.
    """
    result = {}
    for word in re.findall(r'\w{3,}', text):
        if word in vocab:
            continue
        tmp_corrections = get_corrections(word, vocab)
        # Only report words for which at least one suggestion exists.
        if tmp_corrections:
            result[word] = get_suggestions(tmp_corrections, word)

    # NOTE(review): the selectors below were reconstructed from the class
    # names used in the markup (content, word, suggest, separator, msg);
    # confirm the grouping of declarations against the intended design.
    style = '''<style>
.content{
direction: rtl;
}
.word{
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
padding: 10px 20px;
display: inline-block;
direction: rtl;
font-size: 15px;
font-weight: 500;
margin-bottom: 15px;
box-sizing: border-box;
border: 1px solid transparent;
border-radius: 0.25rem;
}
.suggest{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
display: inline-block;
margin-right: 5px;
}
.separator{
background: #CCC;
margin-bottom: 15px;
}
.msg{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
border: 1px solid transparent;
border-radius: 0.25rem;
padding: 15px 20px;
direction: rtl;
font-size: 20px;
font-weight: 500;
text-align: center;
}
</style>'''

    parts = [style, '<div class="content">']
    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')
    for misspelled, suggestions in result.items():
        parts.append(f'<div class="word">{misspelled}</div><br />')
        parts.extend(
            f'<div class="word suggest">{suggest}</div>' for suggest in suggestions
        )
        parts.append('<div class="separator"></div>')
    parts.append('</div>')
    return ''.join(parts)
# Page layout: title, description, right-to-left input box, button and an
# HTML area that receives the report produced by ar_spelling_checker.
with gr.Blocks(css="""
#input{direction: rtl;}
#component-112{height: 30px;}
.gr-form{margin-top: 15px;}
.gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;}
.text-gray-500{font-size: 16px; margin-bottom: 13px;}
.gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
border: 1px solid transparent; border-radius: 0.25rem;
padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
.output-html{min-height: 2rem;}
.title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
line-height: 1.5;font-family: 'IBM Plex Mono';}
.desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}""") as demo:
    intro = gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Spelling Check")
    output = gr.HTML()
    # Run the checker on the textbox content and render its HTML report.
    btn.click(ar_spelling_checker, [text], output)
# NOTE(review): no demo.launch() call is visible in this view — confirm the
# app is launched after this block.