SuperSl6 committed on
Commit
0c64ab6
·
verified ·
1 Parent(s): 5bd61d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -32
app.py CHANGED
@@ -1,9 +1,8 @@
1
  from transformers import pipeline, AutoTokenizer
2
  import gradio as gr
3
- import re
4
  import difflib
5
 
6
- # Load tokenizer with use_fast=False
7
  tokenizer = AutoTokenizer.from_pretrained("SuperSl6/Arabic-Text-Correction", use_fast=False)
8
  model = pipeline(
9
  "text2text-generation",
@@ -11,34 +10,47 @@ model = pipeline(
11
  tokenizer=tokenizer
12
  )
13
 
14
- def extract_corrected_version(original, generated):
15
- # Split generated text into sentences
16
- sentences = generated.split(' . ')
17
 
18
- # Find the sentence most similar to the original
19
- best_match = max(sentences, key=lambda s: difflib.SequenceMatcher(None, original, s).ratio())
 
20
 
21
- # Extract the corrected Arabic words
22
- corrected_words = re.findall(r'[\u0600-\u06FF]+', best_match)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # If no corrections found, return the original input
25
- if not corrected_words:
26
- return original
 
27
 
28
- # Check if the corrected text is a proper subset of the generated text
29
- corrected_text = ' '.join(corrected_words)
30
- if corrected_text in best_match:
31
- # Check if the corrected text is the complete output
32
- if corrected_text == best_match.strip():
33
- return corrected_text
34
- else:
35
- # If not the complete output, find the shortest corrected phrase
36
- for i in range(len(corrected_words), 0, -1):
37
- phrase = ' '.join(corrected_words[:i])
38
- if phrase in best_match:
39
- return phrase
40
- # If no corrected phrase is found, return the original input
41
- return original
42
 
43
  def correct_text(input_text):
44
  result = model(
@@ -52,18 +64,27 @@ def correct_text(input_text):
52
  do_sample=True
53
  )[0]['generated_text']
54
 
55
- # Extract the corrected version
56
  corrected_text = extract_corrected_version(input_text, result)
57
  return corrected_text
58
 
59
  # Gradio Interface
 
 
 
 
 
 
 
60
  interface = gr.Interface(
61
  fn=correct_text,
62
- inputs=gr.Textbox(lines=3, placeholder="أدخل النص العربي هنا..."),
63
- outputs=gr.Textbox(),
64
  live=True,
65
- title="تصحيح النص العربي",
66
- description="أداة لتصحيح النصوص العربية باستخدام نموذج SuperSl6/Arabic-Text-Correction."
 
 
 
67
  )
68
 
69
- interface.launch()
 
1
  from transformers import pipeline, AutoTokenizer
2
  import gradio as gr
 
3
  import difflib
4
 
5
+ # Load tokenizer
6
  tokenizer = AutoTokenizer.from_pretrained("SuperSl6/Arabic-Text-Correction", use_fast=False)
7
  model = pipeline(
8
  "text2text-generation",
 
10
  tokenizer=tokenizer
11
  )
12
 
13
def align_and_preserve(original, corrected):
    """Merge the original text and the model's corrected text word by word.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``. The output is assembled from the alignment
    opcodes as follows:

    * ``equal``   -- the matching words are emitted.
    * ``delete``  -- words present only in ``original`` are kept.
    * ``replace`` -- the corrected words are emitted, followed by the
      original words they replaced.
    * ``insert``  -- words present only in ``corrected`` are not emitted.

    Every distinct word is emitted at most once, in first-seen order.

    Args:
        original: The text as entered by the user.
        corrected: The candidate corrected text to align against it.

    Returns:
        The merged words joined by single spaces; an empty string when
        neither input contains any word.
    """
    original_words = original.split()
    corrected_words = corrected.split()

    matcher = difflib.SequenceMatcher(None, original_words, corrected_words)
    final_output = []
    seen_words = set()

    def _emit_unseen(words):
        # Append each word the first time it is encountered, preserving order.
        for word in words:
            if word not in seen_words:
                seen_words.add(word)
                final_output.append(word)

    for opcode, a0, a1, b0, b1 in matcher.get_opcodes():
        if opcode == 'equal':
            _emit_unseen(corrected_words[b0:b1])
        elif opcode == 'delete':
            _emit_unseen(original_words[a0:a1])
        elif opcode == 'replace':
            # NOTE(review): both the corrected and the replaced original words
            # are emitted, so a misspelling appears next to its correction.
            # Kept as-is; confirm this is the intended "preserve" behaviour.
            _emit_unseen(corrected_words[b0:b1])
            _emit_unseen(original_words[a0:a1])
        # NOTE(review): 'insert' spans (words only the model produced) are
        # skipped, and repeated words are collapsed by the seen-set. Both are
        # existing behaviour -- confirm they are intended.

    # The former trailing pass over corrected_words[b1:] was removed: the last
    # opcode always ends at len(corrected_words), so it never emitted anything,
    # and with no opcodes at all (both inputs empty) it raised
    # UnboundLocalError because b1 was never assigned.
    return ' '.join(final_output)
48
+
49
def extract_corrected_version(original, generated):
    """Pick the generated sentence closest to the input and merge it with the input.

    ``generated`` is split on the literal separator ``' . '``. The piece with
    the highest character-level ``difflib.SequenceMatcher`` ratio against
    ``original`` is selected, stripped of surrounding whitespace, and passed
    to ``align_and_preserve`` together with ``original``.

    Args:
        original: The text as entered by the user.
        generated: The raw text produced by the model.

    Returns:
        The string returned by ``align_and_preserve``.
    """
    def _similarity(candidate):
        # Similarity of one candidate sentence to the user's input.
        return difflib.SequenceMatcher(None, original, candidate).ratio()

    closest = max(generated.split(' . '), key=_similarity)
    return align_and_preserve(original, closest.strip())
 
 
 
 
 
 
 
54
 
55
  def correct_text(input_text):
56
  result = model(
 
64
  do_sample=True
65
  )[0]['generated_text']
66
 
 
67
  corrected_text = extract_corrected_version(input_text, result)
68
  return corrected_text
69
 
70
  # Gradio Interface
71
# Sample inputs offered in the UI; each inner list holds the value for the
# single input textbox.
examples = [
    ["اكيد ان لحكام العرب والمسلمين مسؤولية يتمثل ادناها في استدعاء السفراء في الصين للتشاور"],
    ["هزا النص يحتوي على الكثير من الاخطاء الاملائية"],
    ["هليكم السلام ورحمة الله وبركاته"],
    ["انشاء الله سيكون كل شيء بخير"],
]

# Input and output widgets, built up front so the Interface call stays short.
_input_box = gr.Textbox(
    lines=4,
    placeholder="✍️ أدخل النص العربي هنا لتصحيحه...",
    label="📥 النص المدخل",
)
_output_box = gr.Textbox(label="✅ النص المصحح")

# Wire correct_text to the widgets. live=True is passed so that, per Gradio's
# Interface documentation, the function re-runs as the input changes.
interface = gr.Interface(
    fn=correct_text,
    inputs=_input_box,
    outputs=_output_box,
    live=True,
    title="🚀 تصحيح النص العربي باستخدام SuperSl6/Arabic-Text-Correction",
    description="📝 أداة ذكية لتصحيح النصوص العربية باستخدام تقنيات الذكاء الاصطناعي. أدخل النص وسيتم تصحيحه في الوقت الفعلي!",
    theme="compact",
    examples=examples,
    allow_flagging="never",
)

interface.launch()