Runtime error
Runtime error
Browse files
@@ -1,24 +1,223 @@
1 |
import gradio as gr
2 |
import subprocess
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
if __name__ == "__main__":
24 |
1 |
import os
2 |
import re
3 |
import torch
4 |
from collections import Counter
5 |
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForTokenClassification, AutoModelForCausalLM
6 |
import PyPDF2
7 |
import openai
8 |
import docx
9 |
from arabert.preprocess import ArabertPreprocessor
10 |
import gradio as gr
11 |
12 |
13 |
# التحقق من توفر GPU واستخدامه
14 |
# Pick the first CUDA device when one is available, otherwise -1 (CPU).
# NOTE(review): `device` is not referenced again in the visible code, so the
# models below are loaded without being moved to it -- confirm intent.
device = 0 if torch.cuda.is_available() else -1

# Load the Arabic BERT tokenizer and model.
arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

# Load the AraBERT v0.2 tokenizer and model.
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

# Load the AraGPT2 tokenizer and causal language model.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally; keep only if that repository is trusted.
gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-large", trust_remote_code=True)
gpt2_model = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-large", trust_remote_code=True)

# Text preprocessor matching the AraBERT v0.2 checkpoint.
arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv02")
28 |
29 |
# دالة لتقسيم النص إلى أجزاء بناءً على عدد التوكنز
30 |
def split_text_into_chunks(text, tokenizer, max_length):
    """Split *text* into consecutive pieces of at most *max_length* tokens.

    Args:
        text: The string to split.
        tokenizer: Object providing ``tokenize(text)`` (returns a sequence of
            tokens) and ``convert_tokens_to_string(tokens)`` (returns a str).
        max_length: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of chunk strings in document order. Empty when *text* yields
        no tokens.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    tokens = tokenizer.tokenize(text)
    # Each window of tokens is converted back to text and collected; without
    # collecting the converted window the function would always return [].
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]
38 |
39 |
# دالة لتجزئة النص إلى جمل باستخدام التعبيرات العادية
40 |
def extract_sentences(text):
    """Split *text* into sentences using punctuation-based regular expressions.

    A sentence boundary is any run of whitespace (spaces, tabs or newlines)
    that directly follows ``.``, ``!``, ``?`` or the Arabic question mark.
    The terminating punctuation stays attached to its sentence.

    Args:
        text: The string to split.

    Returns:
        A list of non-empty sentence strings; ``[]`` for blank input.
    """
    # The look-behind keeps the punctuation with the preceding sentence.
    parts = re.split(r'(?<=[.!?؟])\s+', text.strip())
    return [part for part in parts if part]
43 |
44 |
# دالة لاستخراج الاقتباسات من النص
45 |
def extract_quotes(text):
    """Return the contents of every quoted span found in *text*.

    A span opens with a left curly quote, a straight double quote or a left
    guillemet, and closes at the nearest right curly quote, straight double
    quote or right guillemet on the same line (``.`` does not cross
    newlines here). Opening and closing styles are not required to match.

    Args:
        text: The string to search.

    Returns:
        A list with the text between each pair of quote marks, in order.
    """
    return re.findall(r'[“"«](.*?)[”"»]', text)
48 |
49 |
# دالة لعد الرموز في النص
50 |
def count_tokens(text, tokenizer):
    """Return the number of tokens *tokenizer* produces for *text*.

    Args:
        text: The string to measure.
        tokenizer: Object providing ``tokenize(text)`` returning a sized
            sequence of tokens.

    Returns:
        The token count as an int.
    """
    return len(tokenizer.tokenize(text))
53 |
54 |
# دالة لاستخراج النص من ملفات PDF
55 |
def extract_pdf_text(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages, in page order, with no separator
        inserted between pages.
    """
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction happens while the file is still open. ``or ""`` guards
        # against a page whose extraction yields None instead of a string.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
63 |
64 |
# دالة لاستخراج النص من ملفات DOCX
65 |
def extract_docx_text(file_path):
    """Extract the paragraph text of a DOCX file.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The text of ``doc.paragraphs`` joined with newline characters.
    """
    doc = docx.Document(file_path)
    return "\n".join(para.text for para in doc.paragraphs)
69 |
70 |
# دالة لقراءة النص من ملف مع التعامل مع مشاكل الترميز
71 |
def read_text_file(file_path):
    """Read a text file, falling back through several encodings.

    The encodings are tried in order: UTF-8, Latin-1, then Windows-1252.
    The first one that decodes the whole file wins.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The decoded file contents as a str.

    Raises:
        UnicodeDecodeError: If no encoding in the chain can decode the file.
    """
    # NOTE(review): latin-1 maps every byte value to a character, so it never
    # raises UnicodeDecodeError and the cp1252 step is effectively
    # unreachable. The order is kept as in the original chain; reorder it if
    # cp1252 decoding is actually wanted.
    last_error = None
    for encoding in ("utf-8", "latin-1", "cp1252"):
        try:
            with open(file_path, "r", encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError as err:
            last_error = err
    raise last_error
82 |
83 |
# دالة لاستخراج المشاهد من النص
84 |
def extract_scenes(text):
    """Split a script into scenes at each interior/exterior heading keyword.

    A new scene starts wherever the keyword "داخلي" or "خارجي" occurs. The
    keyword is kept at the start of its scene so that later steps (such as
    looking up the scene location) can still see it. Any text before the
    first keyword is returned as its own leading item.

    Args:
        text: The script text to split.

    Returns:
        A list of stripped, non-empty scene strings in document order.
    """
    # A zero-width look-ahead splits *before* the keyword instead of
    # consuming it, so the heading word is not lost.
    pieces = re.split(r'(?=داخلي|خارجي)', text)
    return [piece.strip() for piece in pieces if piece.strip()]
88 |
89 |
# دالة لاستخراج تفاصيل المشهد (المكان والوقت)
90 |
def extract_scene_details(scene):
    """Extract the location type and time of day mentioned in a scene.

    Args:
        scene: The text of a single scene.

    Returns:
        A dict that may contain:
            'location': the first of "داخلي" / "خارجي" found in the scene;
            'time': the first of "ليلاً" / "نهاراً" / "شروق" / "غروب" found.
        A key is omitted when its pattern does not occur.
    """
    details = {}
    location_match = re.search(r'(داخلي|خارجي)', scene)
    time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)

    if location_match:
        details['location'] = location_match.group(1)
    if time_match:
        details['time'] = time_match.group(1)

    return details
101 |
102 |
# دالة لاستخراج أعمار الشخصيات
103 |
def extract_ages(text):
    """Find character ages mentioned in *text*.

    Matches a one- or two-digit number followed by a year word
    ("عام", "سنة" or "سنوات") and then an age phrase
    ("من العمر", "عمره" or "عمرها").

    Args:
        text: The string to search.

    Returns:
        A list of the matched numbers as strings, in order of appearance.
    """
    return re.findall(r'\b(\d{1,2})\s*(?:عام|سنة|سنوات)\s*(?:من العمر|عمره|عمرها)', text)
106 |
107 |
# دالة لاستخراج وصف الشخصيات
108 |
def extract_character_descriptions(text):
    """Find "character NAME : description TEXT" entries in *text*.

    An entry has the form ``شخصية <name> : وصف <description>`` and the
    description ends at the first ``.`` or Arabic comma. Matching may span
    several lines (``re.DOTALL``).

    Args:
        text: The string to search.

    Returns:
        A list of ``(name, description)`` tuples in order of appearance.
    """
    return re.findall(r'شخصية\s*(.*?)\s*:\s*وصف\s*(.*?)\s*(?:\.|،)', text, re.DOTALL)
111 |
112 |
# دالة لاستخراج تكرار الشخصيات
113 |
def extract_character_frequency(entities):
    """Count how often each person appears among the extracted entities.

    Args:
        entities: Mapping whose 'PERSON' entry is an iterable of items; the
            first element of each item (``item[0]``) is used as the person's
            name. A missing 'PERSON' entry is treated as "no persons".

    Returns:
        A ``collections.Counter`` mapping each name to its occurrence count.
    """
    return Counter(ent[0] for ent in entities.get('PERSON', ()))
117 |
118 |
# دالة لاستخراج الحوارات وتحديد المتحدثين
119 |
def extract_dialogues(text):
    """Extract ``speaker: line`` dialogue pairs from *text*.

    Each line is examined on its own: the part before the first colon is the
    speaker and the remainder is the spoken line. Lines without a colon, or
    with nothing before it, are ignored. Working line by line keeps
    newlines and preceding non-dialogue lines out of the speaker name.

    Args:
        text: The script text to scan.

    Returns:
        A list of ``(speaker, line)`` tuples with surrounding whitespace
        stripped, in document order.
    """
    dialogues = []
    for raw_line in text.splitlines():
        speaker, colon, speech = raw_line.partition(":")
        speaker = speaker.strip()
        if colon and speaker:
            dialogues.append((speaker, speech.strip()))
    return dialogues
122 |
123 |
# دالة لمعالجة الملفات وتقسيمها بناءً على عدد التوكنز
124 |
def process_files(input_directory, output_directory_950):
    """Read every file in a directory and write it back as 950-token chunks.

    PDF and DOCX files (matched by extension, case-insensitively) are read
    with their dedicated extractors; every other file is read as plain text.
    Each chunk is written to ``<stem>_part_<n>.txt`` (n starting at 1) inside
    *output_directory_950*, encoded as UTF-8.

    Args:
        input_directory: Directory whose direct entries are processed;
            sub-directories are skipped.
        output_directory_950: Existing directory that receives the chunks.
    """
    for file_name in os.listdir(input_directory):
        file_path = os.path.join(input_directory, file_name)

        if os.path.isdir(file_path):  # only regular entries are processed
            continue

        lowered_path = file_path.lower()
        if lowered_path.endswith(".pdf"):
            text = extract_pdf_text(file_path)
        elif lowered_path.endswith(".docx"):
            text = extract_docx_text(file_path)
        else:
            text = read_text_file(file_path)

        # Split the text into chunks of at most 950 tokens.
        stem = os.path.splitext(file_name)[0]
        chunks_950 = split_text_into_chunks(text, gpt2_tokenizer, 950)
        for part_number, chunk in enumerate(chunks_950, start=1):
            output_file_950 = os.path.join(output_directory_950, f"{stem}_part_{part_number}.txt")
            with open(output_file_950, "w", encoding="utf-8") as file:
                file.write(chunk)
144 |
145 |
# دالة لتحليل النصوص واستخراج المعلومات وحفظ النتائج
146 |
def _append_lines(path, items):
    """Append every item of *items* to *path*, one UTF-8 line per item."""
    with open(path, "a", encoding="utf-8") as file:
        file.writelines(f"{item}\n" for item in items)


def analyze_files(input_directory, output_directory, tokenizer, max_length):
    """Analyze every text file in a directory and append the results to disk.

    Each file is read as UTF-8, split into chunks of at most *max_length*
    tokens, and every chunk is run through the extraction helpers. Results
    are appended to ``<file_name>_<kind>.txt`` files in *output_directory*,
    where <kind> is one of: sentences, quotes, token_count, scenes,
    scene_details, ages, character_descriptions, dialogues.

    NOTE(review): the statements that wrote the results were missing from
    the reviewed source; writing one item per line (``str(item)``) is an
    assumption -- confirm the intended output format.

    Files are opened in append mode, so running the analysis twice over the
    same inputs duplicates the output.

    Args:
        input_directory: Directory whose direct entries are analyzed;
            sub-directories are skipped.
        output_directory: Existing directory that receives the result files.
        tokenizer: Tokenizer used both for chunking and for token counting.
        max_length: Maximum number of tokens per chunk.
    """
    for file_name in os.listdir(input_directory):
        file_path = os.path.join(input_directory, file_name)

        if os.path.isdir(file_path):  # only regular entries are analyzed
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        for chunk in split_text_into_chunks(text, tokenizer, max_length):
            scenes = extract_scenes(chunk)
            # Insertion order fixes the order in which result files are
            # opened: the same order as the original sequence of writes.
            results = {
                "sentences": extract_sentences(chunk),
                "quotes": extract_quotes(chunk),
                "token_count": [count_tokens(chunk, tokenizer)],
                "scenes": scenes,
                "scene_details": [extract_scene_details(scene) for scene in scenes],
                "ages": extract_ages(chunk),
                "character_descriptions": extract_character_descriptions(chunk),
                "dialogues": extract_dialogues(chunk),
            }
            for kind, items in results.items():
                _append_lines(os.path.join(output_directory, f"{file_name}_{kind}.txt"), items)
194 |
195 |
# تحديد المسارات
196 |
input_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/in"
output_directory_950 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/1000"
input_directory_950 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/1000"
output_directory_950_out = "/Volumes/CLOCKWORK T/clockworkspace/first pro/out/1000"

# These paths point at a local external volume. When it is not mounted
# (e.g. on a hosted deployment), skip the batch run instead of crashing the
# whole module at import time.
if os.path.isdir(input_directory):
    # Make sure the output directories exist.
    os.makedirs(output_directory_950, exist_ok=True)
    os.makedirs(output_directory_950_out, exist_ok=True)

    # Split the input files into 950-token chunks.
    process_files(input_directory, output_directory_950)

    # Analyze the 950-token chunk files.
    analyze_files(input_directory_950, output_directory_950_out, gpt2_tokenizer, 950)

    print("تمت معالجة الملفات وتحليلها بنجاح.")
else:
    print(f"Input directory not found, skipping batch processing: {input_directory}")
212 |
213 |
# تعريف واجهة Gradio
214 |
interface = gr.Interface(
215 |
216 |
inputs=gr.File(file_count="multiple", type="filepath"),
217 |
218 |
title="Movie Script Analyzer and Completer",
219 |
description="Upload text, PDF, or DOCX files to analyze and complete the movie script."
220 |
221 |
222 |
if __name__ == "__main__":
223 |