import os
import re
from collections import Counter

import gradio as gr
import nltk
import openai
import PyPDF2
import torch
from camel_tools.ner import NERecognizer
from camel_tools.tokenizers.word import simple_word_tokenize
from transformers import pipeline, AutoModel, AutoTokenizer

# Read the OpenAI API key from the environment rather than hardcoding a secret.
# (The key is configured here but not yet used elsewhere in this script.)
openai.api_key = os.getenv("OPENAI_API_KEY")

# Download the required NLTK data.
nltk.download('punkt')

# Use the GPU if one is available.
device = 0 if torch.cuda.is_available() else -1

# Sentiment-analysis pipeline. Note that this checkpoint is English-only,
# so its scores on Arabic text should be treated with caution.
analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=device,
)

# Load the Arabic BERT, GPT-2, ELECTRA, and AraBERT models.
arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base")
arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator")
arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator")
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")


# Named-entity analysis with camel_tools.
def camel_ner_analysis(text):
    ner = NERecognizer.pretrained()
    tokens = simple_word_tokenize(text)
    # predict_sentence takes a single tokenized sentence; predict expects a
    # list of sentences.
    labels = ner.predict_sentence(tokens)
    # The pretrained recognizer emits BIO labels (B-PERS, I-LOC, ...), so the
    # prefix is stripped and PERS is mapped to PERSON for downstream lookups.
    # (The model has no DATE class; MISC is the closest available bucket.)
    tag_map = {"PERS": "PERSON", "LOC": "LOC", "ORG": "ORG", "MISC": "MISC"}
    entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "MISC": []}
    for token, label in zip(tokens, labels):
        base = label.split("-")[-1]
        if base in tag_map:
            entity_dict[tag_map[base]].append((token, label))
    return entity_dict


# Sentiment analysis; long inputs are truncated to the model's maximum length.
def analyze_sentiments(text):
    return analyzer(text, truncation=True)


# Split the text into sentences. NLTK's punkt does not ship an Arabic model,
# so the default (English) model is used; it still splits on sentence-final
# punctuation.
def nltk_extract_sentences(text):
    return nltk.tokenize.sent_tokenize(text)


# Extract sentences that contain quotation marks.
def nltk_extract_quotes(text):
    sentences = nltk.tokenize.sent_tokenize(text)
    return [s for s in sentences if any(q in s for q in ('"', '«', '»'))]


# Count the tokens in the text.
def count_tokens(text):
    return len(simple_word_tokenize(text))


# Extract the text of a PDF file.
def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() can return None for pages without a text layer.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)


# Split the text into scenes on the Arabic scene headers "داخلي" (interior)
# and "خارجي" (exterior). The lookahead keeps each header with its scene so
# that extract_scene_details can still find it.
def extract_scenes(text):
    scenes = re.split(r'(?=داخلي|خارجي)', text)
    return [scene.strip() for scene in scenes if scene.strip()]


# Extract scene details (location and time of day).
def extract_scene_details(scene):
    details = {}
    location_match = re.search(r'(داخلي|خارجي)', scene)
    time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)
    if location_match:
        details['location'] = location_match.group()
    if time_match:
        details['time'] = time_match.group()
    return details


# Extract character ages (e.g. "25 سنة من العمر").
def extract_ages(text):
    return re.findall(
        r'\b(\d{1,2})\s*(?:عام|سنة|سنوات)\s*(?:من العمر|عمره|عمرها)', text
    )


# Extract character descriptions of the form "شخصية <name>: وصف <description>".
def extract_character_descriptions(text):
    return re.findall(
        r'شخصية\s*(.*?)\s*:\s*وصف\s*(.*?)\s*(?:\.|،)', text, re.DOTALL
    )


# Count how often each PERSON entity occurs.
def extract_character_frequency(entities):
    persons = [ent[0] for ent in entities['PERSON']]
    return Counter(persons)
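
# The four Arabic models loaded above are never called in the rest of this
# script. As an illustrative sketch only (the helper name and the mean-pooling
# choice are assumptions, not part of the original pipeline), they could be
# used to produce sentence embeddings like this:
def embed_text(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool the last hidden state: (1, seq_len, hidden) -> (hidden,).
    return outputs.last_hidden_state.mean(dim=1).squeeze(0)

# Example: embed_text("مرحبا", arabert_tokenizer, arabert_model)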

# Extract dialogue lines and their speakers ("<speaker>: <line>").
def extract_dialogues(text):
    # Anchor on line starts so a speaker name cannot span multiple lines.
    return re.findall(r'^([^:\n]+?)\s*:\s*(.+)$', text, re.MULTILINE)


# Analyze each input file, save the results to disk, and return them.
def analyze_and_complete(file_paths):
    results = []
    output_directory = os.getenv("SPACE_DIR", "/app/output")
    os.makedirs(output_directory, exist_ok=True)
    for file_path in file_paths:
        if file_path.endswith(".pdf"):
            text = extract_pdf_text(file_path)
        else:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
        filename_prefix = os.path.splitext(os.path.basename(file_path))[0]

        camel_entities = camel_ner_analysis(text)
        sentiments = analyze_sentiments(text)
        sentences = nltk_extract_sentences(text)
        quotes = nltk_extract_quotes(text)
        token_count = count_tokens(text)
        scenes = extract_scenes(text)
        ages = extract_ages(text)
        character_descriptions = extract_character_descriptions(text)
        character_frequency = extract_character_frequency(camel_entities)
        dialogues = extract_dialogues(text)
        scene_details = [extract_scene_details(scene) for scene in scenes]

        # Save each result to "<prefix>_<name>.txt".
        outputs = {
            "entities": str(camel_entities),
            "sentiments": str(sentiments),
            "sentences": "\n".join(sentences),
            "quotes": "\n".join(quotes),
            "token_count": str(token_count),
            "scenes": "\n".join(scenes),
            "scene_details": str(scene_details),
            "ages": str(ages),
            "character_descriptions": str(character_descriptions),
            "character_frequency": str(character_frequency),
            "dialogues": str(dialogues),
        }
        for name, content in outputs.items():
            out_path = os.path.join(output_directory, f"{filename_prefix}_{name}.txt")
            with open(out_path, "w", encoding="utf-8") as file:
                file.write(content)

        # Dicts preserve insertion order, so this matches the file order above.
        results.append(tuple(outputs.values()))
    return results


# Gradio interface definition.
interface = gr.Interface(
    fn=analyze_and_complete,
    inputs=gr.File(file_count="multiple", type="filepath"),
    outputs=gr.JSON(),
    title="Movie Script Analyzer and Completer",
    description="Upload text or PDF files to analyze the movie script.",
)

if __name__ == "__main__":
    interface.launch()
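
# Usage sketch (the file name below is hypothetical): the analysis can also be
# driven without the UI by calling the pipeline function directly, e.g.
#
#   results = analyze_and_complete(["script.pdf"])
#   entities, sentiments, sentences, *rest = results[0]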