mohamedrady committed on
Commit 8af0561 · verified · 1 Parent(s): f9953aa

Delete firstkha.py

Files changed (1)
  1. firstkha.py +0 -231
firstkha.py DELETED
@@ -1,231 +0,0 @@
- import os
- import re
- import torch
- from collections import Counter
- from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForTokenClassification
- import PyPDF2
- import openai
- import docx
-
-
- # Check whether a GPU is available and use it
- device = 0 if torch.cuda.is_available() else -1
-
- # Load the Arabic BERT, GPT-2, ELECTRA, and AraBERT models
- arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
- arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
-
- arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
- arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base")
-
- arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator")
- arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator")
-
- arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
- arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")
-
- # Load the CAMeL-Lab named-entity recognition model
- ner_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
- ner_model = AutoModelForTokenClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
- nlp_ner = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, device=device)
-
- # Analyze text with the NER pipeline and group entities by type
- def camel_ner_analysis(text):
-     ner_results = nlp_ner(text)
-     entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
-     # The pipeline emits BIO-style labels (e.g. "B-PERS", "I-LOC"); strip the
-     # prefix and map the label onto the keys above before collecting the words
-     label_map = {"PERS": "PERSON", "PER": "PERSON", "LOC": "LOC", "ORG": "ORG", "DATE": "DATE"}
-     for entity in ner_results:
-         entity_type = label_map.get(entity["entity"].split("-")[-1])
-         if entity_type in entity_dict:
-             entity_dict[entity_type].append((entity["word"], entity_type))
-     return entity_dict
-
- # Split text into chunks based on token count
- def split_text_into_chunks(text, tokenizer, max_length):
-     tokens = tokenizer.tokenize(text)
-     chunks = []
-     for i in range(0, len(tokens), max_length):
-         chunk_tokens = tokens[i:i + max_length]
-         chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
-         chunks.append(chunk_text)
-     return chunks
-
- # Split text into sentences using regular expressions
- def extract_sentences(text):
-     sentences = re.split(r'(?<=[.!؟]) +', text)
-     return sentences
-
- # Extract quotations from the text
- def extract_quotes(text):
-     quotes = re.findall(r'[“"«](.*?)[”"»]', text)
-     return quotes
-
- # Count the tokens in the text
- def count_tokens(text, tokenizer):
-     tokens = tokenizer.tokenize(text)
-     return len(tokens)
-
- # Extract text from PDF files
- def extract_pdf_text(file_path):
-     with open(file_path, "rb") as pdf_file:
-         pdf_reader = PyPDF2.PdfReader(pdf_file)
-         text = ""
-         for page_num in range(len(pdf_reader.pages)):
-             page = pdf_reader.pages[page_num]
-             text += page.extract_text() or ""
-     return text
-
- # Extract text from DOCX files
- def extract_docx_text(file_path):
-     doc = docx.Document(file_path)
-     text = "\n".join([para.text for para in doc.paragraphs])
-     return text
-
- # Read text from a file, handling encoding problems
- def read_text_file(file_path):
-     try:
-         with open(file_path, "r", encoding="utf-8") as file:
-             return file.read()
-     except UnicodeDecodeError:
-         try:
-             with open(file_path, "r", encoding="latin-1") as file:
-                 return file.read()
-         except UnicodeDecodeError:
-             with open(file_path, "r", encoding="cp1252") as file:
-                 return file.read()
-
- # Extract scenes from the text by splitting on the interior/exterior cues (داخلي / خارجي)
- def extract_scenes(text):
-     scenes = re.split(r'داخلي|خارجي', text)
-     scenes = [scene.strip() for scene in scenes if scene.strip()]
-     return scenes
-
- # Extract scene details (location and time)
- def extract_scene_details(scene):
-     details = {}
-     location_match = re.search(r'(داخلي|خارجي)', scene)
-     time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)
-
-     if location_match:
-         details['location'] = location_match.group()
-     if time_match:
-         details['time'] = time_match.group()
-
-     return details
-
- # Extract character ages (numbers followed by Arabic age expressions)
- def extract_ages(text):
-     ages = re.findall(r'\b(\d{1,2})\s*(?:عام|سنة|سنوات)\s*(?:من العمر|عمره|عمرها)', text)
-     return ages
-
- # Extract character descriptions
- def extract_character_descriptions(text):
-     descriptions = re.findall(r'شخصية\s*(.*?)\s*:\s*وصف\s*(.*?)\s*(?:\.|،)', text, re.DOTALL)
-     return descriptions
-
- # Count how often each character appears
- def extract_character_frequency(entities):
-     persons = [ent[0] for ent in entities['PERSON']]
-     frequency = Counter(persons)
-     return frequency
-
- # Extract dialogue lines and identify the speakers ("speaker: line")
- def extract_dialogues(text):
-     dialogues = re.findall(r'(.*?)(?:\s*:\s*)(.*?)(?=\n|$)', text, re.DOTALL)
-     return dialogues
-
- # Process files and split them based on token count
- def process_files(input_directory, output_directory_500):
-     for file_name in os.listdir(input_directory):
-         file_path = os.path.join(input_directory, file_name)
-
-         if os.path.isdir(file_path):  # make sure this is not a directory
-             continue
-
-         if file_path.endswith(".pdf"):
-             text = extract_pdf_text(file_path)
-         elif file_path.endswith(".docx"):
-             text = extract_docx_text(file_path)
-         else:
-             text = read_text_file(file_path)
-
-         # Split the text into chunks of at most 450 tokens
-         chunks_450 = split_text_into_chunks(text, arabic_bert_tokenizer, 450)
-         for i, chunk in enumerate(chunks_450):
-             output_file_450 = os.path.join(output_directory_500, f"{os.path.splitext(file_name)[0]}_part_{i+1}.txt")
-             with open(output_file_450, "w", encoding="utf-8") as file:
-                 file.write(chunk)
-
- # Analyze the texts, extract information, and save the results
- def analyze_files(input_directory, output_directory, tokenizer, max_length):
-     for file_name in os.listdir(input_directory):
-         file_path = os.path.join(input_directory, file_name)
-
-         if os.path.isdir(file_path):  # make sure this is not a directory
-             continue
-
-         with open(file_path, "r", encoding="utf-8") as file:
-             text = file.read()
-
-         chunks = split_text_into_chunks(text, tokenizer, max_length)
-
-         # Run the analysis on each chunk
-         for chunk in chunks:
-             entities = camel_ner_analysis(chunk)
-             sentences = extract_sentences(chunk)
-             quotes = extract_quotes(chunk)
-             token_count = count_tokens(chunk, tokenizer)
-             scenes = extract_scenes(chunk)
-             ages = extract_ages(chunk)
-             character_descriptions = extract_character_descriptions(chunk)
-             character_frequency = extract_character_frequency(entities)
-             dialogues = extract_dialogues(chunk)
-             scene_details = [extract_scene_details(scene) for scene in scenes]
-
-             # Save the results
-             with open(os.path.join(output_directory, f"{file_name}_entities.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(entities))
-
-             with open(os.path.join(output_directory, f"{file_name}_sentences.txt"), "a", encoding="utf-8") as file:
-                 file.write("\n".join(sentences))
-
-             with open(os.path.join(output_directory, f"{file_name}_quotes.txt"), "a", encoding="utf-8") as file:
-                 file.write("\n".join(quotes))
-
-             with open(os.path.join(output_directory, f"{file_name}_token_count.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(token_count))
-
-             with open(os.path.join(output_directory, f"{file_name}_scenes.txt"), "a", encoding="utf-8") as file:
-                 file.write("\n".join(scenes))
-
-             with open(os.path.join(output_directory, f"{file_name}_scene_details.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(scene_details))
-
-             with open(os.path.join(output_directory, f"{file_name}_ages.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(ages))
-
-             with open(os.path.join(output_directory, f"{file_name}_character_descriptions.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(character_descriptions))
-
-             with open(os.path.join(output_directory, f"{file_name}_character_frequency.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(character_frequency))
-
-             with open(os.path.join(output_directory, f"{file_name}_dialogues.txt"), "a", encoding="utf-8") as file:
-                 file.write(str(dialogues))
-
- # Set the paths
- input_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/in"
- output_directory_450 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/500"
- input_directory_450 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/500"
- output_directory_450_out = "/Volumes/CLOCKWORK T/clockworkspace/first pro/out/500"
-
- # Make sure the output directories exist
- os.makedirs(output_directory_450, exist_ok=True)
- os.makedirs(output_directory_450_out, exist_ok=True)
-
- # Process and split the files
- process_files(input_directory, output_directory_450)
-
- # Analyze the files that were split into 450-token chunks
- analyze_files(input_directory_450, output_directory_450_out, arabic_bert_tokenizer, 512)
-
- print("Files processed and analyzed successfully.")