mohamedrady committed on
Commit f9953aa · verified · 1 Parent(s): 0fa896d

Upload 2 files

Files changed (2)
  1. alf.py +212 -0
  2. firstkha.py +231 -0
alf.py ADDED
@@ -0,0 +1,212 @@
import os
import re
import torch
from collections import Counter
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM
import PyPDF2
import openai
import docx
from arabert.preprocess import ArabertPreprocessor


# Use the GPU when one is available
device = 0 if torch.cuda.is_available() else -1

# Load the BERT and GPT-2 models
arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-large", trust_remote_code=True)
gpt2_model = AutoModelForCausalLM.from_pretrained("aubmindlab/aragpt2-large", trust_remote_code=True)

# Set up the AraBERT text preprocessor
arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv02")

# Split the text into chunks based on token count
def split_text_into_chunks(text, tokenizer, max_length):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
    return chunks

# Split the text into sentences using regular expressions
def extract_sentences(text):
    sentences = re.split(r'(?<=[.!؟]) +', text)
    return sentences

# Extract quotations from the text
def extract_quotes(text):
    quotes = re.findall(r'[“"«](.*?)[”"»]', text)
    return quotes

# Count the tokens in the text
def count_tokens(text, tokenizer):
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# Extract text from PDF files
def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text

# Extract text from DOCX files
def extract_docx_text(file_path):
    doc = docx.Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

# Read text from a file while handling encoding issues
def read_text_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, "r", encoding="latin-1") as file:
                return file.read()
        except UnicodeDecodeError:
            with open(file_path, "r", encoding="cp1252") as file:
                return file.read()

# Extract scenes from the text by splitting on the "داخلي"/"خارجي" markers
def extract_scenes(text):
    scenes = re.split(r'داخلي|خارجي', text)
    scenes = [scene.strip() for scene in scenes if scene.strip()]
    return scenes

# Extract scene details (location and time)
def extract_scene_details(scene):
    details = {}
    location_match = re.search(r'(داخلي|خارجي)', scene)
    time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)

    if location_match:
        details['location'] = location_match.group()
    if time_match:
        details['time'] = time_match.group()

    return details

# Extract character ages
def extract_ages(text):
    ages = re.findall(r'\b(\d{1,2})\s*(?:عام|سنة|سنوات)\s*(?:من العمر|عمره|عمرها)', text)
    return ages

# Extract character descriptions
def extract_character_descriptions(text):
    descriptions = re.findall(r'شخصية\s*(.*?)\s*:\s*وصف\s*(.*?)\s*(?:\.|،)', text, re.DOTALL)
    return descriptions

# Count how often each character appears
def extract_character_frequency(entities):
    persons = [ent[0] for ent in entities['PERSON']]
    frequency = Counter(persons)
    return frequency

# Extract dialogue lines and identify the speakers
def extract_dialogues(text):
    dialogues = re.findall(r'(.*?)(?:\s*:\s*)(.*?)(?=\n|$)', text, re.DOTALL)
    return dialogues

# Process the files and split them into chunks based on token count
def process_files(input_directory, output_directory_950):
    for file_name in os.listdir(input_directory):
        file_path = os.path.join(input_directory, file_name)

        if os.path.isdir(file_path):  # skip sub-directories
            continue

        if file_path.endswith(".pdf"):
            text = extract_pdf_text(file_path)
        elif file_path.endswith(".docx"):
            text = extract_docx_text(file_path)
        else:
            text = read_text_file(file_path)

        # Split the text into chunks of at most 950 tokens
        chunks_950 = split_text_into_chunks(text, gpt2_tokenizer, 950)
        for i, chunk in enumerate(chunks_950):
            output_file_950 = os.path.join(output_directory_950, f"{os.path.splitext(file_name)[0]}_part_{i+1}.txt")
            with open(output_file_950, "w", encoding="utf-8") as file:
                file.write(chunk)

# Analyze the texts, extract the information, and save the results
def analyze_files(input_directory, output_directory, tokenizer, max_length):
    for file_name in os.listdir(input_directory):
        file_path = os.path.join(input_directory, file_name)

        if os.path.isdir(file_path):  # skip sub-directories
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        chunks = split_text_into_chunks(text, tokenizer, max_length)

        # Run the analysis on each chunk
        for chunk in chunks:
            sentences = extract_sentences(chunk)
            quotes = extract_quotes(chunk)
            token_count = count_tokens(chunk, tokenizer)
            scenes = extract_scenes(chunk)
            ages = extract_ages(chunk)
            character_descriptions = extract_character_descriptions(chunk)
            dialogues = extract_dialogues(chunk)
            scene_details = [extract_scene_details(scene) for scene in scenes]

            # Save the results
            with open(os.path.join(output_directory, f"{file_name}_sentences.txt"), "a", encoding="utf-8") as file:
                file.write("\n".join(sentences))

            with open(os.path.join(output_directory, f"{file_name}_quotes.txt"), "a", encoding="utf-8") as file:
                file.write("\n".join(quotes))

            with open(os.path.join(output_directory, f"{file_name}_token_count.txt"), "a", encoding="utf-8") as file:
                file.write(str(token_count))

            with open(os.path.join(output_directory, f"{file_name}_scenes.txt"), "a", encoding="utf-8") as file:
                file.write("\n".join(scenes))

            with open(os.path.join(output_directory, f"{file_name}_scene_details.txt"), "a", encoding="utf-8") as file:
                file.write(str(scene_details))

            with open(os.path.join(output_directory, f"{file_name}_ages.txt"), "a", encoding="utf-8") as file:
                file.write(str(ages))

            with open(os.path.join(output_directory, f"{file_name}_character_descriptions.txt"), "a", encoding="utf-8") as file:
                file.write(str(character_descriptions))

            with open(os.path.join(output_directory, f"{file_name}_dialogues.txt"), "a", encoding="utf-8") as file:
                file.write(str(dialogues))

# Set the paths
input_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/in"
output_directory_950 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/1000"
input_directory_950 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/1000"
output_directory_950_out = "/Volumes/CLOCKWORK T/clockworkspace/first pro/out/1000"

# Make sure the output directories exist
os.makedirs(output_directory_950, exist_ok=True)
os.makedirs(output_directory_950_out, exist_ok=True)

# Process and split the files
process_files(input_directory, output_directory_950)

# Analyze the files that were split into 950-token chunks
analyze_files(input_directory_950, output_directory_950_out, gpt2_tokenizer, 950)

print("تمت معالجة الملفات وتحليلها بنجاح.")  # "Files processed and analyzed successfully."
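
Note (not part of the commit): split_text_into_chunks makes a hard cut every max_length tokens, so a chunk can end mid-sentence and the next chunk starts without that context. Below is a minimal sketch of an overlapping-window variant under the same tokenizer interface (tokenize / convert_tokens_to_string); the function name and the overlap parameter are illustrative, not from the source.

# Hypothetical variant: consecutive windows share `overlap` tokens so a sentence
# cut at a chunk boundary still appears with context in the next chunk.
def split_text_into_overlapping_chunks(text, tokenizer, max_length, overlap=50):
    tokens = tokenizer.tokenize(text)
    step = max_length - overlap
    chunks = []
    for i in range(0, max(len(tokens), 1), step):
        window = tokens[i:i + max_length]
        if not window:
            break
        chunks.append(tokenizer.convert_tokens_to_string(window))
        if i + max_length >= len(tokens):
            break
    return chunks

Whether the overlap is worth the duplicated text in the output files depends on the downstream analysis.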
firstkha.py ADDED
@@ -0,0 +1,231 @@
import os
import re
import torch
from collections import Counter
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForTokenClassification
import PyPDF2
import openai
import docx


# Use the GPU when one is available
device = 0 if torch.cuda.is_available() else -1

# Load the BERT, GPT-2, ELECTRA, and AraBERT models
arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")

arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base")

arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator")
arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator")

arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

# Load the CAMeL-Lab named entity recognition model
ner_tokenizer = AutoTokenizer.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
ner_model = AutoModelForTokenClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-ner")
nlp_ner = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, device=device)  # run on the GPU when available

# Run NER over the text with the transformers pipeline
def camel_ner_analysis(text):
    ner_results = nlp_ner(text)
    entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
    for entity in ner_results:
        entity_type = entity["entity"]
        if entity_type in entity_dict:
            entity_dict[entity_type].append((entity["word"], entity_type))
    return entity_dict

# Split the text into chunks based on token count
def split_text_into_chunks(text, tokenizer, max_length):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_length):
        chunk_tokens = tokens[i:i + max_length]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
    return chunks

# Split the text into sentences using regular expressions
def extract_sentences(text):
    sentences = re.split(r'(?<=[.!؟]) +', text)
    return sentences

# Extract quotations from the text
def extract_quotes(text):
    quotes = re.findall(r'[“"«](.*?)[”"»]', text)
    return quotes

# Count the tokens in the text
def count_tokens(text, tokenizer):
    tokens = tokenizer.tokenize(text)
    return len(tokens)

# Extract text from PDF files
def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text

# Extract text from DOCX files
def extract_docx_text(file_path):
    doc = docx.Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text

# Read text from a file while handling encoding issues
def read_text_file(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except UnicodeDecodeError:
        try:
            with open(file_path, "r", encoding="latin-1") as file:
                return file.read()
        except UnicodeDecodeError:
            with open(file_path, "r", encoding="cp1252") as file:
                return file.read()

# Extract scenes from the text by splitting on the "داخلي"/"خارجي" markers
def extract_scenes(text):
    scenes = re.split(r'داخلي|خارجي', text)
    scenes = [scene.strip() for scene in scenes if scene.strip()]
    return scenes

# Extract scene details (location and time)
def extract_scene_details(scene):
    details = {}
    location_match = re.search(r'(داخلي|خارجي)', scene)
    time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)

    if location_match:
        details['location'] = location_match.group()
    if time_match:
        details['time'] = time_match.group()

    return details

# Extract character ages
def extract_ages(text):
    ages = re.findall(r'\b(\d{1,2})\s*(?:عام|سنة|سنوات)\s*(?:من العمر|عمره|عمرها)', text)
    return ages

# Extract character descriptions
def extract_character_descriptions(text):
    descriptions = re.findall(r'شخصية\s*(.*?)\s*:\s*وصف\s*(.*?)\s*(?:\.|،)', text, re.DOTALL)
    return descriptions

# Count how often each character appears among the NER results
def extract_character_frequency(entities):
    persons = [ent[0] for ent in entities['PERSON']]
    frequency = Counter(persons)
    return frequency

# Extract dialogue lines and identify the speakers
def extract_dialogues(text):
    dialogues = re.findall(r'(.*?)(?:\s*:\s*)(.*?)(?=\n|$)', text, re.DOTALL)
    return dialogues

# Process the files and split them into chunks based on token count
def process_files(input_directory, output_directory_500):
    for file_name in os.listdir(input_directory):
        file_path = os.path.join(input_directory, file_name)

        if os.path.isdir(file_path):  # skip sub-directories
            continue

        if file_path.endswith(".pdf"):
            text = extract_pdf_text(file_path)
        elif file_path.endswith(".docx"):
            text = extract_docx_text(file_path)
        else:
            text = read_text_file(file_path)

        # Split the text into chunks of at most 450 tokens
        chunks_450 = split_text_into_chunks(text, arabic_bert_tokenizer, 450)
        for i, chunk in enumerate(chunks_450):
            output_file_450 = os.path.join(output_directory_500, f"{os.path.splitext(file_name)[0]}_part_{i+1}.txt")
            with open(output_file_450, "w", encoding="utf-8") as file:
                file.write(chunk)

# Analyze the texts, extract the information, and save the results
def analyze_files(input_directory, output_directory, tokenizer, max_length):
    for file_name in os.listdir(input_directory):
        file_path = os.path.join(input_directory, file_name)

        if os.path.isdir(file_path):  # skip sub-directories
            continue

        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()

        chunks = split_text_into_chunks(text, tokenizer, max_length)

        # Run the analysis on each chunk
        for chunk in chunks:
            entities = camel_ner_analysis(chunk)
            sentences = extract_sentences(chunk)
            quotes = extract_quotes(chunk)
            token_count = count_tokens(chunk, tokenizer)
            scenes = extract_scenes(chunk)
            ages = extract_ages(chunk)
            character_descriptions = extract_character_descriptions(chunk)
            character_frequency = extract_character_frequency(entities)
            dialogues = extract_dialogues(chunk)
            scene_details = [extract_scene_details(scene) for scene in scenes]

            # Save the results
            with open(os.path.join(output_directory, f"{file_name}_entities.txt"), "a", encoding="utf-8") as file:
                file.write(str(entities))

            with open(os.path.join(output_directory, f"{file_name}_sentences.txt"), "a", encoding="utf-8") as file:
                file.write("\n".join(sentences))

            with open(os.path.join(output_directory, f"{file_name}_quotes.txt"), "a", encoding="utf-8") as file:
                file.write("\n".join(quotes))

            with open(os.path.join(output_directory, f"{file_name}_token_count.txt"), "a", encoding="utf-8") as file:
                file.write(str(token_count))

            with open(os.path.join(output_directory, f"{file_name}_scenes.txt"), "a", encoding="utf-8") as file:
                file.write("\n".join(scenes))

            with open(os.path.join(output_directory, f"{file_name}_scene_details.txt"), "a", encoding="utf-8") as file:
                file.write(str(scene_details))

            with open(os.path.join(output_directory, f"{file_name}_ages.txt"), "a", encoding="utf-8") as file:
                file.write(str(ages))

            with open(os.path.join(output_directory, f"{file_name}_character_descriptions.txt"), "a", encoding="utf-8") as file:
                file.write(str(character_descriptions))

            with open(os.path.join(output_directory, f"{file_name}_character_frequency.txt"), "a", encoding="utf-8") as file:
                file.write(str(character_frequency))

            with open(os.path.join(output_directory, f"{file_name}_dialogues.txt"), "a", encoding="utf-8") as file:
                file.write(str(dialogues))

# Set the paths
input_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/in"
output_directory_450 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/500"
input_directory_450 = "/Volumes/CLOCKWORK T/clockworkspace/first pro/500"
output_directory_450_out = "/Volumes/CLOCKWORK T/clockworkspace/first pro/out/500"

# Make sure the output directories exist
os.makedirs(output_directory_450, exist_ok=True)
os.makedirs(output_directory_450_out, exist_ok=True)

# Process and split the files
process_files(input_directory, output_directory_450)

# Analyze the files that were split into 450-token chunks (re-chunked at BERT's 512-token limit)
analyze_files(input_directory_450, output_directory_450_out, arabic_bert_tokenizer, 512)

print("تمت معالجة الملفات وتحليلها بنجاح.")  # "Files processed and analyzed successfully."
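
Note (not part of the commit): token-classification models such as CAMeL-Lab/bert-base-arabic-camelbert-msa-ner typically emit BIO-style labels (e.g. B-PERS, I-LOC) rather than the bare PERSON/LOC/ORG/DATE keys that camel_ner_analysis filters on, so those buckets may stay empty as written. A minimal sketch of a label normalization, assuming a BIO tag set; the PER/PERS aliases are assumptions and should be checked against ner_model.config.id2label:

# Hypothetical helper: map BIO-style NER tags onto the keys used above.
def normalize_ner_label(raw_label):
    base = raw_label.split("-")[-1]                       # "B-PERS" -> "PERS"
    return {"PER": "PERSON", "PERS": "PERSON"}.get(base, base)

# Inside camel_ner_analysis, the filter would then read:
#     entity_type = normalize_ner_label(entity["entity"])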