mohamedrady committed
Commit 260a06d · verified · 1 Parent(s): 49fcff6

Update app.py

Files changed (1)
  1. app.py +28 -38
app.py CHANGED
@@ -9,21 +9,19 @@ from transformers import pipeline, AutoModel, AutoTokenizer
import PyPDF2
import gradio as gr
import openai
- from haystack.nodes import FARMReader
- from paddlenlp import Taskflow

+
+ # Set the OpenAI API token
+ openai.api_key = "sk-proj-62TDbO5KABSdkZaFPPD4T3BlbkFJkhqOYpHhL6OucTzNdWSU"

# Download and set up the required tools
nltk.download('punkt')

# Check for GPU availability and use it
device = 0 if torch.cuda.is_available() else -1

# Load the language-analysis models
analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)

- # Load the camel_tools named-entity recognition model
- ner = NERecognizer.pretrained()
-
# Load the BERT, GPT2, ELECTRA, and AraBERT models
arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
@@ -37,22 +35,14 @@ arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-dis
arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

- # Set up the OpenAI API
- openai.api_key = os.getenv("sk-proj-62TDbO5KABSdkZaFPPD4T3BlbkFJkhqOYpHhL6OucTzNdWSU")
-
- # Set up farm-haystack
- reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
-
- # Set up paddlenlp
- ner_task = Taskflow("ner")
-
# Function to analyze text using camel_tools
def camel_ner_analysis(text):
+    ner = NERecognizer.pretrained()
    tokens = simple_word_tokenize(text)
    entities = ner.predict(tokens)
    entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
    for token, tag in zip(tokens, entities):
        if tag in entity_dict:
            entity_dict[tag].append((token, tag))
    return entity_dict

@@ -71,7 +61,7 @@ def nltk_extract_quotes(text):
    quotes = []
    sentences = nltk.tokenize.sent_tokenize(text, language='arabic')
    for sentence in sentences:
        if '"' in sentence or '«' in sentence or '»' in sentence:
            quotes.append(sentence)
    return quotes

@@ -82,10 +72,10 @@ def count_tokens(text):

# Function to extract text from PDF files
def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text
@@ -93,7 +83,7 @@ def extract_pdf_text(file_path):
# Function to extract scenes from the text
def extract_scenes(text):
    scenes = re.split(r'داخلي|خارجي', text)
    scenes = [scene.strip() for scene in scenes if scene.strip()]
    return scenes

# Function to extract scene details (location and time)
@@ -102,9 +92,9 @@ def extract_scene_details(scene):
    location_match = re.search(r'(داخلي|خارجي)', scene)
    time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)

    if location_match:
        details['location'] = location_match.group()
    if time_match:
        details['time'] = time_match.group()

    return details
@@ -135,11 +125,11 @@ def analyze_and_complete(file_paths):
    results = []
    output_directory = os.getenv("SPACE_DIR", "/app/output")

    for file_path in file_paths:
        if file_path.endswith(".pdf"):
            text = extract_pdf_text(file_path)
        else:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()

        filename_prefix = os.path.splitext(os.path.basename(file_path))[0]
@@ -155,47 +145,47 @@ def analyze_and_complete(file_paths):
        character_frequency = extract_character_frequency(camel_entities)
        dialogues = extract_dialogues(text)

        scene_details = [extract_scene_details(scene) for scene in scenes]

        # Save the results to files
        with open(os.path.join(output_directory, f"{filename_prefix}_entities.txt"), "w", encoding="utf-8") as file:
            file.write(str(camel_entities))

        with open(os.path.join(output_directory, f"{filename_prefix}_sentiments.txt"), "w", encoding="utf-8") as file:
            file.write(str(sentiments))

        with open(os.path.join(output_directory, f"{filename_prefix}_sentences.txt"), "w", encoding="utf-8") as file:
            file.write("\n".join(sentences))

        with open(os.path.join(output_directory, f"{filename_prefix}_quotes.txt"), "w", encoding="utf-8") as file:
            file.write("\n".join(quotes))

        with open(os.path.join(output_directory, f"{filename_prefix}_token_count.txt"), "w", encoding="utf-8") as file:
            file.write(str(token_count))

        with open(os.path.join(output_directory, f"{filename_prefix}_scenes.txt"), "w", encoding="utf-8") as file:
            file.write("\n".join(scenes))

        with open(os.path.join(output_directory, f"{filename_prefix}_scene_details.txt"), "w", encoding="utf-8") as file:
            file.write(str(scene_details))

        with open(os.path.join(output_directory, f"{filename_prefix}_ages.txt"), "w", encoding="utf-8") as file:
            file.write(str(ages))

        with open(os.path.join(output_directory, f"{filename_prefix}_character_descriptions.txt"), "w", encoding="utf-8") as file:
            file.write(str(character_descriptions))

        with open(os.path.join(output_directory, f"{filename_prefix}_character_frequency.txt"), "w", encoding="utf-8") as file:
            file.write(str(character_frequency))

        with open(os.path.join(output_directory, f"{filename_prefix}_dialogues.txt"), "w", encoding="utf-8") as file:
            file.write(str(dialogues))

        results.append((str(camel_entities), str(sentiments), "\n".join(sentences), "\n".join(quotes), str(token_count), "\n".join(scenes), str(scene_details), str(ages), str(character_descriptions), str(character_frequency), str(dialogues)))

    return results

- # Define the Gradio interface
+ ## Define the Gradio interface
interface = gr.Interface(
    fn=analyze_and_complete,
    inputs=gr.File(file_count="multiple", type="filepath"),
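One thing worth flagging in this change: the removed line passed the secret itself to os.getenv(), which looks up an environment variable *named* by that string and so almost certainly returned None, while the replacement hardcodes the key in app.py, where anyone who can read the repository can see it (it should be treated as leaked and rotated). A minimal sketch of the usual pattern, assuming the key is stored in an environment variable; OPENAI_API_KEY is an assumed name, settable as a secret in the Space settings rather than in code:

import os
import openai

# Assumed variable name: configure OPENAI_API_KEY as a Space secret
# instead of committing the key to app.py.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("OPENAI_API_KEY is not set")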
 
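A smaller observation: analyzer is built from distilbert-base-uncased-finetuned-sst-2-english, an English SST-2 checkpoint, while the rest of the pipeline targets Arabic text, so its labels on Arabic input are unlikely to be meaningful. If Arabic sentiment is the goal, the swap is one line; CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment is one candidate model on the Hub (my suggestion, not something this commit uses):

from transformers import pipeline

# Hypothetical substitution: an Arabic sentiment model in place of the
# English SST-2 checkpoint used in app.py.
analyzer = pipeline("sentiment-analysis", model="CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment", device=device)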
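Moving ner = NERecognizer.pretrained() inside camel_ner_analysis avoids the module-level load, but it also means the CAMeL Tools model is reloaded on every call. A cached loader keeps the lazy loading without the per-call cost; a sketch, assuming the usual camel_tools import paths, and using predict_sentence, which takes a single token list (ner.predict expects a list of sentences):

from functools import lru_cache

from camel_tools.ner import NERecognizer
from camel_tools.tokenizers.word import simple_word_tokenize

@lru_cache(maxsize=1)
def get_ner():
    # Loaded once on first use, then reused by every later call.
    return NERecognizer.pretrained()

def camel_ner_analysis(text):
    tokens = simple_word_tokenize(text)
    entities = get_ner().predict_sentence(tokens)
    # NOTE: CAMeL Tools emits BIO-style labels (e.g. "B-PER"), so these
    # keys may need a mapping step; the filtering is kept as in app.py.
    entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
    for token, tag in zip(tokens, entities):
        if tag in entity_dict:
            entity_dict[tag].append((token, tag))
    return entity_dict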
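nltk_extract_quotes calls nltk.tokenize.sent_tokenize(text, language='arabic'), but the punkt data downloaded above does not, as far as I know, include an Arabic model, so that call should raise a LookupError at runtime. A rough fallback that splits on sentence-final punctuation, including the Arabic question mark, keeps the rest of the function unchanged:

import re

def arabic_sent_tokenize(text):
    # Approximate sentence splitting for Arabic: punkt ships no Arabic
    # model, so split after '.', '!', '?' and the Arabic '؟'.
    return [s.strip() for s in re.split(r'(?<=[.!?؟])\s+', text) if s.strip()]

Inside nltk_extract_quotes, sentences = arabic_sent_tokenize(text) would then replace the sent_tokenize call.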
 
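Finally, in extract_pdf_text, page.extract_text() yields an empty string for pages without an extractable text layer (and None in some older PyPDF2 releases), so the accumulation can produce silent gaps or a TypeError. A slightly more defensive version of the same loop, under the PyPDF2 3.x API already used here:

import PyPDF2

def extract_pdf_text(file_path):
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Guard against pages that yield no text (e.g. scanned images).
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)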