mohamedrady committed
Commit dde43be · verified · 1 Parent(s): bbe3161

Update app.py

Files changed (1):
  1. app.py +45 -23
app.py CHANGED
@@ -2,13 +2,15 @@ import os
 import re
 import torch
 from collections import Counter
-from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForTokenClassification, AutoModelForCausalLM
+from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
 import PyPDF2
 import openai
 import docx
 from arabert.preprocess import ArabertPreprocessor
 import gradio as gr
 
+# Set the OpenAI API key
+openai.api_key = "sk-proj-62TDbO5KABSdkZaFPPD4T3BlbkFJkhqOYpHhL6OucTzNdWSU"
+
 # Check for GPU availability and use it
 device = 0 if torch.cuda.is_available() else -1
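
The new hunk hardcodes the OpenAI key in app.py. A minimal alternative sketch, not part of this commit, assuming the key is exported in the OPENAI_API_KEY environment variable:

import os
import openai

# Read the key from the environment instead of committing it to source;
# os.environ.get returns None when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")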
@@ -121,10 +123,8 @@ def extract_dialogues(text):
     return dialogues
 
 # Function to process files and split them by token count
-def process_files(input_directory, output_directory_950):
-    for file_name in os.listdir(input_directory):
-        file_path = os.path.join(input_directory, file_name)
-
+def process_files(input_files, output_directory_950):
+    for file_path in input_files:
         if os.path.isdir(file_path):  # make sure the entry is not a directory
             continue
 
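Under the new signature, callers pass an explicit list of file paths rather than a directory to scan; for example (paths hypothetical):

process_files(["/tmp/script_part1.txt", "/tmp/script_part2.txt"], "/tmp/chunks_950")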
@@ -138,15 +138,14 @@ def process_files(input_directory, output_directory_950):
         # Split the text into chunks of at most 950 tokens
         chunks_950 = split_text_into_chunks(text, gpt2_tokenizer, 950)
         for i, chunk in enumerate(chunks_950):
-            output_file_950 = os.path.join(output_directory_950, f"{os.path.splitext(file_name)[0]}_part_{i+1}.txt")
+            output_file_950 = os.path.join(output_directory_950, f"{os.path.splitext(os.path.basename(file_path))[0]}_part_{i+1}.txt")
             with open(output_file_950, "w", encoding="utf-8") as file:
                 file.write(chunk)
 
 # Function to analyze texts, extract information, and save the results
-def analyze_files(input_directory, output_directory, tokenizer, max_length):
-    for file_name in os.listdir(input_directory):
-        file_path = os.path.join(input_directory, file_name)
-
+def analyze_files(input_files, output_directory, tokenizer, max_length):
+    results = []
+    for file_path in input_files:
         if os.path.isdir(file_path):  # make sure the entry is not a directory
             continue
 
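split_text_into_chunks is called with (text, tokenizer, limit) but is not shown in this diff. A minimal sketch of such a helper, assuming a Hugging Face tokenizer like gpt2_tokenizer:

def split_text_into_chunks(text, tokenizer, max_tokens):
    # Tokenize once, slice the ids into windows of at most max_tokens,
    # and decode each window back into text.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(token_ids[i:i + max_tokens])
        for i in range(0, len(token_ids), max_tokens)
    ]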
@@ -166,31 +165,45 @@ def analyze_files(input_directory, output_directory, tokenizer, max_length):
         dialogues = extract_dialogues(chunk)
         scene_details = [extract_scene_details(scene) for scene in scenes]
 
+        result = {
+            "sentences": sentences,
+            "quotes": quotes,
+            "token_count": token_count,
+            "scenes": scenes,
+            "scene_details": scene_details,
+            "ages": ages,
+            "character_descriptions": character_descriptions,
+            "dialogues": dialogues
+        }
+
+        results.append(result)
+
         # Save the results
-        with open(os.path.join(output_directory, f"{file_name}_sentences.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_sentences.txt"), "a", encoding="utf-8") as file:
             file.write("\n".join(sentences))
 
-
-        with open(os.path.join(output_directory, f"{file_name}_quotes.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_quotes.txt"), "a", encoding="utf-8") as file:
             file.write("\n".join(quotes))
 
-        with open(os.path.join(output_directory, f"{file_name}_token_count.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_token_count.txt"), "a", encoding="utf-8") as file:
             file.write(str(token_count))
 
-        with open(os.path.join(output_directory, f"{file_name}_scenes.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scenes.txt"), "a", encoding="utf-8") as file:
             file.write("\n".join(scenes))
 
-        with open(os.path.join(output_directory, f"{file_name}_scene_details.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scene_details.txt"), "a", encoding="utf-8") as file:
             file.write(str(scene_details))
 
-        with open(os.path.join(output_directory, f"{file_name}_ages.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_ages.txt"), "a", encoding="utf-8") as file:
             file.write(str(ages))
 
-        with open(os.path.join(output_directory, f"{file_name}_character_descriptions.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_character_descriptions.txt"), "a", encoding="utf-8") as file:
             file.write(str(character_descriptions))
 
-        with open(os.path.join(output_directory, f"{file_name}_dialogues.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_dialogues.txt"), "a", encoding="utf-8") as file:
             file.write(str(dialogues))
+
+    return results
 
 # Define the paths
 input_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/in"
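
The hunk keeps writing results with file.write(str(...)), which produces Python reprs that are awkward to load back. A sketch of a JSON-based alternative, not part of this commit; ensure_ascii=False keeps the Arabic text readable in the output files:

import json

with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scene_details.json"), "w", encoding="utf-8") as file:
    # json.dump round-trips cleanly, unlike str() of nested dicts and lists.
    json.dump(scene_details, file, ensure_ascii=False, indent=2)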
@@ -211,13 +224,22 @@ analyze_files(input_directory_950, output_directory_950_out, gpt2_tokenizer, 950
 print("Files processed and analyzed successfully.")
 
 # Define the Gradio interface
+def analyze_and_complete(input_files):
+    # Process and split the files
+    process_files(input_files, output_directory_950)
+
+    # Analyze the files that were split into 950-token chunks
+    results = analyze_files(input_directory_950, output_directory_950_out, gpt2_tokenizer, 950)
+
+    return results
+
 interface = gr.Interface(
-    fn=analyze_files,
-    inputs=gr.File(file_count="multiple", type="filepath"),
-    outputs=gr.JSON(),
+    fn=analyze_and_complete,
+    inputs=gr.File(file_count="multiple", type="file"),
+    outputs="json",
     title="Movie Script Analyzer and Completer",
     description="Upload text, PDF, or DOCX files to analyze and complete the movie script."
 )
 
 if __name__ == "__main__":
-    interface.launch()
+    interface.launch(share=True)
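
Two wiring details in this hunk may bite at runtime: with gr.File(..., type="file") in Gradio 3.x the callback receives tempfile wrappers rather than path strings, and analyze_and_complete hands analyze_files the input_directory_950 string, which the new loop would iterate character by character. A sketch of one way to adapt, assuming the Gradio 3.x file-object behavior:

import os

def analyze_and_complete(input_files):
    # type="file" yields tempfile wrappers whose .name holds the path.
    paths = [f if isinstance(f, str) else f.name for f in input_files]
    process_files(paths, output_directory_950)

    # analyze_files now expects an iterable of file paths, so list the
    # chunk directory instead of passing the directory string itself.
    chunk_paths = [os.path.join(input_directory_950, name)
                   for name in os.listdir(input_directory_950)]
    return analyze_files(chunk_paths, output_directory_950_out, gpt2_tokenizer, 950)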