Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -2,13 +2,15 @@ import os
 import re
 import torch
 from collections import Counter
-from transformers import pipeline, AutoModel, AutoTokenizer,
+from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM, AutoModelForTokenClassification
 import PyPDF2
 import openai
 import docx
 from arabert.preprocess import ArabertPreprocessor
 import gradio as gr

+# Set the OpenAI API key
+openai.api_key = "sk-proj-62TDbO5KABSdkZaFPPD4T3BlbkFJkhqOYpHhL6OucTzNdWSU"

 # Check whether a GPU is available and use it
 device = 0 if torch.cuda.is_available() else -1
@@ -121,10 +123,8 @@ def extract_dialogues(text):
     return dialogues

 # Function to process the files and split them based on token count
-def process_files(input_directory, output_directory_950):
-    for
-        file_path = os.path.join(input_directory, file_name)
-
+def process_files(input_files, output_directory_950):
+    for file_path in input_files:
         if os.path.isdir(file_path):  # Make sure the file is not a directory
             continue

@@ -138,15 +138,14 @@ def process_files(input_directory, output_directory_950):
         # Split the text into chunks of no more than 950 tokens
         chunks_950 = split_text_into_chunks(text, gpt2_tokenizer, 950)
         for i, chunk in enumerate(chunks_950):
-            output_file_950 = os.path.join(output_directory_950, f"{os.path.splitext(
+            output_file_950 = os.path.join(output_directory_950, f"{os.path.splitext(os.path.basename(file_path))[0]}_part_{i+1}.txt")
             with open(output_file_950, "w", encoding="utf-8") as file:
                 file.write(chunk)

 # Function to analyze the texts, extract information, and save the results
-def analyze_files(input_directory, output_directory, tokenizer, max_length):
-
-
-
+def analyze_files(input_files, output_directory, tokenizer, max_length):
+    results = []
+    for file_path in input_files:
         if os.path.isdir(file_path):  # Make sure the file is not a directory
             continue

@@ -166,31 +165,45 @@ def analyze_files(input_directory, output_directory, tokenizer, max_length):
         dialogues = extract_dialogues(chunk)
         scene_details = [extract_scene_details(scene) for scene in scenes]

+        result = {
+            "sentences": sentences,
+            "quotes": quotes,
+            "token_count": token_count,
+            "scenes": scenes,
+            "scene_details": scene_details,
+            "ages": ages,
+            "character_descriptions": character_descriptions,
+            "dialogues": dialogues
+        }
+
+        results.append(result)
+
         # Save the results
-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_sentences.txt"), "a", encoding="utf-8") as file:
             file.write("\n".join(sentences))

-
-        with open(os.path.join(output_directory, f"{file_name}_quotes.txt"), "a", encoding="utf-8") as file:
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_quotes.txt"), "a", encoding="utf-8") as file:
             file.write("\n".join(quotes))

-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_token_count.txt"), "a", encoding="utf-8") as file:
             file.write(str(token_count))

-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scenes.txt"), "a", encoding="utf-8") as file:
             file.write("\n".join(scenes))

-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scene_details.txt"), "a", encoding="utf-8") as file:
             file.write(str(scene_details))

-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_ages.txt"), "a", encoding="utf-8") as file:
             file.write(str(ages))

-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_character_descriptions.txt"), "a", encoding="utf-8") as file:
             file.write(str(character_descriptions))

-        with open(os.path.join(output_directory, f"{
+        with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_dialogues.txt"), "a", encoding="utf-8") as file:
             file.write(str(dialogues))
+
+    return results

 # Define the paths
 input_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/in"
@@ -211,13 +224,22 @@ analyze_files(input_directory_950, output_directory_950_out, gpt2_tokenizer, 950
 print("تمت معالجة الملفات وتحليلها بنجاح.")

 # Define the Gradio interface
+def analyze_and_complete(input_files):
+    # Process and split the files
+    process_files(input_files, output_directory_950)
+
+    # Analyze the files split into 950-token chunks
+    results = analyze_files(input_directory_950, output_directory_950_out, gpt2_tokenizer, 950)
+
+    return results
+
 interface = gr.Interface(
-    fn=
-    inputs=gr.File(file_count="multiple", type="
-    outputs=
+    fn=analyze_and_complete,
+    inputs=gr.File(file_count="multiple", type="file"),
+    outputs="json",
     title="Movie Script Analyzer and Completer",
     description="Upload text, PDF, or DOCX files to analyze and complete the movie script."
 )

 if __name__ == "__main__":
-    interface.launch()
+    interface.launch(share=True)