mohamedrady committed on
Commit
c28bde1
·
verified ·
1 Parent(s): a531aee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -21
app.py CHANGED
@@ -52,12 +52,12 @@ def count_tokens(text, tokenizer):
52
 
53
  # دالة لاستخراج النص من ملفات PDF
54
  def extract_pdf_text(file_path):
 
55
  with open(file_path, "rb") as pdf_file:
56
  pdf_reader = PyPDF2.PdfReader(pdf_file)
57
- text = ""
58
  for page_num in range(len(pdf_reader.pages)):
59
  page = pdf_reader.pages[page_num]
60
- text += page.extract_text()
61
  return text
62
 
63
  # دالة لاستخراج النص من ملفات DOCX
@@ -81,15 +81,15 @@ def read_text_file(file_path):
81
 
82
  # دالة لاستخراج المشاهد من النص
83
  def extract_scenes(text):
84
- scenes = re.split(r'داخلي|خارجي', text)
85
  scenes = [scene.strip() for scene in scenes if scene.strip()]
86
  return scenes
87
 
88
  # دالة لاستخراج تفاصيل المشهد (المكان والوقت)
89
  def extract_scene_details(scene):
90
  details = {}
91
- location_match = re.search(r'(داخلي|خارجي)', scene)
92
- time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب)', scene)
93
 
94
  if location_match:
95
  details['location'] = location_match.group()
@@ -176,29 +176,30 @@ def analyze_files(input_files, output_directory, tokenizer, max_length):
176
  results.append(result)
177
 
178
  # حفظ النتائج
179
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_sentences.txt"), "a", encoding="utf-8") as file:
180
- file.write("\n".join(sentences))
 
181
 
182
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_quotes.txt"), "a", encoding="utf-8") as file:
183
- file.write("\n".join(quotes))
184
 
185
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_token_count.txt"), "a", encoding="utf-8") as file:
186
- file.write(str(token_count))
187
 
188
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scenes.txt"), "a", encoding="utf-8") as file:
189
- file.write("\n".join(scenes))
190
 
191
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_scene_details.txt"), "a", encoding="utf-8") as file:
192
- file.write(str(scene_details))
193
 
194
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_ages.txt"), "a", encoding="utf-8") as file:
195
- file.write(str(ages))
196
 
197
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_character_descriptions.txt"), "a", encoding="utf-8") as file:
198
- file.write(str(character_descriptions))
199
 
200
- with open(os.path.join(output_directory, f"{os.path.basename(file_path)}_dialogues.txt"), "a", encoding="utf-8") as file:
201
- file.write(str(dialogues))
202
 
203
  return results
204
 
 
52
 
53
  # دالة لاستخراج النص من ملفات PDF
54
  def extract_pdf_text(file_path):
55
+ text = ""
56
  with open(file_path, "rb") as pdf_file:
57
  pdf_reader = PyPDF2.PdfReader(pdf_file)
 
58
  for page_num in range(len(pdf_reader.pages)):
59
  page = pdf_reader.pages[page_num]
60
+ text += page.extract_text() or ""
61
  return text
62
 
63
  # دالة لاستخراج النص من ملفات DOCX
 
81
 
82
  # دالة لاستخراج المشاهد من النص
83
  def extract_scenes(text):
84
+ scenes = re.split(r'داخلي|خارجي|... داخلي ...|... خارجي ...', text)
85
  scenes = [scene.strip() for scene in scenes if scene.strip()]
86
  return scenes
87
 
88
  # دالة لاستخراج تفاصيل المشهد (المكان والوقت)
89
  def extract_scene_details(scene):
90
  details = {}
91
+ location_match = re.search(r'(داخلي|خارجي|... داخلي ...|... خارجي ...)', scene)
92
+ time_match = re.search(r'(ليلاً|نهاراً|شروق|غروب|... ليل ...|... نهار ...)', scene)
93
 
94
  if location_match:
95
  details['location'] = location_match.group()
 
176
  results.append(result)
177
 
178
  # حفظ النتائج
179
+ base_filename = os.path.basename(file_path)
180
+ with open(os.path.join(output_directory, f"{base_filename}_sentences.txt"), "a", encoding="utf-8") as file:
181
+ file.write("\n".join(sentences) + "\n")
182
 
183
+ with open(os.path.join(output_directory, f"{base_filename}_quotes.txt"), "a", encoding="utf-8") as file:
184
+ file.write("\n".join(quotes) + "\n")
185
 
186
+ with open(os.path.join(output_directory, f"{base_filename}_token_count.txt"), "a", encoding="utf-8") as file:
187
+ file.write(str(token_count) + "\n")
188
 
189
+ with open(os.path.join(output_directory, f"{base_filename}_scenes.txt"), "a", encoding="utf-8") as file:
190
+ file.write("\n".join(scenes) + "\n")
191
 
192
+ with open(os.path.join(output_directory, f"{base_filename}_scene_details.txt"), "a", encoding="utf-8") as file:
193
+ file.write(str(scene_details) + "\n")
194
 
195
+ with open(os.path.join(output_directory, f"{base_filename}_ages.txt"), "a", encoding="utf-8") as file:
196
+ file.write(str(ages) + "\n")
197
 
198
+ with open(os.path.join(output_directory, f"{base_filename}_character_descriptions.txt"), "a", encoding="utf-8") as file:
199
+ file.write(str(character_descriptions) + "\n")
200
 
201
+ with open(os.path.join(output_directory, f"{base_filename}_dialogues.txt"), "a", encoding="utf-8") as file:
202
+ file.write(str(dialogues) + "\n")
203
 
204
  return results
205