Dannyar608 committed on
Commit
058e198
·
verified ·
1 Parent(s): 2084b35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -10
app.py CHANGED
@@ -23,7 +23,6 @@ import asyncio
23
  from functools import lru_cache
24
  import hashlib
25
  from concurrent.futures import ThreadPoolExecutor
26
- import pdfplumber
27
 
28
  # ========== CONFIGURATION ==========
29
  PROFILES_DIR = "student_profiles"
@@ -197,20 +196,16 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
197
 
198
  try:
199
  if file_ext == '.pdf':
200
- # First try pdfplumber for better text extraction
201
  try:
202
- with pdfplumber.open(file_path) as pdf:
203
- text = "\n".join([page.extract_text() for page in pdf.pages])
204
- if not text.strip():
205
- raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
206
- except Exception as e:
207
- logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
208
  doc = fitz.open(file_path)
209
  for page in doc:
210
  text += page.get_text("text") + '\n'
211
  if not text.strip():
212
- raise ValueError("PyMuPDF returned empty text - trying OCR fallback...")
213
- text = extract_text_from_pdf_with_ocr(file_path)
 
 
214
 
215
  elif file_ext in ['.png', '.jpg', '.jpeg']:
216
  text = extract_text_with_ocr(file_path)
 
23
  from functools import lru_cache
24
  import hashlib
25
  from concurrent.futures import ThreadPoolExecutor
 
26
 
27
  # ========== CONFIGURATION ==========
28
  PROFILES_DIR = "student_profiles"
 
196
 
197
  try:
198
  if file_ext == '.pdf':
199
+ # First try PyMuPDF for text extraction
200
  try:
 
 
 
 
 
 
201
  doc = fitz.open(file_path)
202
  for page in doc:
203
  text += page.get_text("text") + '\n'
204
  if not text.strip():
205
+ raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
206
+ except Exception as e:
207
+ logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
208
+ text = extract_text_from_pdf_with_ocr(file_path)
209
 
210
  elif file_ext in ['.png', '.jpg', '.jpeg']:
211
  text = extract_text_with_ocr(file_path)