Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -23,7 +23,6 @@ import asyncio
|
|
23 |
from functools import lru_cache
|
24 |
import hashlib
|
25 |
from concurrent.futures import ThreadPoolExecutor
|
26 |
-
import pdfplumber
|
27 |
|
28 |
# ========== CONFIGURATION ==========
|
29 |
PROFILES_DIR = "student_profiles"
|
@@ -197,20 +196,16 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
197 |
|
198 |
try:
|
199 |
if file_ext == '.pdf':
|
200 |
-
# First try
|
201 |
try:
|
202 |
-
with pdfplumber.open(file_path) as pdf:
|
203 |
-
text = "\n".join([page.extract_text() for page in pdf.pages])
|
204 |
-
if not text.strip():
|
205 |
-
raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
|
206 |
-
except Exception as e:
|
207 |
-
logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
|
208 |
doc = fitz.open(file_path)
|
209 |
for page in doc:
|
210 |
text += page.get_text("text") + '\n'
|
211 |
if not text.strip():
|
212 |
-
raise ValueError("PyMuPDF returned empty text -
|
213 |
-
|
|
|
|
|
214 |
|
215 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
216 |
text = extract_text_with_ocr(file_path)
|
|
|
23 |
from functools import lru_cache
|
24 |
import hashlib
|
25 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
26 |
|
27 |
# ========== CONFIGURATION ==========
|
28 |
PROFILES_DIR = "student_profiles"
|
|
|
196 |
|
197 |
try:
|
198 |
if file_ext == '.pdf':
|
199 |
+
# First try PyMuPDF for text extraction
|
200 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
doc = fitz.open(file_path)
|
202 |
for page in doc:
|
203 |
text += page.get_text("text") + '\n'
|
204 |
if not text.strip():
|
205 |
+
raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
|
206 |
+
except Exception as e:
|
207 |
+
logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
|
208 |
+
text = extract_text_from_pdf_with_ocr(file_path)
|
209 |
|
210 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
211 |
text = extract_text_with_ocr(file_path)
|