Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -175,14 +175,21 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
175 |
try:
|
176 |
if file_ext == '.pdf':
|
177 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
178 |
doc = fitz.open(file_path)
|
179 |
for page in doc:
|
180 |
text += page.get_text("text") + '\n'
|
181 |
if not text.strip():
|
182 |
-
raise ValueError("PyMuPDF returned empty text")
|
183 |
-
|
184 |
-
logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
|
185 |
-
text = extract_text_from_pdf_with_ocr(file_path)
|
186 |
|
187 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
188 |
text = extract_text_with_ocr(file_path)
|
@@ -201,13 +208,22 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
201 |
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
|
202 |
text = ""
|
203 |
try:
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
img
|
209 |
-
|
210 |
-
text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
except Exception as e:
|
212 |
raise ValueError(f"PDF OCR failed: {str(e)}")
|
213 |
return text
|
@@ -434,18 +450,23 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
|
|
434 |
if progress:
|
435 |
progress(0.2, desc="Extracting text from file...")
|
436 |
|
437 |
-
|
|
|
|
|
|
|
438 |
|
439 |
if not text.strip():
|
440 |
-
raise ValueError("
|
441 |
|
442 |
if progress:
|
443 |
progress(0.5, desc="Parsing transcript...")
|
444 |
|
445 |
parser = TranscriptParser()
|
446 |
-
|
|
|
|
|
|
|
447 |
|
448 |
-
# Return simple confirmation message
|
449 |
confirmation = "Transcript processed successfully."
|
450 |
if 'gpa' in parsed_data.get('student_info', {}):
|
451 |
confirmation += f"\nGPA detected: {parsed_data['student_info']['gpa']}"
|
@@ -455,6 +476,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
|
|
455 |
except Exception as e:
|
456 |
error_msg = f"Error processing transcript: {str(e)}"
|
457 |
logging.error(error_msg)
|
|
|
458 |
return error_msg, None
|
459 |
|
460 |
# ========== LEARNING STYLE QUIZ ==========
|
|
|
175 |
try:
|
176 |
if file_ext == '.pdf':
|
177 |
try:
|
178 |
+
# First try pdfplumber for better table extraction
|
179 |
+
import pdfplumber
|
180 |
+
with pdfplumber.open(file_path) as pdf:
|
181 |
+
for page in pdf.pages:
|
182 |
+
text += page.extract_text() + '\n'
|
183 |
+
if not text.strip():
|
184 |
+
raise ValueError("PDFPlumber returned empty text")
|
185 |
+
except Exception as e:
|
186 |
+
logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
|
187 |
doc = fitz.open(file_path)
|
188 |
for page in doc:
|
189 |
text += page.get_text("text") + '\n'
|
190 |
if not text.strip():
|
191 |
+
raise ValueError("PyMuPDF returned empty text, trying OCR fallback...")
|
192 |
+
text = extract_text_from_pdf_with_ocr(file_path)
|
|
|
|
|
193 |
|
194 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
195 |
text = extract_text_with_ocr(file_path)
|
|
|
208 |
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
|
209 |
text = ""
|
210 |
try:
|
211 |
+
# Try pdf2image first if available
|
212 |
+
try:
|
213 |
+
from pdf2image import convert_from_path
|
214 |
+
images = convert_from_path(file_path)
|
215 |
+
for img in images:
|
216 |
+
text += pytesseract.image_to_string(img) + '\n'
|
217 |
+
return text
|
218 |
+
except ImportError:
|
219 |
+
# Fallback to PyMuPDF for image extraction
|
220 |
+
doc = fitz.open(file_path)
|
221 |
+
for page in doc:
|
222 |
+
pix = page.get_pixmap()
|
223 |
+
img = Image.open(io.BytesIO(pix.tobytes()))
|
224 |
+
img = img.convert('L')
|
225 |
+
img = img.point(lambda x: 0 if x < 128 else 255)
|
226 |
+
text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
|
227 |
except Exception as e:
|
228 |
raise ValueError(f"PDF OCR failed: {str(e)}")
|
229 |
return text
|
|
|
450 |
if progress:
|
451 |
progress(0.2, desc="Extracting text from file...")
|
452 |
|
453 |
+
try:
|
454 |
+
text = extract_text_from_file(file_obj.name, file_ext)
|
455 |
+
except Exception as e:
|
456 |
+
raise ValueError(f"Failed to extract text: {str(e)}. The file may be corrupted or in an unsupported format.")
|
457 |
|
458 |
if not text.strip():
|
459 |
+
raise ValueError("The file appears to be empty or contains no readable text.")
|
460 |
|
461 |
if progress:
|
462 |
progress(0.5, desc="Parsing transcript...")
|
463 |
|
464 |
parser = TranscriptParser()
|
465 |
+
try:
|
466 |
+
parsed_data = parser.parse_transcript(text)
|
467 |
+
except Exception as e:
|
468 |
+
raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
|
469 |
|
|
|
470 |
confirmation = "Transcript processed successfully."
|
471 |
if 'gpa' in parsed_data.get('student_info', {}):
|
472 |
confirmation += f"\nGPA detected: {parsed_data['student_info']['gpa']}"
|
|
|
476 |
except Exception as e:
|
477 |
error_msg = f"Error processing transcript: {str(e)}"
|
478 |
logging.error(error_msg)
|
479 |
+
# Return more detailed error to user
|
480 |
return error_msg, None
|
481 |
|
482 |
# ========== LEARNING STYLE QUIZ ==========
|