Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -23,6 +23,7 @@ import asyncio
|
|
23 |
from functools import lru_cache
|
24 |
import hashlib
|
25 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
26 |
|
27 |
# ========== CONFIGURATION ==========
|
28 |
PROFILES_DIR = "student_profiles"
|
@@ -196,16 +197,20 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
196 |
|
197 |
try:
|
198 |
if file_ext == '.pdf':
|
199 |
-
# First try
|
200 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
doc = fitz.open(file_path)
|
202 |
for page in doc:
|
203 |
text += page.get_text("text") + '\n'
|
204 |
if not text.strip():
|
205 |
-
raise ValueError("PyMuPDF returned empty text -
|
206 |
-
|
207 |
-
logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
|
208 |
-
text = extract_text_from_pdf_with_ocr(file_path)
|
209 |
|
210 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
211 |
text = extract_text_with_ocr(file_path)
|
@@ -293,58 +298,20 @@ class TranscriptParser:
|
|
293 |
self.current_courses = []
|
294 |
self.course_history = []
|
295 |
self.graduation_status = {}
|
296 |
-
self.supported_formats = {
|
297 |
-
'miami_dade': self.parse_miami_dade,
|
298 |
-
'standard': self.parse_standard,
|
299 |
-
'homeschool': self.parse_homeschool
|
300 |
-
}
|
301 |
|
302 |
def parse_transcript(self, text: str) -> Dict:
|
303 |
-
"""
|
304 |
try:
|
305 |
-
#
|
306 |
-
text = re.sub(r'\s+', ' ', text)
|
307 |
-
|
308 |
-
# Detect transcript format
|
309 |
-
format_type = self.detect_format(text)
|
310 |
-
|
311 |
-
# Parse based on detected format
|
312 |
-
if format_type in self.supported_formats:
|
313 |
-
return self.supported_formats[format_type](text)
|
314 |
-
else:
|
315 |
-
# Fallback to standard parsing
|
316 |
-
return self.parse_standard(text)
|
317 |
-
|
318 |
-
except Exception as e:
|
319 |
-
logging.error(f"Error parsing transcript: {str(e)}")
|
320 |
-
raise gr.Error(f"Error parsing transcript: {str(e)}\n\nThis may be due to an unsupported transcript format. Please ensure you're uploading an official transcript or contact support.")
|
321 |
-
|
322 |
-
def detect_format(self, text: str) -> str:
|
323 |
-
"""Detect the transcript format"""
|
324 |
-
# Check for Miami-Dade specific patterns
|
325 |
-
if re.search(r'MIAMI-DADE (COUNTY|COUNTRY) PUBLIC SCHOOLS', text, re.IGNORECASE):
|
326 |
-
return 'miami_dade'
|
327 |
-
# Check for homeschool patterns
|
328 |
-
elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
|
329 |
-
return 'homeschool'
|
330 |
-
# Default to standard format
|
331 |
-
return 'standard'
|
332 |
-
|
333 |
-
def parse_miami_dade(self, text: str) -> Dict:
|
334 |
-
"""Parse Miami-Dade formatted transcripts with enhanced error handling"""
|
335 |
-
try:
|
336 |
-
# Extract student info with more robust patterns
|
337 |
student_match = re.search(
|
338 |
-
r"(\d{7})\s*-\s*([A-Z
|
339 |
-
r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
|
340 |
-
r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
|
341 |
text, re.DOTALL
|
342 |
)
|
343 |
-
|
344 |
if student_match:
|
345 |
self.student_data = {
|
346 |
-
"id": student_match.group(1),
|
347 |
-
"name": student_match.group(2).replace(",", ", "),
|
348 |
"current_grade": student_match.group(3),
|
349 |
"graduation_year": student_match.group(4),
|
350 |
"unweighted_gpa": float(student_match.group(5)),
|
@@ -352,59 +319,51 @@ class TranscriptParser:
|
|
352 |
"total_credits": float(student_match.group(7)),
|
353 |
"community_service_hours": int(student_match.group(8))
|
354 |
}
|
355 |
-
|
356 |
-
# Extract requirements
|
|
|
357 |
req_section = re.search(
|
358 |
-
r"Code
|
359 |
text, re.DOTALL
|
360 |
)
|
361 |
-
|
362 |
if req_section:
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
text, re.DOTALL
|
382 |
)
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
"district_number": match.group(7),
|
399 |
-
"grade": match.group(8),
|
400 |
-
"inclusion_status": match.group(9),
|
401 |
-
"credits": match.group(10)
|
402 |
-
})
|
403 |
-
|
404 |
-
# Identify current courses
|
405 |
self._extract_current_courses()
|
406 |
self._calculate_completion()
|
407 |
-
|
408 |
return {
|
409 |
"student_info": self.student_data,
|
410 |
"requirements": self.requirements,
|
@@ -413,85 +372,13 @@ class TranscriptParser:
|
|
413 |
"graduation_status": self.graduation_status,
|
414 |
"format": "miami_dade"
|
415 |
}
|
416 |
-
|
417 |
-
except Exception as e:
|
418 |
-
logging.error(f"Error parsing Miami-Dade transcript: {str(e)}")
|
419 |
-
raise ValueError(f"Couldn't parse transcript. Please ensure it's a valid Miami-Dade transcript. Error: {str(e)}")
|
420 |
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
student_match = re.search(r"Student:\s*([^\n]+)", text, re.IGNORECASE)
|
425 |
-
if student_match:
|
426 |
-
self.student_data["name"] = student_match.group(1).strip()
|
427 |
-
|
428 |
-
# Extract courses - looking for a table-like structure
|
429 |
-
course_pattern = r"(?P<year>\d{4}-\d{4}|\d{1,2})\s+(?P<subject>\w+)\s+(?P<code>\w+)\s+(?P<title>[^\n]+)\s+(?P<grade>[A-F][+-]?)\s+(?P<credit>\d\.\d)"
|
430 |
-
course_matches = re.finditer(course_pattern, text)
|
431 |
-
|
432 |
-
for match in course_matches:
|
433 |
-
self.course_history.append({
|
434 |
-
"school_year": match.group("year"),
|
435 |
-
"subject": match.group("subject"),
|
436 |
-
"course_code": match.group("code"),
|
437 |
-
"description": match.group("title").strip(),
|
438 |
-
"grade": match.group("grade"),
|
439 |
-
"credits": match.group("credit")
|
440 |
-
})
|
441 |
-
|
442 |
-
# Extract GPA info
|
443 |
-
gpa_pattern = r"GPA\s*([\d.]+)\s*/\s*([\d.]+)"
|
444 |
-
gpa_match = re.search(gpa_pattern, text)
|
445 |
-
if gpa_match:
|
446 |
-
self.student_data.update({
|
447 |
-
"unweighted_gpa": float(gpa_match.group(1)),
|
448 |
-
"weighted_gpa": float(gpa_match.group(2))
|
449 |
-
})
|
450 |
-
|
451 |
-
return {
|
452 |
-
"student_info": self.student_data,
|
453 |
-
"course_history": self.course_history,
|
454 |
-
"format": "standard"
|
455 |
-
}
|
456 |
-
|
457 |
-
def parse_homeschool(self, text: str) -> Dict:
|
458 |
-
"""Parse homeschool formatted transcripts"""
|
459 |
-
# Extract student info
|
460 |
-
name_match = re.search(r"Student:\s*([^\n]+)", text, re.IGNORECASE)
|
461 |
-
if name_match:
|
462 |
-
self.student_data["name"] = name_match.group(1).strip()
|
463 |
-
|
464 |
-
# Extract homeschool-specific info
|
465 |
-
parent_match = re.search(r"Parent:\s*([^\n]+)", text, re.IGNORECASE)
|
466 |
-
if parent_match:
|
467 |
-
self.student_data["parent"] = parent_match.group(1).strip()
|
468 |
-
|
469 |
-
# Extract courses - homeschool format often has simpler tables
|
470 |
-
course_pattern = r"(?P<subject>\w+)\s+(?P<title>[^\n]+?)\s+(?P<date>\w+-\d{4})\s+(?P<grade>[A-F][+-]?)\s+(?P<credit>\d\.\d)"
|
471 |
-
course_matches = re.finditer(course_pattern, text)
|
472 |
-
|
473 |
-
for match in course_matches:
|
474 |
-
self.course_history.append({
|
475 |
-
"subject": match.group("subject"),
|
476 |
-
"description": match.group("title").strip(),
|
477 |
-
"completion_date": match.group("date"),
|
478 |
-
"grade": match.group("grade"),
|
479 |
-
"credits": match.group("credit")
|
480 |
-
})
|
481 |
-
|
482 |
-
# Extract GPA info
|
483 |
-
gpa_match = re.search(r"Cumulative GPA:\s*([\d.]+)", text, re.IGNORECASE)
|
484 |
-
if gpa_match:
|
485 |
-
self.student_data["gpa"] = float(gpa_match.group(1))
|
486 |
-
|
487 |
-
return {
|
488 |
-
"student_info": self.student_data,
|
489 |
-
"course_history": self.course_history,
|
490 |
-
"format": "homeschool"
|
491 |
-
}
|
492 |
|
493 |
def _extract_current_courses(self):
|
494 |
-
"""Identify
|
495 |
self.current_courses = [
|
496 |
{
|
497 |
"course": c["description"],
|
@@ -501,32 +388,21 @@ class TranscriptParser:
|
|
501 |
"credits": c["credits"],
|
502 |
"grade_level": c["grade_level"]
|
503 |
}
|
504 |
-
for c in self.course_history
|
505 |
-
if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
|
506 |
]
|
507 |
-
|
508 |
def _calculate_completion(self):
|
509 |
-
"""
|
510 |
total_required = sum(req["required"] for req in self.requirements.values())
|
511 |
total_completed = sum(req["completed"] for req in self.requirements.values())
|
512 |
-
|
513 |
self.graduation_status.update({
|
514 |
"total_required_credits": total_required,
|
515 |
"total_completed_credits": total_completed,
|
516 |
-
"percent_complete": round((total_completed / total_required) * 100, 1),
|
517 |
"remaining_credits": total_required - total_completed,
|
518 |
-
"on_track": (total_completed / total_required) >= 0.75
|
519 |
})
|
520 |
-
|
521 |
-
def to_json(self) -> str:
|
522 |
-
"""Export parsed data as JSON"""
|
523 |
-
return json.dumps({
|
524 |
-
"student_info": self.student_data,
|
525 |
-
"requirements": self.requirements,
|
526 |
-
"current_courses": self.current_courses,
|
527 |
-
"course_history": self.course_history,
|
528 |
-
"graduation_status": self.graduation_status
|
529 |
-
}, indent=2)
|
530 |
|
531 |
def format_transcript_output(data: Dict) -> str:
|
532 |
"""Enhanced formatting for transcript output with format awareness"""
|
|
|
23 |
from functools import lru_cache
|
24 |
import hashlib
|
25 |
from concurrent.futures import ThreadPoolExecutor
|
26 |
+
import pdfplumber
|
27 |
|
28 |
# ========== CONFIGURATION ==========
|
29 |
PROFILES_DIR = "student_profiles"
|
|
|
197 |
|
198 |
try:
|
199 |
if file_ext == '.pdf':
|
200 |
+
# First try pdfplumber for better text extraction
|
201 |
try:
|
202 |
+
with pdfplumber.open(file_path) as pdf:
|
203 |
+
text = "\n".join([page.extract_text() for page in pdf.pages])
|
204 |
+
if not text.strip():
|
205 |
+
raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
|
206 |
+
except Exception as e:
|
207 |
+
logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
|
208 |
doc = fitz.open(file_path)
|
209 |
for page in doc:
|
210 |
text += page.get_text("text") + '\n'
|
211 |
if not text.strip():
|
212 |
+
raise ValueError("PyMuPDF returned empty text - trying OCR fallback...")
|
213 |
+
text = extract_text_from_pdf_with_ocr(file_path)
|
|
|
|
|
214 |
|
215 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
216 |
text = extract_text_with_ocr(file_path)
|
|
|
298 |
self.current_courses = []
|
299 |
self.course_history = []
|
300 |
self.graduation_status = {}
|
|
|
|
|
|
|
|
|
|
|
301 |
|
302 |
def parse_transcript(self, text: str) -> Dict:
|
303 |
+
"""Parse Miami-Dade formatted transcripts with updated regex patterns."""
|
304 |
try:
|
305 |
+
# Extract student info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
306 |
student_match = re.search(
|
307 |
+
r"(\d{7})\s*-\s*([A-Z\s,]+).*?Current Grade:\s*(\d+).*?YOG\s*(\d{4}).*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+).*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
|
|
|
|
|
308 |
text, re.DOTALL
|
309 |
)
|
310 |
+
|
311 |
if student_match:
|
312 |
self.student_data = {
|
313 |
+
"id": student_match.group(1).strip(),
|
314 |
+
"name": student_match.group(2).replace(",", ", ").strip(),
|
315 |
"current_grade": student_match.group(3),
|
316 |
"graduation_year": student_match.group(4),
|
317 |
"unweighted_gpa": float(student_match.group(5)),
|
|
|
319 |
"total_credits": float(student_match.group(7)),
|
320 |
"community_service_hours": int(student_match.group(8))
|
321 |
}
|
322 |
+
|
323 |
+
# Extract requirements
|
324 |
+
self.requirements = {}
|
325 |
req_section = re.search(
|
326 |
+
r"Code Description Required Waived Completed Status(.*?)Total\s+\d+\.\d+\s+\d+\.\d+\s+\d+\.\d+\s+\d+%",
|
327 |
text, re.DOTALL
|
328 |
)
|
|
|
329 |
if req_section:
|
330 |
+
req_lines = req_section.group(1).strip().splitlines()
|
331 |
+
for line in req_lines:
|
332 |
+
req_match = re.match(r"([A-Z]-[^\s]+)\s+(.+?)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+)%", line.strip())
|
333 |
+
if req_match:
|
334 |
+
code = req_match.group(1).strip()
|
335 |
+
self.requirements[code] = {
|
336 |
+
"description": req_match.group(2).strip(),
|
337 |
+
"required": float(req_match.group(3)),
|
338 |
+
"waived": float(req_match.group(4)),
|
339 |
+
"completed": float(req_match.group(5)),
|
340 |
+
"status": f"{req_match.group(6)}%"
|
341 |
+
}
|
342 |
+
|
343 |
+
# Extract course history (simplified for now)
|
344 |
+
self.course_history = []
|
345 |
+
course_pattern = re.compile(
|
346 |
+
r"([A-Z]-[^\s]+)\s+(\d{4}-\d{4}|\d{4})\s+(\d{2})\s+([A-Z0-9]+)\s+(.+?)\s+([AT12]+)\s+([A-Z0-9]+)?\s+([A-Z])?\s+([A-Z])?\s+(inProgress|\d+\.\d+)",
|
347 |
+
re.DOTALL
|
|
|
348 |
)
|
349 |
+
for match in course_pattern.finditer(text):
|
350 |
+
self.course_history.append({
|
351 |
+
"requirement_category": match.group(1),
|
352 |
+
"school_year": match.group(2),
|
353 |
+
"grade_level": match.group(3),
|
354 |
+
"course_code": match.group(4),
|
355 |
+
"description": match.group(5).strip(),
|
356 |
+
"term": match.group(6),
|
357 |
+
"district_number": match.group(7),
|
358 |
+
"grade": match.group(8),
|
359 |
+
"inclusion_status": match.group(9),
|
360 |
+
"credits": match.group(10)
|
361 |
+
})
|
362 |
+
|
363 |
+
# Extract in-progress
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
self._extract_current_courses()
|
365 |
self._calculate_completion()
|
366 |
+
|
367 |
return {
|
368 |
"student_info": self.student_data,
|
369 |
"requirements": self.requirements,
|
|
|
372 |
"graduation_status": self.graduation_status,
|
373 |
"format": "miami_dade"
|
374 |
}
|
|
|
|
|
|
|
|
|
375 |
|
376 |
+
except Exception as e:
|
377 |
+
logging.error(f"Error parsing transcript: {str(e)}")
|
378 |
+
raise ValueError(f"Couldn't parse transcript: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
379 |
|
380 |
def _extract_current_courses(self):
|
381 |
+
"""Identify in-progress courses."""
|
382 |
self.current_courses = [
|
383 |
{
|
384 |
"course": c["description"],
|
|
|
388 |
"credits": c["credits"],
|
389 |
"grade_level": c["grade_level"]
|
390 |
}
|
391 |
+
for c in self.course_history if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
|
|
|
392 |
]
|
393 |
+
|
394 |
def _calculate_completion(self):
|
395 |
+
"""Compute graduation readiness."""
|
396 |
total_required = sum(req["required"] for req in self.requirements.values())
|
397 |
total_completed = sum(req["completed"] for req in self.requirements.values())
|
398 |
+
|
399 |
self.graduation_status.update({
|
400 |
"total_required_credits": total_required,
|
401 |
"total_completed_credits": total_completed,
|
402 |
+
"percent_complete": round((total_completed / total_required) * 100, 1) if total_required > 0 else 0,
|
403 |
"remaining_credits": total_required - total_completed,
|
404 |
+
"on_track": (total_completed / total_required) >= 0.75 if total_required > 0 else False
|
405 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
406 |
|
407 |
def format_transcript_output(data: Dict) -> str:
|
408 |
"""Enhanced formatting for transcript output with format awareness"""
|