Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -338,6 +338,23 @@ class GraduationProgress(BaseModel):
|
|
338 |
courses: List[Course]
|
339 |
assessments: Dict[str, str]
|
340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
341 |
class TranscriptParser:
|
342 |
def __init__(self):
|
343 |
self.student_data = {}
|
@@ -351,20 +368,25 @@ class TranscriptParser:
|
|
351 |
try:
|
352 |
text = preprocess_text(text)
|
353 |
|
354 |
-
# First try the
|
355 |
-
parsed_data = self.
|
356 |
if parsed_data:
|
|
|
357 |
return parsed_data
|
358 |
-
|
359 |
# Fall back to simplified parser if detailed parsing fails
|
360 |
-
|
|
|
|
|
|
|
|
|
361 |
|
362 |
except Exception as e:
|
363 |
logging.error(f"Error parsing transcript: {str(e)}")
|
364 |
raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
|
365 |
|
366 |
-
def
|
367 |
-
"""
|
368 |
try:
|
369 |
parsed_data = {
|
370 |
'student_info': {},
|
@@ -385,7 +407,7 @@ class TranscriptParser:
|
|
385 |
parsed_data['student_info']['grade'] = student_info_match.group(3)
|
386 |
parsed_data['student_info']['year_of_graduation'] = student_info_match.group(4)
|
387 |
|
388 |
-
#
|
389 |
gpa_matches = re.findall(
|
390 |
r"(?:Un.?weighted|Weighted)\s*GPA\s*([\d.]+)",
|
391 |
text,
|
@@ -415,46 +437,56 @@ class TranscriptParser:
|
|
415 |
if virtual_grade_match:
|
416 |
parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
|
417 |
|
418 |
-
# Extract requirements
|
419 |
-
req_section = re.search(
|
|
|
|
|
|
|
|
|
|
|
420 |
if req_section:
|
421 |
req_lines = [line.strip() for line in req_section.group(1).split('\n') if line.strip()]
|
422 |
for line in req_lines:
|
423 |
if '|' in line: # Table format
|
424 |
-
parts = [part.strip() for part in line.split('|')]
|
425 |
-
if len(parts) >=
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
|
|
|
|
438 |
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
|
|
|
|
|
|
447 |
|
448 |
-
# Extract assessments
|
449 |
-
assess_section = re.search(r"Z-Assessment.*?\n(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
|
450 |
if assess_section:
|
451 |
assess_lines = [line.strip() for line in assess_section.group(1).split('\n') if line.strip()]
|
452 |
for line in assess_lines:
|
453 |
if '|' in line:
|
454 |
-
parts = [part.strip() for part in line.split('|')]
|
455 |
if len(parts) >= 5 and parts[0].startswith('Z-'):
|
456 |
name = parts[0].replace('Z-', '').strip()
|
457 |
-
status = parts[4]
|
458 |
parsed_data['assessments'][name] = status
|
459 |
|
460 |
# Extract course history with more fault-tolerant parsing
|
@@ -471,10 +503,10 @@ class TranscriptParser:
|
|
471 |
]
|
472 |
|
473 |
for line in course_lines:
|
474 |
-
parts = [part.strip() for part in line.split('|')]
|
475 |
|
476 |
-
#
|
477 |
-
|
478 |
course = {
|
479 |
'requirement': parts[0] if len(parts) > 0 else "",
|
480 |
'school_year': parts[1] if len(parts) > 1 else "",
|
@@ -489,17 +521,20 @@ class TranscriptParser:
|
|
489 |
}
|
490 |
|
491 |
# Handle "inProgress" and empty credits
|
492 |
-
if "
|
493 |
course['credits'] = "0"
|
494 |
elif not course['credits'].replace('.','').isdigit():
|
495 |
course['credits'] = "0"
|
496 |
|
497 |
parsed_data['course_history'].append(course)
|
|
|
|
|
|
|
498 |
|
499 |
return parsed_data
|
500 |
|
501 |
except Exception as e:
|
502 |
-
logging.warning(f"
|
503 |
return None
|
504 |
|
505 |
def _parse_simplified_transcript(self, text: str) -> Dict:
|
@@ -544,7 +579,7 @@ class TranscriptParser:
|
|
544 |
logging.warning(f"Pattern {pattern} failed: {str(e)}")
|
545 |
continue
|
546 |
|
547 |
-
|
548 |
|
549 |
# ========== ENHANCED ANALYSIS FUNCTIONS ==========
|
550 |
def analyze_gpa(parsed_data: Dict) -> str:
|
@@ -955,11 +990,11 @@ class LearningStyleQuiz:
|
|
955 |
result += "You may benefit from combining different learning approaches:\n"
|
956 |
for style in primary_styles:
|
957 |
result += f"\n**{style}** techniques:\n"
|
958 |
-
for tip in
|
959 |
result += f"- {tip}\n"
|
960 |
|
961 |
result += f"\n**{style}** career suggestions:\n"
|
962 |
-
for career in
|
963 |
result += f"- {career}\n"
|
964 |
|
965 |
return result
|
|
|
338 |
courses: List[Course]
|
339 |
assessments: Dict[str, str]
|
340 |
|
341 |
+
def validate_parsed_data(parsed_data: Dict) -> bool:
    """Ensure all critical fields exist in a parsed transcript.

    Each required field is described by a path of keys/indices that is
    walked from the top of ``parsed_data``. String components index into
    mappings; integer components index into sequences (so
    ``('course_history', 0)`` means "the course list has a first entry").

    Args:
        parsed_data: Nested dict produced by a transcript parser.

    Returns:
        True when every required path can be resolved.

    Raises:
        ValueError: If any required path is missing. The message names the
            path in dotted form, e.g. ``course_history.0``.
    """
    required_fields = [
        ('student_info', 'name'),
        ('student_info', 'weighted_gpa'),
        ('requirements', 'A-English'),  # Sample requirement
        ('course_history', 0),  # At least one course
    ]

    for path in required_fields:
        current = parsed_data
        for key in path:
            # Subscript rather than test `key in current`: on a list, `in`
            # checks for a matching *value*, not whether the index exists.
            try:
                current = current[key]
            except (KeyError, IndexError, TypeError) as exc:
                # Path components may be ints, so stringify before joining.
                dotted = '.'.join(str(part) for part in path)
                raise ValueError(f"Missing critical field: {dotted}") from exc
    return True
|
357 |
+
|
358 |
class TranscriptParser:
|
359 |
def __init__(self):
|
360 |
self.student_data = {}
|
|
|
368 |
try:
|
369 |
text = preprocess_text(text)
|
370 |
|
371 |
+
# First try the specialized Miami-Dade parser
|
372 |
+
parsed_data = self._parse_miami_dade_transcript(text)
|
373 |
if parsed_data:
|
374 |
+
validate_parsed_data(parsed_data)
|
375 |
return parsed_data
|
376 |
+
|
377 |
# Fall back to simplified parser if detailed parsing fails
|
378 |
+
parsed_data = self._parse_simplified_transcript(text)
|
379 |
+
if parsed_data:
|
380 |
+
return parsed_data
|
381 |
+
|
382 |
+
raise ValueError("No data could be parsed from the transcript")
|
383 |
|
384 |
except Exception as e:
|
385 |
logging.error(f"Error parsing transcript: {str(e)}")
|
386 |
raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
|
387 |
|
388 |
+
def _parse_miami_dade_transcript(self, text: str) -> Optional[Dict]:
|
389 |
+
"""Specialized parser for Miami-Dade County Public Schools transcripts"""
|
390 |
try:
|
391 |
parsed_data = {
|
392 |
'student_info': {},
|
|
|
407 |
parsed_data['student_info']['grade'] = student_info_match.group(3)
|
408 |
parsed_data['student_info']['year_of_graduation'] = student_info_match.group(4)
|
409 |
|
410 |
+
# Extract GPA information
|
411 |
gpa_matches = re.findall(
|
412 |
r"(?:Un.?weighted|Weighted)\s*GPA\s*([\d.]+)",
|
413 |
text,
|
|
|
437 |
if virtual_grade_match:
|
438 |
parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
|
439 |
|
440 |
+
# Extract requirements section - more robust table parsing
|
441 |
+
req_section = re.search(
|
442 |
+
r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)(?:\n\s*\n|$)",
|
443 |
+
text,
|
444 |
+
re.DOTALL | re.IGNORECASE
|
445 |
+
)
|
446 |
+
|
447 |
if req_section:
|
448 |
req_lines = [line.strip() for line in req_section.group(1).split('\n') if line.strip()]
|
449 |
for line in req_lines:
|
450 |
if '|' in line: # Table format
|
451 |
+
parts = [part.strip() for part in line.split('|') if part.strip()]
|
452 |
+
if len(parts) >= 5: # More lenient check for number of columns
|
453 |
+
try:
|
454 |
+
code = parts[0] if len(parts) > 0 else ""
|
455 |
+
description = parts[1] if len(parts) > 1 else ""
|
456 |
+
required = float(parts[2]) if len(parts) > 2 and parts[2].replace('.','').isdigit() else 0.0
|
457 |
+
waived = float(parts[3]) if len(parts) > 3 and parts[3].replace('.','').isdigit() else 0.0
|
458 |
+
completed = float(parts[4]) if len(parts) > 4 and parts[4].replace('.','').isdigit() else 0.0
|
459 |
+
status = parts[5] if len(parts) > 5 else ""
|
460 |
+
|
461 |
+
# Extract percentage if available
|
462 |
+
percent = 0.0
|
463 |
+
if status:
|
464 |
+
percent_match = re.search(r"(\d+)%", status)
|
465 |
+
if percent_match:
|
466 |
+
percent = float(percent_match.group(1))
|
467 |
|
468 |
+
parsed_data['requirements'][code] = {
|
469 |
+
"description": description,
|
470 |
+
"required": required,
|
471 |
+
"waived": waived,
|
472 |
+
"completed": completed,
|
473 |
+
"percent_complete": percent,
|
474 |
+
"status": status
|
475 |
+
}
|
476 |
+
except (IndexError, ValueError) as e:
|
477 |
+
logging.warning(f"Skipping malformed requirement line: {line}. Error: {str(e)}")
|
478 |
+
continue
|
479 |
|
480 |
+
# Extract assessments section
|
481 |
+
assess_section = re.search(r"Z-Assessment.*?\n(.*?)(?:\n\s*\n|$)", text, re.DOTALL | re.IGNORECASE)
|
482 |
if assess_section:
|
483 |
assess_lines = [line.strip() for line in assess_section.group(1).split('\n') if line.strip()]
|
484 |
for line in assess_lines:
|
485 |
if '|' in line:
|
486 |
+
parts = [part.strip() for part in line.split('|') if part.strip()]
|
487 |
if len(parts) >= 5 and parts[0].startswith('Z-'):
|
488 |
name = parts[0].replace('Z-', '').strip()
|
489 |
+
status = parts[4] if len(parts) > 4 else ""
|
490 |
parsed_data['assessments'][name] = status
|
491 |
|
492 |
# Extract course history with more fault-tolerant parsing
|
|
|
503 |
]
|
504 |
|
505 |
for line in course_lines:
|
506 |
+
parts = [part.strip() for part in line.split('|') if part.strip()]
|
507 |
|
508 |
+
# More robust handling of course data
|
509 |
+
try:
|
510 |
course = {
|
511 |
'requirement': parts[0] if len(parts) > 0 else "",
|
512 |
'school_year': parts[1] if len(parts) > 1 else "",
|
|
|
521 |
}
|
522 |
|
523 |
# Handle "inProgress" and empty credits
|
524 |
+
if "inprogress" in course['credits'].lower() or not course['credits']:
|
525 |
course['credits'] = "0"
|
526 |
elif not course['credits'].replace('.','').isdigit():
|
527 |
course['credits'] = "0"
|
528 |
|
529 |
parsed_data['course_history'].append(course)
|
530 |
+
except (IndexError, ValueError) as e:
|
531 |
+
logging.warning(f"Skipping malformed course line: {line}. Error: {str(e)}")
|
532 |
+
continue
|
533 |
|
534 |
return parsed_data
|
535 |
|
536 |
except Exception as e:
|
537 |
+
logging.warning(f"Miami-Dade transcript parsing failed: {str(e)}")
|
538 |
return None
|
539 |
|
540 |
def _parse_simplified_transcript(self, text: str) -> Dict:
|
|
|
579 |
logging.warning(f"Pattern {pattern} failed: {str(e)}")
|
580 |
continue
|
581 |
|
582 |
+
return None
|
583 |
|
584 |
# ========== ENHANCED ANALYSIS FUNCTIONS ==========
|
585 |
def analyze_gpa(parsed_data: Dict) -> str:
|
|
|
990 |
result += "You may benefit from combining different learning approaches:\n"
|
991 |
for style in primary_styles:
|
992 |
result += f"\n**{style}** techniques:\n"
|
993 |
+
for tip in style_info['tips'][:2]:
|
994 |
result += f"- {tip}\n"
|
995 |
|
996 |
result += f"\n**{style}** career suggestions:\n"
|
997 |
+
for career in style_info['careers'][:3]:
|
998 |
result += f"- {career}\n"
|
999 |
|
1000 |
return result
|