Dannyar608 committed on
Commit
acac7a6
·
verified ·
1 Parent(s): ebc14af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -43
app.py CHANGED
@@ -338,6 +338,23 @@ class GraduationProgress(BaseModel):
338
  courses: List[Course]
339
  assessments: Dict[str, str]
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  class TranscriptParser:
342
  def __init__(self):
343
  self.student_data = {}
@@ -351,20 +368,25 @@ class TranscriptParser:
351
  try:
352
  text = preprocess_text(text)
353
 
354
- # First try the new detailed parser
355
- parsed_data = self._parse_detailed_transcript(text)
356
  if parsed_data:
 
357
  return parsed_data
358
-
359
  # Fall back to simplified parser if detailed parsing fails
360
- return self._parse_simplified_transcript(text)
 
 
 
 
361
 
362
  except Exception as e:
363
  logging.error(f"Error parsing transcript: {str(e)}")
364
  raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
365
 
366
- def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
367
- """Parse detailed transcript format with improved patterns for Miami-Dade format"""
368
  try:
369
  parsed_data = {
370
  'student_info': {},
@@ -385,7 +407,7 @@ class TranscriptParser:
385
  parsed_data['student_info']['grade'] = student_info_match.group(3)
386
  parsed_data['student_info']['year_of_graduation'] = student_info_match.group(4)
387
 
388
- # More robust GPA extraction
389
  gpa_matches = re.findall(
390
  r"(?:Un.?weighted|Weighted)\s*GPA\s*([\d.]+)",
391
  text,
@@ -415,46 +437,56 @@ class TranscriptParser:
415
  if virtual_grade_match:
416
  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
417
 
418
- # Extract requirements
419
- req_section = re.search(r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
 
 
 
 
 
420
  if req_section:
421
  req_lines = [line.strip() for line in req_section.group(1).split('\n') if line.strip()]
422
  for line in req_lines:
423
  if '|' in line: # Table format
424
- parts = [part.strip() for part in line.split('|')]
425
- if len(parts) >= 6:
426
- code = parts[0]
427
- description = parts[1]
428
- required = float(parts[2]) if parts[2] and parts[2].replace('.','').isdigit() else 0.0
429
- waived = float(parts[3]) if parts[3] and parts[3].replace('.','').isdigit() else 0.0
430
- completed = float(parts[4]) if parts[4] and parts[4].replace('.','').isdigit() else 0.0
431
- status = parts[5]
432
-
433
- # Extract percentage if available
434
- percent = 0.0
435
- percent_match = re.search(r"(\d+)%", status)
436
- if percent_match:
437
- percent = float(percent_match.group(1))
 
 
438
 
439
- parsed_data['requirements'][code] = {
440
- "description": description,
441
- "required": required,
442
- "waived": waived,
443
- "completed": completed,
444
- "percent_complete": percent,
445
- "status": status
446
- }
 
 
 
447
 
448
- # Extract assessments
449
- assess_section = re.search(r"Z-Assessment.*?\n(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
450
  if assess_section:
451
  assess_lines = [line.strip() for line in assess_section.group(1).split('\n') if line.strip()]
452
  for line in assess_lines:
453
  if '|' in line:
454
- parts = [part.strip() for part in line.split('|')]
455
  if len(parts) >= 5 and parts[0].startswith('Z-'):
456
  name = parts[0].replace('Z-', '').strip()
457
- status = parts[4]
458
  parsed_data['assessments'][name] = status
459
 
460
  # Extract course history with more fault-tolerant parsing
@@ -471,10 +503,10 @@ class TranscriptParser:
471
  ]
472
 
473
  for line in course_lines:
474
- parts = [part.strip() for part in line.split('|')]
475
 
476
- # Handle varying number of columns
477
- if len(parts) >= 9:
478
  course = {
479
  'requirement': parts[0] if len(parts) > 0 else "",
480
  'school_year': parts[1] if len(parts) > 1 else "",
@@ -489,17 +521,20 @@ class TranscriptParser:
489
  }
490
 
491
  # Handle "inProgress" and empty credits
492
- if "inProgress" in course['credits'].lower() or not course['credits']:
493
  course['credits'] = "0"
494
  elif not course['credits'].replace('.','').isdigit():
495
  course['credits'] = "0"
496
 
497
  parsed_data['course_history'].append(course)
 
 
 
498
 
499
  return parsed_data
500
 
501
  except Exception as e:
502
- logging.warning(f"Detailed transcript parsing failed: {str(e)}")
503
  return None
504
 
505
  def _parse_simplified_transcript(self, text: str) -> Dict:
@@ -544,7 +579,7 @@ class TranscriptParser:
544
  logging.warning(f"Pattern {pattern} failed: {str(e)}")
545
  continue
546
 
547
- raise ValueError("Could not identify course information in transcript")
548
 
549
  # ========== ENHANCED ANALYSIS FUNCTIONS ==========
550
  def analyze_gpa(parsed_data: Dict) -> str:
@@ -955,11 +990,11 @@ class LearningStyleQuiz:
955
  result += "You may benefit from combining different learning approaches:\n"
956
  for style in primary_styles:
957
  result += f"\n**{style}** techniques:\n"
958
- for tip in self.learning_styles[style]['tips'][:2]:
959
  result += f"- {tip}\n"
960
 
961
  result += f"\n**{style}** career suggestions:\n"
962
- for career in self.learning_styles[style]['careers'][:3]:
963
  result += f"- {career}\n"
964
 
965
  return result
 
338
  courses: List[Course]
339
  assessments: Dict[str, str]
340
 
341
def validate_parsed_data(parsed_data: Dict) -> bool:
    """Ensure all critical fields exist in a parsed transcript.

    Each required path is walked one step at a time through nested
    containers. String steps are looked up as dictionary keys; integer
    steps are treated as positions in a list/tuple (so ``0`` means "the
    sequence has at least one element").

    Args:
        parsed_data: Nested dict produced by a transcript parser.

    Returns:
        True when every required path resolves.

    Raises:
        ValueError: If any step of a required path is absent. The message
            names the full dotted path, e.g. ``course_history.0``.
    """
    required_fields = [
        ('student_info', 'name'),
        ('student_info', 'weighted_gpa'),
        ('requirements', 'A-English'),  # Sample requirement
        ('course_history', 0),  # At least one course
    ]

    for path in required_fields:
        current = parsed_data
        for key in path:
            if isinstance(current, (list, tuple)):
                # `key in a_list` tests element membership, not whether the
                # index exists, so sequences need an explicit bounds check.
                present = isinstance(key, int) and -len(current) <= key < len(current)
            elif isinstance(current, dict):
                present = key in current
            else:
                # Scalars/None cannot contain a further path step.
                present = False

            if not present:
                # Path parts may be ints; str.join only accepts strings.
                dotted = '.'.join(str(part) for part in path)
                raise ValueError(f"Missing critical field: {dotted}")
            current = current[key]
    return True
357
+
358
  class TranscriptParser:
359
  def __init__(self):
360
  self.student_data = {}
 
368
  try:
369
  text = preprocess_text(text)
370
 
371
+ # First try the specialized Miami-Dade parser
372
+ parsed_data = self._parse_miami_dade_transcript(text)
373
  if parsed_data:
374
+ validate_parsed_data(parsed_data)
375
  return parsed_data
376
+
377
  # Fall back to simplified parser if detailed parsing fails
378
+ parsed_data = self._parse_simplified_transcript(text)
379
+ if parsed_data:
380
+ return parsed_data
381
+
382
+ raise ValueError("No data could be parsed from the transcript")
383
 
384
  except Exception as e:
385
  logging.error(f"Error parsing transcript: {str(e)}")
386
  raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
387
 
388
+ def _parse_miami_dade_transcript(self, text: str) -> Optional[Dict]:
389
+ """Specialized parser for Miami-Dade County Public Schools transcripts"""
390
  try:
391
  parsed_data = {
392
  'student_info': {},
 
407
  parsed_data['student_info']['grade'] = student_info_match.group(3)
408
  parsed_data['student_info']['year_of_graduation'] = student_info_match.group(4)
409
 
410
+ # Extract GPA information
411
  gpa_matches = re.findall(
412
  r"(?:Un.?weighted|Weighted)\s*GPA\s*([\d.]+)",
413
  text,
 
437
  if virtual_grade_match:
438
  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
439
 
440
+ # Extract requirements section - more robust table parsing
441
+ req_section = re.search(
442
+ r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)(?:\n\s*\n|$)",
443
+ text,
444
+ re.DOTALL | re.IGNORECASE
445
+ )
446
+
447
  if req_section:
448
  req_lines = [line.strip() for line in req_section.group(1).split('\n') if line.strip()]
449
  for line in req_lines:
450
  if '|' in line: # Table format
451
+ parts = [part.strip() for part in line.split('|') if part.strip()]
452
+ if len(parts) >= 5: # More lenient check for number of columns
453
+ try:
454
+ code = parts[0] if len(parts) > 0 else ""
455
+ description = parts[1] if len(parts) > 1 else ""
456
+ required = float(parts[2]) if len(parts) > 2 and parts[2].replace('.','').isdigit() else 0.0
457
+ waived = float(parts[3]) if len(parts) > 3 and parts[3].replace('.','').isdigit() else 0.0
458
+ completed = float(parts[4]) if len(parts) > 4 and parts[4].replace('.','').isdigit() else 0.0
459
+ status = parts[5] if len(parts) > 5 else ""
460
+
461
+ # Extract percentage if available
462
+ percent = 0.0
463
+ if status:
464
+ percent_match = re.search(r"(\d+)%", status)
465
+ if percent_match:
466
+ percent = float(percent_match.group(1))
467
 
468
+ parsed_data['requirements'][code] = {
469
+ "description": description,
470
+ "required": required,
471
+ "waived": waived,
472
+ "completed": completed,
473
+ "percent_complete": percent,
474
+ "status": status
475
+ }
476
+ except (IndexError, ValueError) as e:
477
+ logging.warning(f"Skipping malformed requirement line: {line}. Error: {str(e)}")
478
+ continue
479
 
480
+ # Extract assessments section
481
+ assess_section = re.search(r"Z-Assessment.*?\n(.*?)(?:\n\s*\n|$)", text, re.DOTALL | re.IGNORECASE)
482
  if assess_section:
483
  assess_lines = [line.strip() for line in assess_section.group(1).split('\n') if line.strip()]
484
  for line in assess_lines:
485
  if '|' in line:
486
+ parts = [part.strip() for part in line.split('|') if part.strip()]
487
  if len(parts) >= 5 and parts[0].startswith('Z-'):
488
  name = parts[0].replace('Z-', '').strip()
489
+ status = parts[4] if len(parts) > 4 else ""
490
  parsed_data['assessments'][name] = status
491
 
492
  # Extract course history with more fault-tolerant parsing
 
503
  ]
504
 
505
  for line in course_lines:
506
+ parts = [part.strip() for part in line.split('|') if part.strip()]
507
 
508
+ # More robust handling of course data
509
+ try:
510
  course = {
511
  'requirement': parts[0] if len(parts) > 0 else "",
512
  'school_year': parts[1] if len(parts) > 1 else "",
 
521
  }
522
 
523
  # Handle "inProgress" and empty credits
524
+ if "inprogress" in course['credits'].lower() or not course['credits']:
525
  course['credits'] = "0"
526
  elif not course['credits'].replace('.','').isdigit():
527
  course['credits'] = "0"
528
 
529
  parsed_data['course_history'].append(course)
530
+ except (IndexError, ValueError) as e:
531
+ logging.warning(f"Skipping malformed course line: {line}. Error: {str(e)}")
532
+ continue
533
 
534
  return parsed_data
535
 
536
  except Exception as e:
537
+ logging.warning(f"Miami-Dade transcript parsing failed: {str(e)}")
538
  return None
539
 
540
  def _parse_simplified_transcript(self, text: str) -> Dict:
 
579
  logging.warning(f"Pattern {pattern} failed: {str(e)}")
580
  continue
581
 
582
+ return None
583
 
584
  # ========== ENHANCED ANALYSIS FUNCTIONS ==========
585
  def analyze_gpa(parsed_data: Dict) -> str:
 
990
  result += "You may benefit from combining different learning approaches:\n"
991
  for style in primary_styles:
992
  result += f"\n**{style}** techniques:\n"
993
+ for tip in style_info['tips'][:2]:
994
  result += f"- {tip}\n"
995
 
996
  result += f"\n**{style}** career suggestions:\n"
997
+ for career in style_info['careers'][:3]:
998
  result += f"- {career}\n"
999
 
1000
  return result