Dannyar608 committed on
Commit
33513a8
·
verified ·
1 Parent(s): dc1d757

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -52
app.py CHANGED
@@ -66,6 +66,15 @@ class ModelLoader:
66
 
67
  def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
68
  """Lazy load the model with progress feedback"""
 
 
 
 
 
 
 
 
 
69
  try:
70
  if progress:
71
  progress(0.1, desc="Checking GPU availability...")
@@ -117,6 +126,8 @@ class ModelLoader:
117
  self.error = f"Model loading failed: {str(e)}"
118
  logging.error(self.error)
119
  return None, None
 
 
120
 
121
  # Initialize model loader
122
  model_loader = ModelLoader()
@@ -170,6 +181,13 @@ def validate_file(file_obj) -> None:
170
  raise ValueError(f"File too large. Maximum size is {MAX_FILE_SIZE_MB}MB.")
171
 
172
  # ========== TEXT EXTRACTION FUNCTIONS ==========
 
 
 
 
 
 
 
173
  def extract_text_from_file(file_path: str, file_ext: str) -> str:
174
  text = ""
175
 
@@ -312,6 +330,8 @@ class TranscriptParser:
312
  def parse_transcript(self, text: str) -> Dict:
313
  """Parse transcript text and return structured data"""
314
  try:
 
 
315
  # First try the new detailed parser
316
  parsed_data = self._parse_detailed_transcript(text)
317
  if parsed_data:
@@ -349,28 +369,29 @@ class TranscriptParser:
349
  if yog_match:
350
  parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
351
 
352
- # Improved GPA extraction
353
- gpa_matches = re.findall(r"(?:Un-weighted|Weighted)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
354
- if len(gpa_matches) >= 2:
355
  parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
 
356
  parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
357
 
358
  # Community service info
359
- service_hours_match = re.search(r"Comm\s*Serv\s*Hours\s*(\d+)", text, re.IGNORECASE)
360
  if service_hours_match:
361
  parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
362
 
363
- service_date_match = re.search(r"Comm\s*Serv\s*Date\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
364
  if service_date_match:
365
  parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
366
 
367
  # Credits info
368
- credits_match = re.search(r"Total\s*Credits\s*Earned\s*([\d.]+)", text, re.IGNORECASE)
369
  if credits_match:
370
  parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
371
 
372
  # Virtual grade
373
- virtual_grade_match = re.search(r"Virtual\s*Grade\s*(\w+)", text, re.IGNORECASE)
374
  if virtual_grade_match:
375
  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
376
 
@@ -379,10 +400,10 @@ class TranscriptParser:
379
  for match in req_pattern.finditer(text):
380
  code = match.group(1).strip()
381
  desc = match.group(2).strip()
382
- required = float(match.group(3))
383
- waived = float(match.group(4))
384
- completed = float(match.group(5))
385
- percent = float(match.group(6))
386
  parsed_data['requirements'][code] = {
387
  "description": desc,
388
  "required": required,
@@ -392,7 +413,7 @@ class TranscriptParser:
392
  }
393
 
394
  # Extract assessments with more flexible pattern
395
- assess_pattern = re.compile(r"Z-Assessment:\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%")
396
  for match in assess_pattern.finditer(text):
397
  name = f"Assessment: {match.group(1).strip()}"
398
  status = match.group(3).strip()
@@ -406,22 +427,22 @@ class TranscriptParser:
406
  parsed_data['assessments'][z_item] = status
407
 
408
  # Extract course history with more robust pattern
409
- course_history_section = re.search(r"Requirement.*?School Year.*?GradeLv1.*?CrsNum.*?Description.*?Term.*?DstNumber.*?FG.*?Incl.*?Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
410
  if course_history_section:
411
  course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip()]
412
  for line in course_lines:
413
  parts = [part.strip() for part in line.split('|')]
414
  if len(parts) >= 9:
415
  course = {
416
- 'requirement': parts[0],
417
- 'school_year': parts[1],
418
- 'grade_level': parts[2],
419
- 'course_code': parts[3],
420
- 'description': parts[4],
421
- 'term': parts[5],
422
- 'district_number': parts[6],
423
- 'fg': parts[7],
424
- 'included': parts[8],
425
  'credits': parts[9] if len(parts) > 9 else "0"
426
  }
427
  parsed_data['course_history'].append(course)
@@ -435,7 +456,7 @@ class TranscriptParser:
435
  def _parse_simplified_transcript(self, text: str) -> Dict:
436
  """Fallback simplified transcript parser with multiple pattern attempts"""
437
  patterns = [
438
- (r'(?:Course|Subject)\s*Code.*?Grade.*?Credits(.*?)(?:\n\s*\n|\Z)', 'table'),
439
  (r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'line'),
440
  (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal')
441
  ]
@@ -444,8 +465,10 @@ class TranscriptParser:
444
  try:
445
  if pattern_type == 'table':
446
  # Parse tabular data
447
- courses = re.findall(r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)',
448
- re.search(pattern, text, re.DOTALL).group(1))
 
 
449
  elif pattern_type == 'line':
450
  courses = re.findall(pattern, text)
451
  else:
@@ -454,14 +477,22 @@ class TranscriptParser:
454
  if courses:
455
  parsed_data = {'course_history': []}
456
  for course in courses:
457
- parsed_data['course_history'].append({
458
- 'course_code': course[0].strip(),
459
- 'description': course[1].strip() if len(course) > 1 else '',
460
- 'grade': course[2].strip() if len(course) > 2 else '',
461
- 'credits': float(course[3]) if len(course) > 3 else 0.0
462
- })
 
 
 
 
 
 
 
463
  return parsed_data
464
- except:
 
465
  continue
466
 
467
  raise ValueError("Could not identify course information in transcript")
@@ -469,7 +500,7 @@ class TranscriptParser:
469
  # ========== ENHANCED ANALYSIS FUNCTIONS ==========
470
  def analyze_gpa(parsed_data: Dict) -> str:
471
  try:
472
- gpa = float(parsed_data['student_info']['weighted_gpa'])
473
  if gpa >= 4.5:
474
  return "🌟 Excellent GPA! You're in the top tier of students."
475
  elif gpa >= 3.5:
@@ -484,15 +515,15 @@ def analyze_gpa(parsed_data: Dict) -> str:
484
  def analyze_graduation_status(parsed_data: Dict) -> str:
485
  try:
486
  total_required = sum(
487
- float(req['required'])
488
- for req in parsed_data['requirements'].values()
489
- if req.get('required') and str(req['required']).replace('.', '').isdigit()
490
  )
491
 
492
  total_completed = sum(
493
- float(req['completed'])
494
- for req in parsed_data['requirements'].values()
495
- if req.get('completed') and str(req['completed']).replace('.', '').isdigit()
496
  )
497
 
498
  completion_percentage = (total_completed / total_required) * 100 if total_required > 0 else 0
@@ -513,7 +544,7 @@ def generate_advice(parsed_data: Dict) -> str:
513
 
514
  # GPA advice
515
  try:
516
- gpa = float(parsed_data['student_info']['weighted_gpa'])
517
  if gpa < 3.0:
518
  advice.append("πŸ“š Your GPA could improve. Consider:\n- Seeking tutoring for challenging subjects\n- Meeting with teachers during office hours\n- Developing better study habits")
519
  except (TypeError, ValueError, KeyError, AttributeError):
@@ -521,7 +552,7 @@ def generate_advice(parsed_data: Dict) -> str:
521
 
522
  # Community service advice
523
  try:
524
- service_hours = int(parsed_data['student_info']['community_service_hours'])
525
  if service_hours < 100:
526
  advice.append("🀝 Consider more community service:\n- Many colleges value 100+ hours\n- Look for opportunities that align with your interests")
527
  except (TypeError, ValueError, KeyError, AttributeError):
@@ -530,19 +561,20 @@ def generate_advice(parsed_data: Dict) -> str:
530
  # Missing requirements advice
531
  try:
532
  missing_reqs = [
533
- req for code, req in parsed_data['requirements'].items()
534
- if float(req['percent_complete']) < 100 and not code.startswith("Z-Assessment")
535
  ]
536
 
537
  if missing_reqs:
538
- req_list = "\n- ".join([f"{code}: {req['description']}" for code, req in missing_reqs])
539
  advice.append(f"πŸŽ“ Focus on completing these requirements:\n- {req_list}")
540
  except (TypeError, ValueError, KeyError, AttributeError):
541
  pass
542
 
543
  # Course rigor advice
544
  try:
545
- ap_count = sum(1 for course in parsed_data['course_history'] if "Advanced Placement" in course['description'])
 
546
  if ap_count < 3:
547
  advice.append("🧠 Consider taking more challenging courses:\n- AP/IB courses can strengthen college applications\n- Shows academic rigor to admissions officers")
548
  except (TypeError, KeyError, AttributeError):
@@ -552,9 +584,10 @@ def generate_advice(parsed_data: Dict) -> str:
552
 
553
  def generate_college_recommendations(parsed_data: Dict) -> str:
554
  try:
555
- gpa = float(parsed_data['student_info']['weighted_gpa'])
556
- ap_count = sum(1 for course in parsed_data['course_history'] if "Advanced Placement" in course['description'])
557
- service_hours = int(parsed_data['student_info']['community_service_hours']) if parsed_data['student_info'].get('community_service_hours') else 0
 
558
 
559
  recommendations = []
560
 
@@ -589,8 +622,8 @@ def create_gpa_visualization(parsed_data: Dict):
589
  gpa_data = {
590
  "Type": ["Weighted GPA", "Unweighted GPA"],
591
  "Value": [
592
- float(parsed_data['student_info']['weighted_gpa']),
593
- float(parsed_data['student_info']['unweighted_gpa'])
594
  ]
595
  }
596
  df = pd.DataFrame(gpa_data)
@@ -606,8 +639,8 @@ def create_gpa_visualization(parsed_data: Dict):
606
  def create_requirements_visualization(parsed_data: Dict):
607
  try:
608
  req_data = []
609
- for code, req in parsed_data['requirements'].items():
610
- if req.get('percent_complete'):
611
  completion = float(req['percent_complete'])
612
  req_data.append({
613
  "Requirement": code,
@@ -663,6 +696,8 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
663
  parser = TranscriptParser()
664
  try:
665
  parsed_data = parser.parse_transcript(text)
 
 
666
  except Exception as e:
667
  raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
668
 
 
66
 
67
  def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
68
  """Lazy load the model with progress feedback"""
69
+ if self.loaded:
70
+ return self.model, self.tokenizer
71
+
72
+ if self.loading:
73
+ while self.loading:
74
+ time.sleep(0.1)
75
+ return self.model, self.tokenizer
76
+
77
+ self.loading = True
78
  try:
79
  if progress:
80
  progress(0.1, desc="Checking GPU availability...")
 
126
  self.error = f"Model loading failed: {str(e)}"
127
  logging.error(self.error)
128
  return None, None
129
+ finally:
130
+ self.loading = False
131
 
132
  # Initialize model loader
133
  model_loader = ModelLoader()
 
181
  raise ValueError(f"File too large. Maximum size is {MAX_FILE_SIZE_MB}MB.")
182
 
183
  # ========== TEXT EXTRACTION FUNCTIONS ==========
184
def preprocess_text(text: str) -> str:
    """Normalize transcript text for more reliable parsing.

    Runs of horizontal whitespace are collapsed to a single space, spaces
    touching a line break are trimmed, runs of blank lines are reduced to a
    single blank line, and the result is upper-cased.

    Line breaks and ``|`` characters are deliberately preserved: the
    transcript parsers in this file split course rows on ``'\\n'`` and
    ``'|'`` and use blank lines (``\\n\\s*\\n``) as section terminators, so
    flattening either one would make those patterns unmatchable.

    Args:
        text: Raw text extracted from a transcript file.

    Returns:
        The normalized, upper-cased text.
    """
    # Unify line endings first so the newline-aware patterns below see "\n" only.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse spaces, tabs and other non-newline whitespace, but keep line breaks.
    text = re.sub(r'[^\S\n]+', ' ', text)
    # Drop the single leftover space on either side of a line break.
    text = re.sub(r' ?\n ?', '\n', text)
    # Keep at most one blank line so section boundaries stay detectable.
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Standardize case; downstream regexes are matched case-insensitively.
    return text.upper()
190
+
191
  def extract_text_from_file(file_path: str, file_ext: str) -> str:
192
  text = ""
193
 
 
330
  def parse_transcript(self, text: str) -> Dict:
331
  """Parse transcript text and return structured data"""
332
  try:
333
+ text = preprocess_text(text)
334
+
335
  # First try the new detailed parser
336
  parsed_data = self._parse_detailed_transcript(text)
337
  if parsed_data:
 
369
  if yog_match:
370
  parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
371
 
372
+ # Improved GPA extraction with more flexible patterns
373
+ gpa_matches = re.findall(r"(?:UNWEIGHTED|WEIGHTED)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
374
+ if len(gpa_matches) >= 1:
375
  parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
376
+ if len(gpa_matches) >= 2:
377
  parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
378
 
379
  # Community service info
380
+ service_hours_match = re.search(r"COMM\s*SERV\s*HOURS\s*(\d+)", text, re.IGNORECASE)
381
  if service_hours_match:
382
  parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
383
 
384
+ service_date_match = re.search(r"COMM\s*SERV\s*DATE\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
385
  if service_date_match:
386
  parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
387
 
388
  # Credits info
389
+ credits_match = re.search(r"TOTAL\s*CREDITS\s*EARNED\s*([\d.]+)", text, re.IGNORECASE)
390
  if credits_match:
391
  parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
392
 
393
  # Virtual grade
394
+ virtual_grade_match = re.search(r"VIRTUAL\s*GRADE\s*(\w+)", text, re.IGNORECASE)
395
  if virtual_grade_match:
396
  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
397
 
 
400
  for match in req_pattern.finditer(text):
401
  code = match.group(1).strip()
402
  desc = match.group(2).strip()
403
+ required = float(match.group(3)) if match.group(3) else 0.0
404
+ waived = float(match.group(4)) if match.group(4) else 0.0
405
+ completed = float(match.group(5)) if match.group(5) else 0.0
406
+ percent = float(match.group(6)) if match.group(6) else 0.0
407
  parsed_data['requirements'][code] = {
408
  "description": desc,
409
  "required": required,
 
413
  }
414
 
415
  # Extract assessments with more flexible pattern
416
+ assess_pattern = re.compile(r"Z-ASSESSMENT:\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%", re.IGNORECASE)
417
  for match in assess_pattern.finditer(text):
418
  name = f"Assessment: {match.group(1).strip()}"
419
  status = match.group(3).strip()
 
427
  parsed_data['assessments'][z_item] = status
428
 
429
  # Extract course history with more robust pattern
430
+ course_history_section = re.search(r"REQUIREMENT.*?SCHOOL YEAR.*?GRADELV1.*?CRSNUM.*?DESCRIPTION.*?TERM.*?DSTNUMBER.*?FG.*?INCL.*?CREDITS(.*?)(?:\n\s*\n|$)", text, re.DOTALL | re.IGNORECASE)
431
  if course_history_section:
432
  course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip()]
433
  for line in course_lines:
434
  parts = [part.strip() for part in line.split('|')]
435
  if len(parts) >= 9:
436
  course = {
437
+ 'requirement': parts[0] if len(parts) > 0 else "",
438
+ 'school_year': parts[1] if len(parts) > 1 else "",
439
+ 'grade_level': parts[2] if len(parts) > 2 else "",
440
+ 'course_code': parts[3] if len(parts) > 3 else "",
441
+ 'description': parts[4] if len(parts) > 4 else "",
442
+ 'term': parts[5] if len(parts) > 5 else "",
443
+ 'district_number': parts[6] if len(parts) > 6 else "",
444
+ 'fg': parts[7] if len(parts) > 7 else "",
445
+ 'included': parts[8] if len(parts) > 8 else "",
446
  'credits': parts[9] if len(parts) > 9 else "0"
447
  }
448
  parsed_data['course_history'].append(course)
 
456
  def _parse_simplified_transcript(self, text: str) -> Dict:
457
  """Fallback simplified transcript parser with multiple pattern attempts"""
458
  patterns = [
459
+ (r'(?:COURSE|SUBJECT)\s*CODE.*?GRADE.*?CREDITS(.*?)(?:\n\s*\n|\Z)', 'table'),
460
  (r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'line'),
461
  (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal')
462
  ]
 
465
  try:
466
  if pattern_type == 'table':
467
  # Parse tabular data
468
+ table_section = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
469
+ if table_section:
470
+ courses = re.findall(r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)',
471
+ table_section.group(1))
472
  elif pattern_type == 'line':
473
  courses = re.findall(pattern, text)
474
  else:
 
477
  if courses:
478
  parsed_data = {'course_history': []}
479
  for course in courses:
480
+ if len(course) >= 4:
481
+ parsed_data['course_history'].append({
482
+ 'course_code': course[0].strip(),
483
+ 'description': course[1].strip(),
484
+ 'grade': course[2].strip(),
485
+ 'credits': float(course[3]) if course[3] else 0.0
486
+ })
487
+ elif len(course) == 3:
488
+ parsed_data['course_history'].append({
489
+ 'description': course[0].strip(),
490
+ 'grade': course[1].strip(),
491
+ 'credits': float(course[2]) if course[2] else 0.0
492
+ })
493
  return parsed_data
494
+ except Exception as e:
495
+ logging.warning(f"Pattern {pattern} failed: {str(e)}")
496
  continue
497
 
498
  raise ValueError("Could not identify course information in transcript")
 
500
  # ========== ENHANCED ANALYSIS FUNCTIONS ==========
501
  def analyze_gpa(parsed_data: Dict) -> str:
502
  try:
503
+ gpa = float(parsed_data['student_info'].get('weighted_gpa', 0))
504
  if gpa >= 4.5:
505
  return "🌟 Excellent GPA! You're in the top tier of students."
506
  elif gpa >= 3.5:
 
515
  def analyze_graduation_status(parsed_data: Dict) -> str:
516
  try:
517
  total_required = sum(
518
+ float(req.get('required', 0))
519
+ for req in parsed_data.get('requirements', {}).values()
520
+ if req and str(req.get('required', '0')).replace('.', '').isdigit()
521
  )
522
 
523
  total_completed = sum(
524
+ float(req.get('completed', 0))
525
+ for req in parsed_data.get('requirements', {}).values()
526
+ if req and str(req.get('completed', '0')).replace('.', '').isdigit()
527
  )
528
 
529
  completion_percentage = (total_completed / total_required) * 100 if total_required > 0 else 0
 
544
 
545
  # GPA advice
546
  try:
547
+ gpa = float(parsed_data.get('student_info', {}).get('weighted_gpa', 0))
548
  if gpa < 3.0:
549
  advice.append("πŸ“š Your GPA could improve. Consider:\n- Seeking tutoring for challenging subjects\n- Meeting with teachers during office hours\n- Developing better study habits")
550
  except (TypeError, ValueError, KeyError, AttributeError):
 
552
 
553
  # Community service advice
554
  try:
555
+ service_hours = int(parsed_data.get('student_info', {}).get('community_service_hours', 0))
556
  if service_hours < 100:
557
  advice.append("🀝 Consider more community service:\n- Many colleges value 100+ hours\n- Look for opportunities that align with your interests")
558
  except (TypeError, ValueError, KeyError, AttributeError):
 
561
  # Missing requirements advice
562
  try:
563
  missing_reqs = [
564
+ req for code, req in parsed_data.get('requirements', {}).items()
565
+ if req and float(req.get('percent_complete', 0)) < 100 and not code.startswith("Z-Assessment")
566
  ]
567
 
568
  if missing_reqs:
569
+ req_list = "\n- ".join([f"{code}: {req.get('description', '')}" for code, req in missing_reqs])
570
  advice.append(f"πŸŽ“ Focus on completing these requirements:\n- {req_list}")
571
  except (TypeError, ValueError, KeyError, AttributeError):
572
  pass
573
 
574
  # Course rigor advice
575
  try:
576
+ ap_count = sum(1 for course in parsed_data.get('course_history', [])
577
+ if course and "ADVANCED PLACEMENT" in course.get('description', '').upper())
578
  if ap_count < 3:
579
  advice.append("🧠 Consider taking more challenging courses:\n- AP/IB courses can strengthen college applications\n- Shows academic rigor to admissions officers")
580
  except (TypeError, KeyError, AttributeError):
 
584
 
585
  def generate_college_recommendations(parsed_data: Dict) -> str:
586
  try:
587
+ gpa = float(parsed_data.get('student_info', {}).get('weighted_gpa', 0))
588
+ ap_count = sum(1 for course in parsed_data.get('course_history', [])
589
+ if course and "ADVANCED PLACEMENT" in course.get('description', '').upper())
590
+ service_hours = int(parsed_data.get('student_info', {}).get('community_service_hours', 0))
591
 
592
  recommendations = []
593
 
 
622
  gpa_data = {
623
  "Type": ["Weighted GPA", "Unweighted GPA"],
624
  "Value": [
625
+ float(parsed_data.get('student_info', {}).get('weighted_gpa', 0)),
626
+ float(parsed_data.get('student_info', {}).get('unweighted_gpa', 0))
627
  ]
628
  }
629
  df = pd.DataFrame(gpa_data)
 
639
  def create_requirements_visualization(parsed_data: Dict):
640
  try:
641
  req_data = []
642
+ for code, req in parsed_data.get('requirements', {}).items():
643
+ if req and req.get('percent_complete'):
644
  completion = float(req['percent_complete'])
645
  req_data.append({
646
  "Requirement": code,
 
696
  parser = TranscriptParser()
697
  try:
698
  parsed_data = parser.parse_transcript(text)
699
+ if not parsed_data:
700
+ raise ValueError("No data could be parsed from the transcript.")
701
  except Exception as e:
702
  raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
703