Dannyar608 committed on
Commit
2e08701
·
verified ·
1 Parent(s): 17a6b1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -130
app.py CHANGED
@@ -252,26 +252,10 @@ class LearningStyleQuiz:
252
  # Initialize learning style quiz
253
  learning_style_quiz = LearningStyleQuiz()
254
 
255
- # ========== TRANSCRIPT PARSER ==========
256
- class MiamiDadeTranscriptParser:
257
  def __init__(self):
258
- # Patterns for both transcript formats
259
- self.format1_patterns = {
260
- 'student_info': re.compile(
261
- r"(\d{7}) - (.*?)\s*\|\s*Current Grade:\s*(\d+)\s*\|\s*YOG\s*(\d{4})"
262
- r"\s*\|\s*Weighted GPA\s*([\d.]+)\s*\|\s*Comm Serv Date\s*(\d{2}/\d{2}/\d{4})"
263
- r"\s*\|\s*Total Credits Earned\s*([\d.]+)"
264
- ),
265
- 'requirement': re.compile(
266
- r"([A-Z]-[A-Za-z ]+)\s*\|\s*([^|]+)\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([^|]+)%"
267
- ),
268
- 'course': re.compile(
269
- r"([A-Z]-[A-Za-z ]+)\s*\|\s*(\d{4}-\d{4})\s*\|\s*(\d{2})\s*\|\s*([A-Z0-9]+)\s*\|\s*([^|]+)\|"
270
- r"\s*([A-Z0-9])\s*\|\s*(\d+)\s*\|\s*([A-Z])\s*\|\s*([A-Z])\s*\|\s*([\d.]+|inProgress)"
271
- )
272
- }
273
-
274
- self.format2_patterns = {
275
  'student_info': re.compile(
276
  r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
277
  r"GRADE LEVEL:\s*(\d+).*?"
@@ -297,98 +281,42 @@ class MiamiDadeTranscriptParser:
297
  r"BIOLOGY ASSESSMENT PASSED|"
298
  r"DISTRICT COMM/VOL SERVICE RQMT MET:\s*(YES).*?HRS:\s*(\d+)",
299
  re.DOTALL
 
 
 
 
300
  )
301
  }
302
-
303
  def parse_transcript(self, file_path: str) -> Dict:
304
- """Parse Miami-Dade transcript PDF, automatically detecting format"""
305
- with pdfplumber.open(file_path) as pdf:
306
- text = "\n".join(page.extract_text() for page in pdf.pages)
307
-
308
- # Clean up text
309
- text = re.sub(r'\s+', ' ', text)
310
- text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)
311
-
312
- # Detect format
313
- if "GRADUATION PROGRESS SUMMARY" in text:
314
- return self._parse_format1(text)
315
- elif "CUMULATIVE SUMMARY" in text:
316
- return self._parse_format2(text)
317
- else:
318
- raise ValueError("Unrecognized transcript format")
319
-
320
- def _parse_format1(self, text: str) -> Dict:
321
- """Parse the first transcript format"""
322
- parsed_data = {
323
- 'student_info': self._parse_format1_student_info(text),
324
- 'requirements': self._parse_format1_requirements(text),
325
- 'course_history': self._parse_format1_courses(text),
326
- 'format': 'progress_summary'
327
- }
328
- return parsed_data
329
-
330
- def _parse_format1_student_info(self, text: str) -> Dict:
331
- """Extract student information from format 1"""
332
- match = self.format1_patterns['student_info'].search(text)
333
- if not match:
334
- return {}
335
-
336
- return {
337
- 'id': match.group(1),
338
- 'name': match.group(2).strip(),
339
- 'grade': match.group(3),
340
- 'year_of_graduation': match.group(4),
341
- 'weighted_gpa': float(match.group(5)),
342
- 'community_service_date': match.group(6),
343
- 'total_credits': float(match.group(7)),
344
- 'district': 'Miami-Dade'
345
- }
346
-
347
- def _parse_format1_requirements(self, text: str) -> Dict:
348
- """Parse graduation requirements section from format 1"""
349
- requirements = {}
350
- for match in self.format1_patterns['requirement'].finditer(text):
351
- requirements[match.group(1).strip()] = {
352
- 'description': match.group(2).strip(),
353
- 'required': float(match.group(3)),
354
- 'waived': float(match.group(4)),
355
- 'completed': float(match.group(5)),
356
- 'percent_complete': float(match.group(6))
357
- }
358
- return requirements
359
-
360
- def _parse_format1_courses(self, text: str) -> List[Dict]:
361
- """Parse course history section from format 1"""
362
- courses = []
363
- for match in self.format1_patterns['course'].finditer(text):
364
- courses.append({
365
- 'requirement': match.group(1).strip(),
366
- 'school_year': match.group(2),
367
- 'grade_level': match.group(3),
368
- 'course_code': match.group(4),
369
- 'description': match.group(5).strip(),
370
- 'term': match.group(6),
371
- 'district_number': match.group(7),
372
- 'included': match.group(8),
373
- 'credits': 0 if 'inProgress' in match.group(9) else float(match.group(9)),
374
- 'status': 'In Progress' if 'inProgress' in match.group(9) else 'Completed'
375
- })
376
- return courses
377
 
378
- def _parse_format2(self, text: str) -> Dict:
379
- """Parse the second transcript format"""
380
  parsed_data = {
381
- 'student_info': self._parse_format2_student_info(text),
382
- 'academic_summary': self._parse_format2_academic_summary(text),
383
- 'course_history': self._parse_format2_courses(text),
384
- 'assessments': self._parse_format2_assessments(text),
385
- 'format': 'cumulative_summary'
386
  }
387
  return parsed_data
388
 
389
- def _parse_format2_student_info(self, text: str) -> Dict:
390
- """Extract student information from format 2"""
391
- match = self.format2_patterns['student_info'].search(text)
392
  if not match:
393
  return {}
394
 
@@ -411,10 +339,11 @@ class MiamiDadeTranscriptParser:
411
  eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
412
  return eth_match.group(1).strip() if eth_match else None
413
 
414
- def _parse_format2_academic_summary(self, text: str) -> Dict:
415
- """Parse academic summary section from format 2"""
416
- gpa_match = self.format2_patterns['gpa'].search(text)
417
- credits_matches = self.format2_patterns['credits'].finditer(text)
 
418
 
419
  summary = {
420
  'gpa': {
@@ -422,7 +351,10 @@ class MiamiDadeTranscriptParser:
422
  'state': float(gpa_match.group(2)) if gpa_match else None
423
  },
424
  'credits': {},
425
- 'class_rank': self._extract_class_rank(text)
 
 
 
426
  }
427
 
428
  for match in credits_matches:
@@ -435,21 +367,10 @@ class MiamiDadeTranscriptParser:
435
 
436
  return summary
437
 
438
- def _extract_class_rank(self, text: str) -> Dict:
439
- """Extract class rank information"""
440
- rank_match = re.search(
441
- r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
442
- text
443
- )
444
- return {
445
- 'percentile': int(rank_match.group(1)) if rank_match else None,
446
- 'class_size': int(rank_match.group(2)) if rank_match else None
447
- }
448
-
449
- def _parse_format2_courses(self, text: str) -> List[Dict]:
450
- """Parse course history section from format 2"""
451
  courses = []
452
- for match in self.format2_patterns['course'].finditer(text):
453
  courses.append({
454
  'term': match.group(1),
455
  'course_code': match.group(2),
@@ -463,9 +384,9 @@ class MiamiDadeTranscriptParser:
463
  })
464
  return courses
465
 
466
- def _parse_format2_assessments(self, text: str) -> Dict:
467
- """Parse assessment and requirement information from format 2"""
468
- matches = self.format2_patterns['assessment'].finditer(text)
469
  assessments = {
470
  'ela_passed_date': None,
471
  'algebra_passed': False,
@@ -491,8 +412,8 @@ class MiamiDadeTranscriptParser:
491
 
492
  return assessments
493
 
494
- # Initialize transcript parser
495
- transcript_parser = MiamiDadeTranscriptParser()
496
 
497
  # ========== ACADEMIC ANALYZER ==========
498
  class AcademicAnalyzer:
@@ -600,7 +521,6 @@ class AcademicAnalyzer:
600
 
601
  try:
602
  if parsed_data.get('format') == 'progress_summary':
603
- # Format 1 analysis
604
  total_match = re.search(r'Total\s*\|\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)%', text)
605
  if total_match:
606
  analysis['completion_percentage'] = float(total_match.group(4))
@@ -628,7 +548,6 @@ class AcademicAnalyzer:
628
  if req and float(req.get('completed', 0)) < float(req.get('required', 0))
629
  ]
630
  else:
631
- # Format 2 analysis
632
  credits = parsed_data.get('academic_summary', {}).get('credits', {})
633
  total_required = sum(
634
  v.get('required', 0)
@@ -1466,7 +1385,7 @@ class EnhancedTeachingAssistant:
1466
  service_hours = transcript.get('student_info', {}).get('community_service_hours', 0)
1467
  else:
1468
  gpa = transcript.get('academic_summary', {}).get('gpa', {}).get('district', None)
1469
- service_hours = transcript.get('assessments', {}).get('community_service', {}).get('hours', 0)
1470
 
1471
  learning_style = re.search(r"Your primary learning style is\s*\*\*(.*?)\*\*",
1472
  profile.get('learning_style', ''))
 
252
  # Initialize learning style quiz
253
  learning_style_quiz = LearningStyleQuiz()
254
 
255
+ # ========== ENHANCED TRANSCRIPT PARSER ==========
256
+ class EnhancedMiamiDadeTranscriptParser:
257
  def __init__(self):
258
+ self.patterns = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  'student_info': re.compile(
260
  r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
261
  r"GRADE LEVEL:\s*(\d+).*?"
 
281
  r"BIOLOGY ASSESSMENT PASSED|"
282
  r"DISTRICT COMM/VOL SERVICE RQMT MET:\s*(YES).*?HRS:\s*(\d+)",
283
  re.DOTALL
284
+ ),
285
+ 'class_rank': re.compile(
286
+ r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
287
+ re.DOTALL
288
  )
289
  }
290
+
291
  def parse_transcript(self, file_path: str) -> Dict:
292
+ """Parse Miami-Dade transcript PDF with enhanced pattern matching"""
293
+ try:
294
+ with pdfplumber.open(file_path) as pdf:
295
+ text = "\n".join(page.extract_text() for page in pdf.pages)
296
+
297
+ # Clean up text
298
+ text = re.sub(r'\s+', ' ', text)
299
+ text = re.sub(r'(?<=\d)\s+(?=\d)', '', text)
300
+
301
+ return self._parse_format(text)
302
+ except Exception as e:
303
+ logger.error(f"Error parsing transcript: {str(e)}")
304
+ raise ValueError(f"Error processing transcript: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
+ def _parse_format(self, text: str) -> Dict:
307
+ """Parse the transcript format shown in the example"""
308
  parsed_data = {
309
+ 'student_info': self._parse_student_info(text),
310
+ 'academic_summary': self._parse_academic_summary(text),
311
+ 'course_history': self._parse_courses(text),
312
+ 'assessments': self._parse_assessments(text),
313
+ 'format': 'cumulative_summary_v2'
314
  }
315
  return parsed_data
316
 
317
+ def _parse_student_info(self, text: str) -> Dict:
318
+ """Extract student information"""
319
+ match = self.patterns['student_info'].search(text)
320
  if not match:
321
  return {}
322
 
 
339
  eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
340
  return eth_match.group(1).strip() if eth_match else None
341
 
342
+ def _parse_academic_summary(self, text: str) -> Dict:
343
+ """Parse academic summary section"""
344
+ gpa_match = self.patterns['gpa'].search(text)
345
+ credits_matches = self.patterns['credits'].finditer(text)
346
+ rank_match = self.patterns['class_rank'].search(text)
347
 
348
  summary = {
349
  'gpa': {
 
351
  'state': float(gpa_match.group(2)) if gpa_match else None
352
  },
353
  'credits': {},
354
+ 'class_rank': {
355
+ 'percentile': int(rank_match.group(1)) if rank_match else None,
356
+ 'class_size': int(rank_match.group(2)) if rank_match else None
357
+ }
358
  }
359
 
360
  for match in credits_matches:
 
367
 
368
  return summary
369
 
370
+ def _parse_courses(self, text: str) -> List[Dict]:
371
+ """Parse course history section"""
 
 
 
 
 
 
 
 
 
 
 
372
  courses = []
373
+ for match in self.patterns['course'].finditer(text):
374
  courses.append({
375
  'term': match.group(1),
376
  'course_code': match.group(2),
 
384
  })
385
  return courses
386
 
387
+ def _parse_assessments(self, text: str) -> Dict:
388
+ """Parse assessment and requirement information"""
389
+ matches = self.patterns['assessment'].finditer(text)
390
  assessments = {
391
  'ela_passed_date': None,
392
  'algebra_passed': False,
 
412
 
413
  return assessments
414
 
415
+ # Initialize the enhanced parser
416
+ transcript_parser = EnhancedMiamiDadeTranscriptParser()
417
 
418
  # ========== ACADEMIC ANALYZER ==========
419
  class AcademicAnalyzer:
 
521
 
522
  try:
523
  if parsed_data.get('format') == 'progress_summary':
 
524
  total_match = re.search(r'Total\s*\|\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)%', text)
525
  if total_match:
526
  analysis['completion_percentage'] = float(total_match.group(4))
 
548
  if req and float(req.get('completed', 0)) < float(req.get('required', 0))
549
  ]
550
  else:
 
551
  credits = parsed_data.get('academic_summary', {}).get('credits', {})
552
  total_required = sum(
553
  v.get('required', 0)
 
1385
  service_hours = transcript.get('student_info', {}).get('community_service_hours', 0)
1386
  else:
1387
  gpa = transcript.get('academic_summary', {}).get('gpa', {}).get('district', None)
1388
+ service_hours = transcript.get('assessments', {}).get('community_service', {}).get('hours', 0)
1389
 
1390
  learning_style = re.search(r"Your primary learning style is\s*\*\*(.*?)\*\*",
1391
  profile.get('learning_style', ''))