Dannyar608 committed on
Commit
ba8e4ab
·
verified ·
1 Parent(s): 0869b6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -137
app.py CHANGED
@@ -155,7 +155,19 @@ class LearningStyleQuiz:
155
  "When learning a new skill, I prefer to:",
156
  "When studying, I like to:",
157
  "I prefer teachers who:",
158
- "When solving problems, I:"
 
 
 
 
 
 
 
 
 
 
 
 
159
  ]
160
 
161
  self.options = [
@@ -166,7 +178,19 @@ class LearningStyleQuiz:
166
  ["Watch demonstrations", "Listen to instructions", "Read instructions", "Jump in and try it"],
167
  ["Use highlighters and diagrams", "Discuss with others", "Read and take notes", "Move around or use objects"],
168
  ["Use visual aids", "Give interesting lectures", "Provide reading materials", "Include hands-on activities"],
169
- ["Draw pictures or diagrams", "Talk through options", "Make lists", "Try different solutions physically"]
 
 
 
 
 
 
 
 
 
 
 
 
170
  ]
171
 
172
  self.learning_styles = {
@@ -250,14 +274,14 @@ class LearningStyleQuiz:
250
  # Initialize learning style quiz
251
  learning_style_quiz = LearningStyleQuiz()
252
 
253
- class EnhancedMiamiDadeTranscriptParser:
254
  def __init__(self):
255
  self.patterns = {
256
  'student_info': re.compile(
257
- r"LEGAL NAME:\s*([A-Z]+,\s*[A-Z]+).*?"
258
  r"GRADE LEVEL:\s*(\d+).*?"
259
  r"FL STUDENT ID:\s*(\w+).*?"
260
- r"CURRENT SCHOOL:\s*(\d+\s+[\w\s]+?)\s*\(",
261
  re.DOTALL
262
  ),
263
  'gpa': re.compile(
@@ -269,7 +293,7 @@ class EnhancedMiamiDadeTranscriptParser:
269
  re.DOTALL
270
  ),
271
  'course': re.compile(
272
- r"(\d)\s+(\w+)\s+([\w\s]+?)\s+([A-Z]{2})\s+([A-Z])\s+([A-Z])\s+([A-Z])\s+([\d.]+)\s+([\d.]+)",
273
  re.DOTALL
274
  ),
275
  'assessment': re.compile(
@@ -282,92 +306,58 @@ class EnhancedMiamiDadeTranscriptParser:
282
  'class_rank': re.compile(
283
  r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
284
  re.DOTALL
 
 
 
 
285
  )
286
  }
287
 
288
  def parse_transcript(self, file_path: str) -> Dict:
289
- """Parse Miami-Dade transcript PDF with enhanced pattern matching"""
290
  try:
291
  # First try pdfplumber
 
292
  with pdfplumber.open(file_path) as pdf:
293
- text = "\n".join(page.extract_text() for page in pdf.pages)
294
-
 
295
  # Fallback to PyMuPDF if text extraction is poor
296
- if len(text) < 500: # If we got very little text
297
  doc = fitz.open(file_path)
298
  text = ""
299
  for page in doc:
300
  text += page.get_text()
301
 
302
- # Debug: Save extracted text
303
- with open("debug_transcript.txt", "w") as f:
304
- f.write(text)
305
-
306
- return self._parse_format(text)
307
  except Exception as e:
308
  logger.error(f"Error parsing transcript: {str(e)}")
309
  raise ValueError(f"Error processing transcript: {str(e)}")
310
 
311
- def _parse_format(self, text: str) -> Dict:
312
- """Parse the transcript format with improved error handling"""
313
- try:
314
- parsed_data = {
315
- 'student_info': self._parse_student_info(text),
316
- 'academic_summary': self._parse_academic_summary(text),
317
- 'course_history': self._parse_courses(text),
318
- 'assessments': self._parse_assessments(text),
319
- 'format': 'miami_dade_v2'
320
- }
 
 
 
321
 
322
- # Validate we got at least some data
323
- if not parsed_data['student_info'] or not parsed_data['course_history']:
324
- raise ValueError("Incomplete data extracted from transcript")
325
-
326
- return parsed_data
327
- except Exception as e:
328
- logger.error(f"Format parsing error: {str(e)}")
329
- return self._parse_alternative_format(text)
330
-
331
- def _parse_alternative_format(self, text: str) -> Dict:
332
- """Fallback parser for alternative formats"""
333
- try:
334
- parsed_data = {
335
- 'student_info': {},
336
- 'academic_summary': {},
337
- 'course_history': [],
338
- 'assessments': {},
339
- 'format': 'alternative'
340
- }
341
-
342
- # Try to extract basic student info
343
- name_match = re.search(r"NAME:\s*([A-Z]+,\s*[A-Z]+)", text)
344
- if name_match:
345
- parsed_data['student_info']['name'] = name_match.group(1).replace(',', ' ').strip()
346
-
347
- # Try to extract GPA
348
- gpa_match = re.search(r"GPA:\s*([\d.]+)", text)
349
- if gpa_match:
350
- parsed_data['academic_summary']['gpa'] = {
351
- 'district': float(gpa_match.group(1)),
352
- 'state': float(gpa_match.group(1)) # Assume same if not specified
353
- }
354
-
355
- return parsed_data
356
- except Exception as e:
357
- logger.error(f"Alternative parser failed: {str(e)}")
358
- raise ValueError("Could not parse transcript in any supported format")
359
 
360
  def _parse_student_info(self, text: str) -> Dict:
361
  """Extract student information with improved pattern matching"""
362
  match = self.patterns['student_info'].search(text)
363
  if not match:
364
- # Try alternative patterns
365
- match = re.search(r"STUDENT INFORMATION.*?NAME:\s*([^\n]+)", text, re.DOTALL)
366
- if not match:
367
- return {}
368
-
369
  return {
370
- 'name': match.group(1).replace(',', ' ').strip() if match else "Unknown",
371
  'grade': match.group(2) if match and len(match.groups()) > 1 else "Unknown",
372
  'student_id': match.group(3) if match and len(match.groups()) > 2 else "Unknown",
373
  'school': match.group(4).strip() if match and len(match.groups()) > 3 else "Unknown",
@@ -376,55 +366,35 @@ class EnhancedMiamiDadeTranscriptParser:
376
  }
377
 
378
  def _extract_birth_date(self, text: str) -> Optional[str]:
379
- """Extract birth date from transcript with multiple pattern attempts"""
380
- patterns = [
381
- r"BIRTH DATE:\s*(\d{2}/\d{2}/\d{4})",
382
- r"DOB:\s*(\d{2}/\d{2}/\d{4})",
383
- r"DATE OF BIRTH:\s*([^\n]+)"
384
- ]
385
-
386
- for pattern in patterns:
387
- birth_match = re.search(pattern, text)
388
- if birth_match:
389
- return birth_match.group(1)
390
  return None
391
 
392
  def _extract_ethnicity(self, text: str) -> Optional[str]:
393
- """Extract ethnicity information with multiple pattern attempts"""
394
- patterns = [
395
- r"ETHNICITY:\s*([^\n]+)",
396
- r"RACE/ETHNICITY:\s*([^\n]+)",
397
- r"DEMOGRAPHICS.*?ETHNICITY:\s*([^\n]+)"
398
- ]
399
-
400
- for pattern in patterns:
401
- eth_match = re.search(pattern, text, re.DOTALL)
402
- if eth_match:
403
- return eth_match.group(1).strip()
404
  return None
405
 
406
  def _parse_academic_summary(self, text: str) -> Dict:
407
- """Parse academic summary section with improved error handling"""
408
  summary = {
409
  'gpa': {'district': None, 'state': None},
410
  'credits': {},
411
  'class_rank': {'percentile': None, 'class_size': None}
412
  }
413
 
414
- # Try multiple GPA patterns
415
  gpa_match = self.patterns['gpa'].search(text)
416
- if not gpa_match:
417
- gpa_match = re.search(r"GPA.*?([\d.]+).*?([\d.]+)", text)
418
-
419
  if gpa_match:
420
  summary['gpa']['district'] = float(gpa_match.group(1))
421
  summary['gpa']['state'] = float(gpa_match.group(2)) if gpa_match.group(2) else summary['gpa']['district']
422
 
423
- # Try multiple credit patterns
424
  credits_matches = self.patterns['credits'].finditer(text)
425
- if not credits_matches:
426
- credits_matches = re.finditer(r"([A-Z ]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)", text)
427
-
428
  for match in credits_matches:
429
  subject = match.group(1).strip()
430
  summary['credits'][subject] = {
@@ -433,11 +403,8 @@ class EnhancedMiamiDadeTranscriptParser:
433
  'remaining': float(match.group(4)) if match.group(4) else None
434
  }
435
 
436
- # Try multiple class rank patterns
437
  rank_match = self.patterns['class_rank'].search(text)
438
- if not rank_match:
439
- rank_match = re.search(r"RANK.*?(\d+).*?(\d+)", text)
440
-
441
  if rank_match:
442
  summary['class_rank']['percentile'] = int(rank_match.group(1))
443
  summary['class_rank']['class_size'] = int(rank_match.group(2))
@@ -445,48 +412,40 @@ class EnhancedMiamiDadeTranscriptParser:
445
  return summary
446
 
447
  def _parse_courses(self, text: str) -> List[Dict]:
448
- """Parse course history section with improved pattern matching"""
449
  courses = []
450
 
451
  # Try primary pattern first
452
  for match in self.patterns['course'].finditer(text):
453
- courses.append(self._create_course_dict(match))
 
 
 
 
 
 
 
 
 
 
454
 
455
- # If no courses found, try alternative patterns
456
  if not courses:
457
- alt_pattern = re.compile(
458
- r"(\d{4}-\d{4})\s+(\w+)\s+([^\n]+?)\s+([A-Z]{2})\s+([A-Z])\s+([\d.]+)",
459
- re.DOTALL
460
- )
461
- for match in alt_pattern.finditer(text):
462
  courses.append({
463
  'term': match.group(1),
464
  'course_code': match.group(2),
465
  'course_title': match.group(3).strip(),
466
  'subject_area': match.group(4),
467
  'grade': match.group(5),
468
- 'credit_earned': float(match.group(6)),
469
- 'credit_attempted': float(match.group(6))
470
  })
471
 
472
  return courses
473
 
474
- def _create_course_dict(self, match) -> Dict:
475
- """Create standardized course dictionary from regex match"""
476
- return {
477
- 'term': match.group(1),
478
- 'course_code': match.group(2),
479
- 'course_title': match.group(3).strip(),
480
- 'subject_area': match.group(4),
481
- 'grade': match.group(5),
482
- 'flag': match.group(6),
483
- 'credit_status': match.group(7),
484
- 'credit_attempted': float(match.group(8)),
485
- 'credit_earned': float(match.group(9))
486
- }
487
-
488
  def _parse_assessments(self, text: str) -> Dict:
489
- """Parse assessment and requirement information with improved patterns"""
490
  assessments = {
491
  'ela_passed_date': None,
492
  'algebra_passed': False,
@@ -497,11 +456,7 @@ class EnhancedMiamiDadeTranscriptParser:
497
  }
498
  }
499
 
500
- # Try multiple assessment patterns
501
  matches = self.patterns['assessment'].finditer(text)
502
- if not matches:
503
- matches = re.finditer(r"(ENGLISH|ALGEBRA|BIOLOGY|SERVICE).*?(PASSED|MET|YES|NO|\d{2}/\d{4})", text)
504
-
505
  for match in matches:
506
  if match.group(1): # ELA date
507
  assessments['ela_passed_date'] = match.group(1)
@@ -517,8 +472,8 @@ class EnhancedMiamiDadeTranscriptParser:
517
 
518
  return assessments
519
 
520
- # Initialize the enhanced parser
521
- transcript_parser = EnhancedMiamiDadeTranscriptParser()
522
 
523
  class AcademicAnalyzer:
524
  def __init__(self):
@@ -547,7 +502,7 @@ class AcademicAnalyzer:
547
  if parsed_data.get('format') == 'progress_summary':
548
  weighted_gpa = float(parsed_data.get('student_info', {}).get('weighted_gpa', 0))
549
  unweighted_gpa = float(parsed_data.get('student_info', {}).get('unweighted_gpa', 0))
550
- elif parsed_data.get('format') == 'miami_dade_v2':
551
  weighted_gpa = float(parsed_data.get('academic_summary', {}).get('gpa', {}).get('district', 0))
552
  unweighted_gpa = float(parsed_data.get('academic_summary', {}).get('gpa', {}).get('state', 0))
553
  else: # Alternative format
@@ -681,7 +636,7 @@ class AcademicAnalyzer:
681
  'remaining': max(0, info.get('required', 0) - info.get('earned', 0))
682
  }
683
  for subject, info in credits.items()
684
- if info and info.get('required', 0) > info.get('earned', 0)
685
  ]
686
 
687
  current_grade = parsed_data.get('student_info', {}).get('grade', '')
@@ -2584,4 +2539,5 @@ def create_enhanced_interface():
2584
  app = create_enhanced_interface()
2585
 
2586
  if __name__ == "__main__":
2587
- app.launch(server_name="0.0.0.0", server_port=7860)
 
 
155
  "When learning a new skill, I prefer to:",
156
  "When studying, I like to:",
157
  "I prefer teachers who:",
158
+ "When solving problems, I:",
159
+ "When working on a group project, I:",
160
+ "My ideal study environment is:",
161
+ "When preparing for a test, I:",
162
+ "When reading instructions, I:",
163
+ "When explaining something to someone, I:",
164
+ "When taking notes in class, I:",
165
+ "When using a new device or app, I:",
166
+ "When remembering names, I:",
167
+ "When choosing a book to read, I:",
168
+ "When giving a presentation, I:",
169
+ "When organizing my work, I:",
170
+ "When relaxing, I enjoy:"
171
  ]
172
 
173
  self.options = [
 
178
  ["Watch demonstrations", "Listen to instructions", "Read instructions", "Jump in and try it"],
179
  ["Use highlighters and diagrams", "Discuss with others", "Read and take notes", "Move around or use objects"],
180
  ["Use visual aids", "Give interesting lectures", "Provide reading materials", "Include hands-on activities"],
181
+ ["Draw pictures or diagrams", "Talk through options", "Make lists", "Try different solutions physically"],
182
+ ["Create visual plans", "Discuss ideas verbally", "Write detailed plans", "Take on hands-on tasks"],
183
+ ["Somewhere quiet with good lighting", "Somewhere I can discuss ideas", "A library with lots of resources", "Somewhere I can move around"],
184
+ ["Create visual study aids", "Recite information aloud", "Write summaries", "Create physical models"],
185
+ ["Look at diagrams first", "Have someone explain them", "Read them carefully", "Try to follow them as I go"],
186
+ ["Draw diagrams or pictures", "Explain verbally", "Write detailed explanations", "Show by doing"],
187
+ ["Draw diagrams and symbols", "Record lectures to listen later", "Write detailed notes", "Underline and highlight"],
188
+ ["Look at the screen layout", "Listen to audio instructions", "Read the manual", "Start clicking buttons"],
189
+ ["Remember faces better than names", "Remember names when I hear them", "Remember names when I see them written", "Remember people by activities we did"],
190
+ ["Choose books with pictures/diagrams", "Choose audiobooks", "Choose text-heavy books", "Choose interactive books"],
191
+ ["Use lots of visual aids", "Focus on my verbal delivery", "Provide handouts", "Use props or demonstrations"],
192
+ ["Use color-coding systems", "Talk through my plan", "Make detailed lists", "Physically arrange materials"],
193
+ ["Watching videos or art", "Listening to music/podcasts", "Reading", "Doing physical activities"]
194
  ]
195
 
196
  self.learning_styles = {
 
274
  # Initialize learning style quiz
275
  learning_style_quiz = LearningStyleQuiz()
276
 
277
+ class MiamiDadeTranscriptParser:
278
  def __init__(self):
279
  self.patterns = {
280
  'student_info': re.compile(
281
+ r"LEGAL NAME:\s*([^\n]+?)\s*MAILING\s+ADDRESS:.*?"
282
  r"GRADE LEVEL:\s*(\d+).*?"
283
  r"FL STUDENT ID:\s*(\w+).*?"
284
+ r"CURRENT SCHOOL:\s*(\d+\s+[^\n]+?)\s*\(",
285
  re.DOTALL
286
  ),
287
  'gpa': re.compile(
 
293
  re.DOTALL
294
  ),
295
  'course': re.compile(
296
+ r"(\d)\s+(\w+)\s+([^\n]+?)\s+([A-Z]{2})\s+([A-Z])\s+([A-Z])\s+([A-Z])\s+([\d.]+)\s+([\d.]+)",
297
  re.DOTALL
298
  ),
299
  'assessment': re.compile(
 
306
  'class_rank': re.compile(
307
  r"\*\s+PERCENTILE:\s*(\d+)\s*\*\s*TOTAL NUMBER IN CLASS:\s*(\d+)",
308
  re.DOTALL
309
+ ),
310
+ 'course_alt': re.compile(
311
+ r"(\d)\s+(\w+)\s+([^\n]+?)\s+([A-Z]{2})\s+([A-Z])\s+([A-Z])\s+([\d.]+)\s+([\d.]+)",
312
+ re.DOTALL
313
  )
314
  }
315
 
316
  def parse_transcript(self, file_path: str) -> Dict:
317
+ """Parse Miami-Dade transcript PDF with multiple extraction methods"""
318
  try:
319
  # First try pdfplumber
320
+ text = ""
321
  with pdfplumber.open(file_path) as pdf:
322
+ for page in pdf.pages:
323
+ text += page.extract_text() + "\n"
324
+
325
  # Fallback to PyMuPDF if text extraction is poor
326
+ if len(text) < 500:
327
  doc = fitz.open(file_path)
328
  text = ""
329
  for page in doc:
330
  text += page.get_text()
331
 
332
+ return self._parse_miami_dade_format(text)
 
 
 
 
333
  except Exception as e:
334
  logger.error(f"Error parsing transcript: {str(e)}")
335
  raise ValueError(f"Error processing transcript: {str(e)}")
336
 
337
+ def _parse_miami_dade_format(self, text: str) -> Dict:
338
+ """Parse the specific Miami-Dade transcript format"""
339
+ parsed_data = {
340
+ 'student_info': self._parse_student_info(text),
341
+ 'academic_summary': self._parse_academic_summary(text),
342
+ 'course_history': self._parse_courses(text),
343
+ 'assessments': self._parse_assessments(text),
344
+ 'format': 'miami_dade_v3'
345
+ }
346
+
347
+ # Validate we got at least some data
348
+ if not parsed_data['student_info'] or not parsed_data['course_history']:
349
+ raise ValueError("Incomplete data extracted from transcript")
350
 
351
+ return parsed_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
 
353
  def _parse_student_info(self, text: str) -> Dict:
354
  """Extract student information with improved pattern matching"""
355
  match = self.patterns['student_info'].search(text)
356
  if not match:
357
+ return {}
358
+
 
 
 
359
  return {
360
+ 'name': match.group(1).strip(),
361
  'grade': match.group(2) if match and len(match.groups()) > 1 else "Unknown",
362
  'student_id': match.group(3) if match and len(match.groups()) > 2 else "Unknown",
363
  'school': match.group(4).strip() if match and len(match.groups()) > 3 else "Unknown",
 
366
  }
367
 
368
  def _extract_birth_date(self, text: str) -> Optional[str]:
369
+ """Extract birth date from transcript"""
370
+ birth_match = re.search(r"BIRTH DATE:\s*(\d{2}/\d{2}/\d{4})", text)
371
+ if birth_match:
372
+ return birth_match.group(1)
 
 
 
 
 
 
 
373
  return None
374
 
375
  def _extract_ethnicity(self, text: str) -> Optional[str]:
376
+ """Extract ethnicity information"""
377
+ eth_match = re.search(r"ETHNICITY:\s*([^\n]+)", text)
378
+ if eth_match:
379
+ return eth_match.group(1).strip()
 
 
 
 
 
 
 
380
  return None
381
 
382
  def _parse_academic_summary(self, text: str) -> Dict:
383
+ """Parse academic summary section"""
384
  summary = {
385
  'gpa': {'district': None, 'state': None},
386
  'credits': {},
387
  'class_rank': {'percentile': None, 'class_size': None}
388
  }
389
 
390
+ # GPA
391
  gpa_match = self.patterns['gpa'].search(text)
 
 
 
392
  if gpa_match:
393
  summary['gpa']['district'] = float(gpa_match.group(1))
394
  summary['gpa']['state'] = float(gpa_match.group(2)) if gpa_match.group(2) else summary['gpa']['district']
395
 
396
+ # Credits
397
  credits_matches = self.patterns['credits'].finditer(text)
 
 
 
398
  for match in credits_matches:
399
  subject = match.group(1).strip()
400
  summary['credits'][subject] = {
 
403
  'remaining': float(match.group(4)) if match.group(4) else None
404
  }
405
 
406
+ # Class Rank
407
  rank_match = self.patterns['class_rank'].search(text)
 
 
 
408
  if rank_match:
409
  summary['class_rank']['percentile'] = int(rank_match.group(1))
410
  summary['class_rank']['class_size'] = int(rank_match.group(2))
 
412
  return summary
413
 
414
  def _parse_courses(self, text: str) -> List[Dict]:
415
+ """Parse course history section"""
416
  courses = []
417
 
418
  # Try primary pattern first
419
  for match in self.patterns['course'].finditer(text):
420
+ courses.append({
421
+ 'term': match.group(1),
422
+ 'course_code': match.group(2),
423
+ 'course_title': match.group(3).strip(),
424
+ 'subject_area': match.group(4),
425
+ 'grade': match.group(5),
426
+ 'flag': match.group(6),
427
+ 'credit_status': match.group(7),
428
+ 'credit_attempted': float(match.group(8)),
429
+ 'credit_earned': float(match.group(9))
430
+ })
431
 
432
+ # If no courses found, try alternative pattern
433
  if not courses:
434
+ for match in self.patterns['course_alt'].finditer(text):
 
 
 
 
435
  courses.append({
436
  'term': match.group(1),
437
  'course_code': match.group(2),
438
  'course_title': match.group(3).strip(),
439
  'subject_area': match.group(4),
440
  'grade': match.group(5),
441
+ 'credit_attempted': float(match.group(6)),
442
+ 'credit_earned': float(match.group(7))
443
  })
444
 
445
  return courses
446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  def _parse_assessments(self, text: str) -> Dict:
448
+ """Parse assessment and requirement information"""
449
  assessments = {
450
  'ela_passed_date': None,
451
  'algebra_passed': False,
 
456
  }
457
  }
458
 
 
459
  matches = self.patterns['assessment'].finditer(text)
 
 
 
460
  for match in matches:
461
  if match.group(1): # ELA date
462
  assessments['ela_passed_date'] = match.group(1)
 
472
 
473
  return assessments
474
 
475
+ # Initialize the parser
476
+ transcript_parser = MiamiDadeTranscriptParser()
477
 
478
  class AcademicAnalyzer:
479
  def __init__(self):
 
502
  if parsed_data.get('format') == 'progress_summary':
503
  weighted_gpa = float(parsed_data.get('student_info', {}).get('weighted_gpa', 0))
504
  unweighted_gpa = float(parsed_data.get('student_info', {}).get('unweighted_gpa', 0))
505
+ elif parsed_data.get('format') == 'miami_dade_v3':
506
  weighted_gpa = float(parsed_data.get('academic_summary', {}).get('gpa', {}).get('district', 0))
507
  unweighted_gpa = float(parsed_data.get('academic_summary', {}).get('gpa', {}).get('state', 0))
508
  else: # Alternative format
 
636
  'remaining': max(0, info.get('required', 0) - info.get('earned', 0))
637
  }
638
  for subject, info in credits.items()
639
+ if info and info.get('required', 0) > info.get('earned', 0))
640
  ]
641
 
642
  current_grade = parsed_data.get('student_info', {}).get('grade', '')
 
2539
  app = create_enhanced_interface()
2540
 
2541
  if __name__ == "__main__":
2542
+ app.launch(server_name="0.0.0.0", server_port=7860)
2543
+