Dannyar608 committed on
Commit
4ed126e
·
verified ·
1 Parent(s): 7c4445e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -188
app.py CHANGED
@@ -23,6 +23,7 @@ import asyncio
23
  from functools import lru_cache
24
  import hashlib
25
  from concurrent.futures import ThreadPoolExecutor
 
26
 
27
  # ========== CONFIGURATION ==========
28
  PROFILES_DIR = "student_profiles"
@@ -196,16 +197,20 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
196
 
197
  try:
198
  if file_ext == '.pdf':
199
- # First try PyMuPDF for better text extraction
200
  try:
 
 
 
 
 
 
201
  doc = fitz.open(file_path)
202
  for page in doc:
203
  text += page.get_text("text") + '\n'
204
  if not text.strip():
205
- raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
206
- except Exception as e:
207
- logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
208
- text = extract_text_from_pdf_with_ocr(file_path)
209
 
210
  elif file_ext in ['.png', '.jpg', '.jpeg']:
211
  text = extract_text_with_ocr(file_path)
@@ -293,58 +298,20 @@ class TranscriptParser:
293
  self.current_courses = []
294
  self.course_history = []
295
  self.graduation_status = {}
296
- self.supported_formats = {
297
- 'miami_dade': self.parse_miami_dade,
298
- 'standard': self.parse_standard,
299
- 'homeschool': self.parse_homeschool
300
- }
301
 
302
  def parse_transcript(self, text: str) -> Dict:
303
- """Enhanced parsing method with format detection"""
304
  try:
305
- # First normalize the text (replace multiple spaces, normalize line breaks)
306
- text = re.sub(r'\s+', ' ', text)
307
-
308
- # Detect transcript format
309
- format_type = self.detect_format(text)
310
-
311
- # Parse based on detected format
312
- if format_type in self.supported_formats:
313
- return self.supported_formats[format_type](text)
314
- else:
315
- # Fallback to standard parsing
316
- return self.parse_standard(text)
317
-
318
- except Exception as e:
319
- logging.error(f"Error parsing transcript: {str(e)}")
320
- raise gr.Error(f"Error parsing transcript: {str(e)}\n\nThis may be due to an unsupported transcript format. Please ensure you're uploading an official transcript or contact support.")
321
-
322
- def detect_format(self, text: str) -> str:
323
- """Detect the transcript format"""
324
- # Check for Miami-Dade specific patterns
325
- if re.search(r'MIAMI-DADE (COUNTY|COUNTRY) PUBLIC SCHOOLS', text, re.IGNORECASE):
326
- return 'miami_dade'
327
- # Check for homeschool patterns
328
- elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
329
- return 'homeschool'
330
- # Default to standard format
331
- return 'standard'
332
-
333
- def parse_miami_dade(self, text: str) -> Dict:
334
- """Parse Miami-Dade formatted transcripts with enhanced error handling"""
335
- try:
336
- # Extract student info with more robust patterns
337
  student_match = re.search(
338
- r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+).*?Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
339
- r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
340
- r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
341
  text, re.DOTALL
342
  )
343
-
344
  if student_match:
345
  self.student_data = {
346
- "id": student_match.group(1),
347
- "name": student_match.group(2).replace(",", ", "),
348
  "current_grade": student_match.group(3),
349
  "graduation_year": student_match.group(4),
350
  "unweighted_gpa": float(student_match.group(5)),
@@ -352,59 +319,51 @@ class TranscriptParser:
352
  "total_credits": float(student_match.group(7)),
353
  "community_service_hours": int(student_match.group(8))
354
  }
355
-
356
- # Extract requirements with better table parsing
 
357
  req_section = re.search(
358
- r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)Total\s*[\d.]+\s*[\d.]+\s*[\d.]+\s*[\d.]+%",
359
  text, re.DOTALL
360
  )
361
-
362
  if req_section:
363
- req_matches = re.finditer(
364
- r"([A-Z]-[\w\s\(\)&]+)\s*([^\n]+?)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%",
365
- req_section.group(1)
366
- )
367
-
368
- for match in req_matches:
369
- req_code = match.group(1).strip()
370
- self.requirements[req_code] = {
371
- "description": match.group(2).strip(),
372
- "required": float(match.group(3)),
373
- "waived": float(match.group(4)),
374
- "completed": float(match.group(5)),
375
- "status": f"{match.group(6)}%"
376
- }
377
-
378
- # Extract course history with more flexible parsing
379
- course_section = re.search(
380
- r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description(.*?)(?=Legend for Incl:|$)",
381
- text, re.DOTALL
382
  )
383
-
384
- if course_section:
385
- course_matches = re.finditer(
386
- r"([A-Z]-[\w\s\(\)&-]+)\s*(\d{4}-\d{4}|\d{1,2})\s*(\d{2})\s*([A-Z0-9]+)\s*([^\n]+?)\s*([A-Z0-9]+)\s*([A-Z0-9]+)\s*([A-Z])\s*([A-Z])\s*([\d.]+|inProgress)",
387
- course_section.group(1)
388
- )
389
-
390
- for match in course_matches:
391
- self.course_history.append({
392
- "requirement_category": match.group(1).strip(),
393
- "school_year": match.group(2),
394
- "grade_level": match.group(3),
395
- "course_code": match.group(4),
396
- "description": match.group(5).strip(),
397
- "term": match.group(6),
398
- "district_number": match.group(7),
399
- "grade": match.group(8),
400
- "inclusion_status": match.group(9),
401
- "credits": match.group(10)
402
- })
403
-
404
- # Identify current courses
405
  self._extract_current_courses()
406
  self._calculate_completion()
407
-
408
  return {
409
  "student_info": self.student_data,
410
  "requirements": self.requirements,
@@ -413,85 +372,13 @@ class TranscriptParser:
413
  "graduation_status": self.graduation_status,
414
  "format": "miami_dade"
415
  }
416
-
417
- except Exception as e:
418
- logging.error(f"Error parsing Miami-Dade transcript: {str(e)}")
419
- raise ValueError(f"Couldn't parse transcript. Please ensure it's a valid Miami-Dade transcript. Error: {str(e)}")
420
 
421
- def parse_standard(self, text: str) -> Dict:
422
- """Parse standard formatted transcripts"""
423
- # Extract student info
424
- student_match = re.search(r"Student:\s*([^\n]+)", text, re.IGNORECASE)
425
- if student_match:
426
- self.student_data["name"] = student_match.group(1).strip()
427
-
428
- # Extract courses - looking for a table-like structure
429
- course_pattern = r"(?P<year>\d{4}-\d{4}|\d{1,2})\s+(?P<subject>\w+)\s+(?P<code>\w+)\s+(?P<title>[^\n]+)\s+(?P<grade>[A-F][+-]?)\s+(?P<credit>\d\.\d)"
430
- course_matches = re.finditer(course_pattern, text)
431
-
432
- for match in course_matches:
433
- self.course_history.append({
434
- "school_year": match.group("year"),
435
- "subject": match.group("subject"),
436
- "course_code": match.group("code"),
437
- "description": match.group("title").strip(),
438
- "grade": match.group("grade"),
439
- "credits": match.group("credit")
440
- })
441
-
442
- # Extract GPA info
443
- gpa_pattern = r"GPA\s*([\d.]+)\s*/\s*([\d.]+)"
444
- gpa_match = re.search(gpa_pattern, text)
445
- if gpa_match:
446
- self.student_data.update({
447
- "unweighted_gpa": float(gpa_match.group(1)),
448
- "weighted_gpa": float(gpa_match.group(2))
449
- })
450
-
451
- return {
452
- "student_info": self.student_data,
453
- "course_history": self.course_history,
454
- "format": "standard"
455
- }
456
-
457
- def parse_homeschool(self, text: str) -> Dict:
458
- """Parse homeschool formatted transcripts"""
459
- # Extract student info
460
- name_match = re.search(r"Student:\s*([^\n]+)", text, re.IGNORECASE)
461
- if name_match:
462
- self.student_data["name"] = name_match.group(1).strip()
463
-
464
- # Extract homeschool-specific info
465
- parent_match = re.search(r"Parent:\s*([^\n]+)", text, re.IGNORECASE)
466
- if parent_match:
467
- self.student_data["parent"] = parent_match.group(1).strip()
468
-
469
- # Extract courses - homeschool format often has simpler tables
470
- course_pattern = r"(?P<subject>\w+)\s+(?P<title>[^\n]+?)\s+(?P<date>\w+-\d{4})\s+(?P<grade>[A-F][+-]?)\s+(?P<credit>\d\.\d)"
471
- course_matches = re.finditer(course_pattern, text)
472
-
473
- for match in course_matches:
474
- self.course_history.append({
475
- "subject": match.group("subject"),
476
- "description": match.group("title").strip(),
477
- "completion_date": match.group("date"),
478
- "grade": match.group("grade"),
479
- "credits": match.group("credit")
480
- })
481
-
482
- # Extract GPA info
483
- gpa_match = re.search(r"Cumulative GPA:\s*([\d.]+)", text, re.IGNORECASE)
484
- if gpa_match:
485
- self.student_data["gpa"] = float(gpa_match.group(1))
486
-
487
- return {
488
- "student_info": self.student_data,
489
- "course_history": self.course_history,
490
- "format": "homeschool"
491
- }
492
 
493
  def _extract_current_courses(self):
494
- """Identify courses currently in progress"""
495
  self.current_courses = [
496
  {
497
  "course": c["description"],
@@ -501,32 +388,21 @@ class TranscriptParser:
501
  "credits": c["credits"],
502
  "grade_level": c["grade_level"]
503
  }
504
- for c in self.course_history
505
- if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
506
  ]
507
-
508
  def _calculate_completion(self):
509
- """Calculate overall completion status with more detailed info"""
510
  total_required = sum(req["required"] for req in self.requirements.values())
511
  total_completed = sum(req["completed"] for req in self.requirements.values())
512
-
513
  self.graduation_status.update({
514
  "total_required_credits": total_required,
515
  "total_completed_credits": total_completed,
516
- "percent_complete": round((total_completed / total_required) * 100, 1),
517
  "remaining_credits": total_required - total_completed,
518
- "on_track": (total_completed / total_required) >= 0.75 # 75% completion considered on track
519
  })
520
-
521
- def to_json(self) -> str:
522
- """Export parsed data as JSON"""
523
- return json.dumps({
524
- "student_info": self.student_data,
525
- "requirements": self.requirements,
526
- "current_courses": self.current_courses,
527
- "course_history": self.course_history,
528
- "graduation_status": self.graduation_status
529
- }, indent=2)
530
 
531
  def format_transcript_output(data: Dict) -> str:
532
  """Enhanced formatting for transcript output with format awareness"""
 
23
  from functools import lru_cache
24
  import hashlib
25
  from concurrent.futures import ThreadPoolExecutor
26
+ import pdfplumber
27
 
28
  # ========== CONFIGURATION ==========
29
  PROFILES_DIR = "student_profiles"
 
197
 
198
  try:
199
  if file_ext == '.pdf':
200
+ # First try pdfplumber for better text extraction
201
  try:
202
+ with pdfplumber.open(file_path) as pdf:
203
+ text = "\n".join([page.extract_text() for page in pdf.pages])
204
+ if not text.strip():
205
+ raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
206
+ except Exception as e:
207
+ logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
208
  doc = fitz.open(file_path)
209
  for page in doc:
210
  text += page.get_text("text") + '\n'
211
  if not text.strip():
212
+ raise ValueError("PyMuPDF returned empty text - trying OCR fallback...")
213
+ text = extract_text_from_pdf_with_ocr(file_path)
 
 
214
 
215
  elif file_ext in ['.png', '.jpg', '.jpeg']:
216
  text = extract_text_with_ocr(file_path)
 
298
  self.current_courses = []
299
  self.course_history = []
300
  self.graduation_status = {}
 
 
 
 
 
301
 
302
  def parse_transcript(self, text: str) -> Dict:
303
+ """Parse Miami-Dade formatted transcripts with updated regex patterns."""
304
  try:
305
+ # Extract student info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  student_match = re.search(
307
+ r"(\d{7})\s*-\s*([A-Z\s,]+).*?Current Grade:\s*(\d+).*?YOG\s*(\d{4}).*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+).*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
 
 
308
  text, re.DOTALL
309
  )
310
+
311
  if student_match:
312
  self.student_data = {
313
+ "id": student_match.group(1).strip(),
314
+ "name": student_match.group(2).replace(",", ", ").strip(),
315
  "current_grade": student_match.group(3),
316
  "graduation_year": student_match.group(4),
317
  "unweighted_gpa": float(student_match.group(5)),
 
319
  "total_credits": float(student_match.group(7)),
320
  "community_service_hours": int(student_match.group(8))
321
  }
322
+
323
+ # Extract requirements
324
+ self.requirements = {}
325
  req_section = re.search(
326
+ r"Code Description Required Waived Completed Status(.*?)Total\s+\d+\.\d+\s+\d+\.\d+\s+\d+\.\d+\s+\d+%",
327
  text, re.DOTALL
328
  )
 
329
  if req_section:
330
+ req_lines = req_section.group(1).strip().splitlines()
331
+ for line in req_lines:
332
+ req_match = re.match(r"([A-Z]-[^\s]+)\s+(.+?)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+)%", line.strip())
333
+ if req_match:
334
+ code = req_match.group(1).strip()
335
+ self.requirements[code] = {
336
+ "description": req_match.group(2).strip(),
337
+ "required": float(req_match.group(3)),
338
+ "waived": float(req_match.group(4)),
339
+ "completed": float(req_match.group(5)),
340
+ "status": f"{req_match.group(6)}%"
341
+ }
342
+
343
+ # Extract course history (simplified for now)
344
+ self.course_history = []
345
+ course_pattern = re.compile(
346
+ r"([A-Z]-[^\s]+)\s+(\d{4}-\d{4}|\d{4})\s+(\d{2})\s+([A-Z0-9]+)\s+(.+?)\s+([AT12]+)\s+([A-Z0-9]+)?\s+([A-Z])?\s+([A-Z])?\s+(inProgress|\d+\.\d+)",
347
+ re.DOTALL
 
348
  )
349
+ for match in course_pattern.finditer(text):
350
+ self.course_history.append({
351
+ "requirement_category": match.group(1),
352
+ "school_year": match.group(2),
353
+ "grade_level": match.group(3),
354
+ "course_code": match.group(4),
355
+ "description": match.group(5).strip(),
356
+ "term": match.group(6),
357
+ "district_number": match.group(7),
358
+ "grade": match.group(8),
359
+ "inclusion_status": match.group(9),
360
+ "credits": match.group(10)
361
+ })
362
+
363
+ # Extract in-progress
 
 
 
 
 
 
 
364
  self._extract_current_courses()
365
  self._calculate_completion()
366
+
367
  return {
368
  "student_info": self.student_data,
369
  "requirements": self.requirements,
 
372
  "graduation_status": self.graduation_status,
373
  "format": "miami_dade"
374
  }
 
 
 
 
375
 
376
+ except Exception as e:
377
+ logging.error(f"Error parsing transcript: {str(e)}")
378
+ raise ValueError(f"Couldn't parse transcript: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
 
380
  def _extract_current_courses(self):
381
+ """Identify in-progress courses."""
382
  self.current_courses = [
383
  {
384
  "course": c["description"],
 
388
  "credits": c["credits"],
389
  "grade_level": c["grade_level"]
390
  }
391
+ for c in self.course_history if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
 
392
  ]
393
+
394
  def _calculate_completion(self):
395
+ """Compute graduation readiness."""
396
  total_required = sum(req["required"] for req in self.requirements.values())
397
  total_completed = sum(req["completed"] for req in self.requirements.values())
398
+
399
  self.graduation_status.update({
400
  "total_required_credits": total_required,
401
  "total_completed_credits": total_completed,
402
+ "percent_complete": round((total_completed / total_required) * 100, 1) if total_required > 0 else 0,
403
  "remaining_credits": total_required - total_completed,
404
+ "on_track": (total_completed / total_required) >= 0.75 if total_required > 0 else False
405
  })
 
 
 
 
 
 
 
 
 
 
406
 
407
  def format_transcript_output(data: Dict) -> str:
408
  """Enhanced formatting for transcript output with format awareness"""