Dannyar608 committed on
Commit
929de97
·
verified ·
1 Parent(s): c3e05e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -17
app.py CHANGED
@@ -282,7 +282,7 @@ def remove_sensitive_info(text: str) -> str:
282
  # Remove student IDs (assuming 6-9 digit numbers)
283
  text = re.sub(r'\b\d{6,9}\b', '[ID]', text)
284
  # Remove email addresses
285
- text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
286
  return text
287
 
288
  # ========== TRANSCRIPT PARSING ==========
@@ -322,7 +322,7 @@ class TranscriptParser:
322
  def detect_format(self, text: str) -> str:
323
  """Detect the transcript format"""
324
  # Check for Miami-Dade specific patterns
325
- if re.search(r'MIAMI-DADE SCHOOL DISTRICT', text, re.IGNORECASE):
326
  return 'miami_dade'
327
  # Check for homeschool patterns
328
  elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
@@ -334,35 +334,35 @@ class TranscriptParser:
334
  """Parse Miami-Dade formatted transcripts with enhanced error handling"""
335
  try:
336
  # Extract student info with more robust patterns
337
- student_info = re.search(
338
  r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+).*?Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
339
  r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
340
  r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
341
  text, re.DOTALL
342
  )
343
 
344
- if student_info:
345
  self.student_data = {
346
- "id": student_info.group(1),
347
- "name": student_info.group(2).replace(",", ", "),
348
- "current_grade": student_info.group(3),
349
- "graduation_year": student_info.group(4),
350
- "unweighted_gpa": float(student_info.group(5)),
351
- "weighted_gpa": float(student_info.group(6)),
352
- "total_credits": float(student_info.group(7)),
353
- "community_service_hours": int(student_info.group(8))
354
  }
355
 
356
  # Extract requirements with better table parsing
357
- req_table = re.search(
358
  r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)Total\s*[\d.]+\s*[\d.]+\s*[\d.]+\s*[\d.]+%",
359
  text, re.DOTALL
360
  )
361
 
362
- if req_table:
363
  req_matches = re.finditer(
364
  r"([A-Z]-[\w\s\(\)&]+)\s*([^\n]+?)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%",
365
- req_table.group(1)
366
  )
367
 
368
  for match in req_matches:
@@ -376,7 +376,10 @@ class TranscriptParser:
376
  }
377
 
378
  # Extract course history with more flexible parsing
379
- course_section = re.search(r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description(.*?)(?=Legend for Incl:|$)", text, re.DOTALL)
 
 
 
380
 
381
  if course_section:
382
  course_matches = re.finditer(
@@ -1893,5 +1896,4 @@ app = create_interface()
1893
 
1894
  if __name__ == "__main__":
1895
  app.launch()
1896
-
1897
 
 
282
  # Remove student IDs (assuming 6-9 digit numbers)
283
  text = re.sub(r'\b\d{6,9}\b', '[ID]', text)
284
  # Remove email addresses
285
+ text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
286
  return text
287
 
288
  # ========== TRANSCRIPT PARSING ==========
 
322
  def detect_format(self, text: str) -> str:
323
  """Detect the transcript format"""
324
  # Check for Miami-Dade specific patterns
325
+ if re.search(r'MIAMI-DADE (COUNTY|COUNTRY) PUBLIC SCHOOLS', text, re.IGNORECASE):
326
  return 'miami_dade'
327
  # Check for homeschool patterns
328
  elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
 
334
  """Parse Miami-Dade formatted transcripts with enhanced error handling"""
335
  try:
336
  # Extract student info with more robust patterns
337
+ student_match = re.search(
338
  r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+).*?Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
339
  r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
340
  r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
341
  text, re.DOTALL
342
  )
343
 
344
+ if student_match:
345
  self.student_data = {
346
+ "id": student_match.group(1),
347
+ "name": student_match.group(2).replace(",", ", "),
348
+ "current_grade": student_match.group(3),
349
+ "graduation_year": student_match.group(4),
350
+ "unweighted_gpa": float(student_match.group(5)),
351
+ "weighted_gpa": float(student_match.group(6)),
352
+ "total_credits": float(student_match.group(7)),
353
+ "community_service_hours": int(student_match.group(8))
354
  }
355
 
356
  # Extract requirements with better table parsing
357
+ req_section = re.search(
358
  r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)Total\s*[\d.]+\s*[\d.]+\s*[\d.]+\s*[\d.]+%",
359
  text, re.DOTALL
360
  )
361
 
362
+ if req_section:
363
  req_matches = re.finditer(
364
  r"([A-Z]-[\w\s\(\)&]+)\s*([^\n]+?)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%",
365
+ req_section.group(1)
366
  )
367
 
368
  for match in req_matches:
 
376
  }
377
 
378
  # Extract course history with more flexible parsing
379
+ course_section = re.search(
380
+ r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description(.*?)(?=Legend for Incl:|$)",
381
+ text, re.DOTALL
382
+ )
383
 
384
  if course_section:
385
  course_matches = re.finditer(
 
1896
 
1897
  if __name__ == "__main__":
1898
  app.launch()
 
1899