Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -282,7 +282,7 @@ def remove_sensitive_info(text: str) -> str:
|
|
282 |
# Remove student IDs (assuming 6-9 digit numbers)
|
283 |
text = re.sub(r'\b\d{6,9}\b', '[ID]', text)
|
284 |
# Remove email addresses
|
285 |
-
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-
|
286 |
return text
|
287 |
|
288 |
# ========== TRANSCRIPT PARSING ==========
|
@@ -322,7 +322,7 @@ class TranscriptParser:
|
|
322 |
def detect_format(self, text: str) -> str:
|
323 |
"""Detect the transcript format"""
|
324 |
# Check for Miami-Dade specific patterns
|
325 |
-
if re.search(r'MIAMI-DADE
|
326 |
return 'miami_dade'
|
327 |
# Check for homeschool patterns
|
328 |
elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
|
@@ -334,35 +334,35 @@ class TranscriptParser:
|
|
334 |
"""Parse Miami-Dade formatted transcripts with enhanced error handling"""
|
335 |
try:
|
336 |
# Extract student info with more robust patterns
|
337 |
-
|
338 |
r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+).*?Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
|
339 |
r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
|
340 |
r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
|
341 |
text, re.DOTALL
|
342 |
)
|
343 |
|
344 |
-
if
|
345 |
self.student_data = {
|
346 |
-
"id":
|
347 |
-
"name":
|
348 |
-
"current_grade":
|
349 |
-
"graduation_year":
|
350 |
-
"unweighted_gpa": float(
|
351 |
-
"weighted_gpa": float(
|
352 |
-
"total_credits": float(
|
353 |
-
"community_service_hours": int(
|
354 |
}
|
355 |
|
356 |
# Extract requirements with better table parsing
|
357 |
-
|
358 |
r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)Total\s*[\d.]+\s*[\d.]+\s*[\d.]+\s*[\d.]+%",
|
359 |
text, re.DOTALL
|
360 |
)
|
361 |
|
362 |
-
if
|
363 |
req_matches = re.finditer(
|
364 |
r"([A-Z]-[\w\s\(\)&]+)\s*([^\n]+?)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%",
|
365 |
-
|
366 |
)
|
367 |
|
368 |
for match in req_matches:
|
@@ -376,7 +376,10 @@ class TranscriptParser:
|
|
376 |
}
|
377 |
|
378 |
# Extract course history with more flexible parsing
|
379 |
-
course_section = re.search(
|
|
|
|
|
|
|
380 |
|
381 |
if course_section:
|
382 |
course_matches = re.finditer(
|
@@ -1893,5 +1896,4 @@ app = create_interface()
|
|
1893 |
|
1894 |
if __name__ == "__main__":
|
1895 |
app.launch()
|
1896 |
-
|
1897 |
|
|
|
282 |
# Remove student IDs (assuming 6-9 digit numbers)
|
283 |
text = re.sub(r'\b\d{6,9}\b', '[ID]', text)
|
284 |
# Remove email addresses
|
285 |
+
text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
|
286 |
return text
|
287 |
|
288 |
# ========== TRANSCRIPT PARSING ==========
|
|
|
322 |
def detect_format(self, text: str) -> str:
|
323 |
"""Detect the transcript format"""
|
324 |
# Check for Miami-Dade specific patterns
|
325 |
+
if re.search(r'MIAMI-DADE (COUNTY|COUNTRY) PUBLIC SCHOOLS', text, re.IGNORECASE):
|
326 |
return 'miami_dade'
|
327 |
# Check for homeschool patterns
|
328 |
elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
|
|
|
334 |
"""Parse Miami-Dade formatted transcripts with enhanced error handling"""
|
335 |
try:
|
336 |
# Extract student info with more robust patterns
|
337 |
+
student_match = re.search(
|
338 |
r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+).*?Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
|
339 |
r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
|
340 |
r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
|
341 |
text, re.DOTALL
|
342 |
)
|
343 |
|
344 |
+
if student_match:
|
345 |
self.student_data = {
|
346 |
+
"id": student_match.group(1),
|
347 |
+
"name": student_match.group(2).replace(",", ", "),
|
348 |
+
"current_grade": student_match.group(3),
|
349 |
+
"graduation_year": student_match.group(4),
|
350 |
+
"unweighted_gpa": float(student_match.group(5)),
|
351 |
+
"weighted_gpa": float(student_match.group(6)),
|
352 |
+
"total_credits": float(student_match.group(7)),
|
353 |
+
"community_service_hours": int(student_match.group(8))
|
354 |
}
|
355 |
|
356 |
# Extract requirements with better table parsing
|
357 |
+
req_section = re.search(
|
358 |
r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)Total\s*[\d.]+\s*[\d.]+\s*[\d.]+\s*[\d.]+%",
|
359 |
text, re.DOTALL
|
360 |
)
|
361 |
|
362 |
+
if req_section:
|
363 |
req_matches = re.finditer(
|
364 |
r"([A-Z]-[\w\s\(\)&]+)\s*([^\n]+?)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%",
|
365 |
+
req_section.group(1)
|
366 |
)
|
367 |
|
368 |
for match in req_matches:
|
|
|
376 |
}
|
377 |
|
378 |
# Extract course history with more flexible parsing
|
379 |
+
course_section = re.search(
|
380 |
+
r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description(.*?)(?=Legend for Incl:|$)",
|
381 |
+
text, re.DOTALL
|
382 |
+
)
|
383 |
|
384 |
if course_section:
|
385 |
course_matches = re.finditer(
|
|
|
1896 |
|
1897 |
if __name__ == "__main__":
|
1898 |
app.launch()
|
|
|
1899 |
|