Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -23,6 +23,7 @@ import asyncio
|
|
23 |
from functools import lru_cache
|
24 |
import hashlib
|
25 |
from concurrent.futures import ThreadPoolExecutor
|
|
|
26 |
|
27 |
# ========== CONFIGURATION ==========
|
28 |
PROFILES_DIR = "student_profiles"
|
@@ -244,6 +245,33 @@ def remove_sensitive_info(text: str) -> str:
|
|
244 |
return text
|
245 |
|
246 |
# ========== TRANSCRIPT PARSING ==========
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
class TranscriptParser:
|
248 |
def __init__(self):
|
249 |
self.student_data = {}
|
@@ -253,27 +281,98 @@ class TranscriptParser:
|
|
253 |
self.graduation_status = {}
|
254 |
|
255 |
def parse_transcript(self, text: str) -> Dict:
|
256 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
257 |
try:
|
258 |
parsed_data = {
|
259 |
'student_info': {},
|
260 |
-
'
|
|
|
|
|
261 |
}
|
262 |
|
263 |
-
# Extract student
|
264 |
-
|
265 |
-
if
|
266 |
-
parsed_data['student_info']['
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
267 |
|
268 |
-
|
269 |
-
if
|
270 |
-
parsed_data['student_info']['
|
271 |
|
272 |
-
|
273 |
-
if
|
274 |
-
parsed_data['student_info']['
|
275 |
|
276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
|
278 |
courses = re.findall(course_pattern, text)
|
279 |
for course in courses:
|
@@ -287,8 +386,41 @@ class TranscriptParser:
|
|
287 |
return parsed_data
|
288 |
|
289 |
except Exception as e:
|
290 |
-
logging.
|
291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
292 |
|
293 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
294 |
"""Process transcript file and return simple confirmation"""
|
|
|
23 |
from functools import lru_cache
|
24 |
import hashlib
|
25 |
from concurrent.futures import ThreadPoolExecutor
|
26 |
+
from pydantic import BaseModel
|
27 |
|
28 |
# ========== CONFIGURATION ==========
|
29 |
PROFILES_DIR = "student_profiles"
|
|
|
245 |
return text
|
246 |
|
247 |
# ========== TRANSCRIPT PARSING ==========
|
248 |
+
class Course(BaseModel):
    """Schema for a single course record.

    All ten fields are required and declared as ``str``; none has a default.

    NOTE(review): nothing in the visible part of this file instantiates this
    model -- confirm the intended producer before relying on field meanings.
    """

    requirement: str
    school_year: str
    grade_level: str
    course_code: str
    description: str
    term: str
    district_number: str
    fg: str  # presumably the final grade column -- TODO confirm
    included: str
    credits: str  # kept as text here, not converted to a number
|
259 |
+
|
260 |
+
class GraduationProgress(BaseModel):
    """Schema for a student's overall graduation-progress summary.

    Groups identity fields, GPA figures, community-service data, the credit
    total, per-requirement numbers, the list of ``Course`` records and a
    name-to-status mapping of assessments. Every field is required.

    NOTE(review): ``requirements`` only admits float values per entry, while
    ``TranscriptParser._parse_detailed_transcript`` stores a string
    ``"description"`` alongside the numbers -- confirm whether this model is
    meant to validate that parser's output.
    """

    # Identity
    student_name: str
    student_id: str
    current_grade: str
    year_of_graduation: str

    # Grade point averages
    unweighted_gpa: float
    weighted_gpa: float

    # Community service
    community_service_hours: int
    community_service_date: str

    total_credits_earned: float
    virtual_grade: str

    # requirement code -> {metric name -> value}
    requirements: Dict[str, Dict[str, float]]
    courses: List[Course]
    # assessment name -> status
    assessments: Dict[str, str]
|
274 |
+
|
275 |
class TranscriptParser:
|
276 |
def __init__(self):
|
277 |
self.student_data = {}
|
|
|
281 |
self.graduation_status = {}
|
282 |
|
283 |
def parse_transcript(self, text: str) -> Dict:
    """Parse transcript text and return structured data.

    The detailed parser is tried first; if it returns a falsy result
    (it returns ``None`` when it hits an internal error), the simplified
    parser is used instead.

    Args:
        text: Raw transcript text.

    Returns:
        The dict produced by whichever parser succeeded.

    Raises:
        ValueError: If parsing fails. The original exception is attached
            as ``__cause__`` so the root error stays visible in tracebacks.
    """
    try:
        detailed_result = self._parse_detailed_transcript(text)
        if detailed_result:
            return detailed_result

        # Detailed parsing produced nothing usable: fall back.
        return self._parse_simplified_transcript(text)

    except Exception as e:
        logging.error(f"Error parsing transcript: {str(e)}")
        # Chain explicitly so callers can inspect the underlying failure.
        raise ValueError(f"Couldn't parse transcript: {str(e)}") from e
|
297 |
+
|
298 |
+
def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
|
299 |
+
"""Parse detailed transcript format"""
|
300 |
try:
|
301 |
parsed_data = {
|
302 |
'student_info': {},
|
303 |
+
'requirements': {},
|
304 |
+
'course_history': [],
|
305 |
+
'assessments': {}
|
306 |
}
|
307 |
|
308 |
+
# Extract student info
|
309 |
+
student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
|
310 |
+
if student_info_match:
|
311 |
+
parsed_data['student_info']['id'] = student_info_match.group(1)
|
312 |
+
parsed_data['student_info']['name'] = student_info_match.group(2).strip()
|
313 |
+
|
314 |
+
current_grade_match = re.search(r"Current Grade: (\d+)", text)
|
315 |
+
if current_grade_match:
|
316 |
+
parsed_data['student_info']['grade'] = current_grade_match.group(1)
|
317 |
+
|
318 |
+
yog_match = re.search(r"YOG (\d{4})", text)
|
319 |
+
if yog_match:
|
320 |
+
parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
|
321 |
+
|
322 |
+
unweighted_gpa_match = re.search(r"Un-weighted GPA (\d+\.\d+)", text)
|
323 |
+
if unweighted_gpa_match:
|
324 |
+
parsed_data['student_info']['unweighted_gpa'] = float(unweighted_gpa_match.group(1))
|
325 |
+
|
326 |
+
weighted_gpa_match = re.search(r"Weighted GPA (\d+\.\d+)", text)
|
327 |
+
if weighted_gpa_match:
|
328 |
+
parsed_data['student_info']['weighted_gpa'] = float(weighted_gpa_match.group(1))
|
329 |
+
|
330 |
+
service_hours_match = re.search(r"Comm Serv Hours (\d+)", text)
|
331 |
+
if service_hours_match:
|
332 |
+
parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
|
333 |
|
334 |
+
service_date_match = re.search(r"Comm Serv Date (\d{2}/\d{2}/\d{4})", text)
|
335 |
+
if service_date_match:
|
336 |
+
parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
|
337 |
|
338 |
+
credits_match = re.search(r"Total Credits Earned (\d+\.\d+)", text)
|
339 |
+
if credits_match:
|
340 |
+
parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
|
341 |
|
342 |
+
virtual_grade_match = re.search(r"Virtual Grade (\w+)", text)
|
343 |
+
if virtual_grade_match:
|
344 |
+
parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
|
345 |
+
|
346 |
+
# Extract requirements
|
347 |
+
req_pattern = re.compile(r"([A-Z]-.*?)\s*\|\s*(.*?)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+) %")
|
348 |
+
for match in req_pattern.finditer(text):
|
349 |
+
code = match.group(1).strip()
|
350 |
+
desc = match.group(2).strip()
|
351 |
+
required = float(match.group(3))
|
352 |
+
waived = float(match.group(4))
|
353 |
+
completed = float(match.group(5))
|
354 |
+
percent = float(match.group(6))
|
355 |
+
parsed_data['requirements'][code] = {
|
356 |
+
"description": desc,
|
357 |
+
"required": required,
|
358 |
+
"waived": waived,
|
359 |
+
"completed": completed,
|
360 |
+
"percent_complete": percent
|
361 |
+
}
|
362 |
+
|
363 |
+
# Extract assessments
|
364 |
+
assess_pattern = re.compile(r"Z-Assessment: (.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %")
|
365 |
+
for match in assess_pattern.finditer(text):
|
366 |
+
name = f"Assessment: {match.group(1)}"
|
367 |
+
status = match.group(3)
|
368 |
+
parsed_data['assessments'][name] = status
|
369 |
+
|
370 |
+
for z_item in ["Community Service Hours", "GPA"]:
|
371 |
+
if re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text):
|
372 |
+
status = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text).group(2)
|
373 |
+
parsed_data['assessments'][z_item] = status
|
374 |
+
|
375 |
+
# Extract courses (simplified for now - can be enhanced)
|
376 |
course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
|
377 |
courses = re.findall(course_pattern, text)
|
378 |
for course in courses:
|
|
|
386 |
return parsed_data
|
387 |
|
388 |
except Exception as e:
|
389 |
+
logging.warning(f"Detailed transcript parsing failed, falling back to simple parser: {str(e)}")
|
390 |
+
return None
|
391 |
+
|
392 |
+
def _parse_simplified_transcript(self, text: str) -> Dict:
|
393 |
+
"""Fallback simplified transcript parser that extracts key information"""
|
394 |
+
parsed_data = {
|
395 |
+
'student_info': {},
|
396 |
+
'course_history': []
|
397 |
+
}
|
398 |
+
|
399 |
+
# Extract student information
|
400 |
+
name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
|
401 |
+
if name_match:
|
402 |
+
parsed_data['student_info']['name'] = name_match.group(1).strip()
|
403 |
+
|
404 |
+
id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
|
405 |
+
if id_match:
|
406 |
+
parsed_data['student_info']['id'] = id_match.group(1).strip()
|
407 |
+
|
408 |
+
gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
|
409 |
+
if gpa_match:
|
410 |
+
parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
|
411 |
+
|
412 |
+
# Extract courses (simplified pattern)
|
413 |
+
course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
|
414 |
+
courses = re.findall(course_pattern, text)
|
415 |
+
for course in courses:
|
416 |
+
parsed_data['course_history'].append({
|
417 |
+
'course_code': course[0],
|
418 |
+
'description': course[1],
|
419 |
+
'grade': course[2],
|
420 |
+
'credits': float(course[3])
|
421 |
+
})
|
422 |
+
|
423 |
+
return parsed_data
|
424 |
|
425 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
426 |
"""Process transcript file and return simple confirmation"""
|