Dannyar608 committed on
Commit
5c437e2
·
verified ·
1 Parent(s): b02a8be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -15
app.py CHANGED
@@ -23,6 +23,7 @@ import asyncio
23
  from functools import lru_cache
24
  import hashlib
25
  from concurrent.futures import ThreadPoolExecutor
 
26
 
27
  # ========== CONFIGURATION ==========
28
  PROFILES_DIR = "student_profiles"
@@ -244,6 +245,33 @@ def remove_sensitive_info(text: str) -> str:
244
  return text
245
 
246
  # ========== TRANSCRIPT PARSING ==========
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  class TranscriptParser:
248
  def __init__(self):
249
  self.student_data = {}
@@ -253,27 +281,98 @@ class TranscriptParser:
253
  self.graduation_status = {}
254
 
255
  def parse_transcript(self, text: str) -> Dict:
256
- """Simplified transcript parser that extracts key information"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  try:
258
  parsed_data = {
259
  'student_info': {},
260
- 'course_history': []
 
 
261
  }
262
 
263
- # Extract student information
264
- name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
265
- if name_match:
266
- parsed_data['student_info']['name'] = name_match.group(1).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
 
268
- id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
269
- if id_match:
270
- parsed_data['student_info']['id'] = id_match.group(1).strip()
271
 
272
- gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
273
- if gpa_match:
274
- parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
275
 
276
- # Extract courses (simplified pattern)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
278
  courses = re.findall(course_pattern, text)
279
  for course in courses:
@@ -287,8 +386,41 @@ class TranscriptParser:
287
  return parsed_data
288
 
289
  except Exception as e:
290
- logging.error(f"Error parsing transcript: {str(e)}")
291
- raise ValueError(f"Couldn't parse transcript: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
  def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
294
  """Process transcript file and return simple confirmation"""
 
23
  from functools import lru_cache
24
  import hashlib
25
  from concurrent.futures import ThreadPoolExecutor
26
+ from pydantic import BaseModel
27
 
28
  # ========== CONFIGURATION ==========
29
  PROFILES_DIR = "student_profiles"
 
245
  return text
246
 
247
  # ========== TRANSCRIPT PARSING ==========
248
class Course(BaseModel):
    """Pydantic model describing a single course entry of a transcript.

    Every one of the ten fields is annotated as ``str``.
    """

    requirement: str
    school_year: str
    grade_level: str
    course_code: str
    description: str
    term: str
    district_number: str
    # NOTE(review): "fg" is not spelled out anywhere in view; presumably a
    # final-grade column. Confirm against the transcript layout.
    fg: str
    included: str
    # Annotated as text here, while the transcript parsers in this file store
    # a course's 'credits' value as a float.
    credits: str
259
+
260
class GraduationProgress(BaseModel):
    """Pydantic model bundling a student's graduation-progress record.

    It combines identifying fields, GPA and credit totals, per-requirement
    figures, the list of ``Course`` entries and assessment statuses.
    """

    # Identification, kept as text.
    student_name: str
    student_id: str
    current_grade: str
    year_of_graduation: str

    # Numeric summary values.
    unweighted_gpa: float
    weighted_gpa: float
    community_service_hours: int
    community_service_date: str
    total_credits_earned: float
    virtual_grade: str

    # Keyed by requirement code, then by figure name.
    # NOTE(review): the detailed parser in this file also stores a string
    # "description" in each requirement entry, which a float-only inner
    # mapping would not accept. Confirm before validating parser output
    # with this model.
    requirements: Dict[str, Dict[str, float]]
    courses: List[Course]
    # Assessment name -> status text.
    assessments: Dict[str, str]
274
+
275
  class TranscriptParser:
276
  def __init__(self):
277
  self.student_data = {}
 
281
  self.graduation_status = {}
282
 
283
  def parse_transcript(self, text: str) -> Dict:
284
+ """Parse transcript text and return structured data"""
285
+ try:
286
+ # First try the new detailed parser
287
+ parsed_data = self._parse_detailed_transcript(text)
288
+ if parsed_data:
289
+ return parsed_data
290
+
291
+ # Fall back to simplified parser if detailed parsing fails
292
+ return self._parse_simplified_transcript(text)
293
+
294
+ except Exception as e:
295
+ logging.error(f"Error parsing transcript: {str(e)}")
296
+ raise ValueError(f"Couldn't parse transcript: {str(e)}")
297
+
298
+ def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
299
+ """Parse detailed transcript format"""
300
  try:
301
  parsed_data = {
302
  'student_info': {},
303
+ 'requirements': {},
304
+ 'course_history': [],
305
+ 'assessments': {}
306
  }
307
 
308
+ # Extract student info
309
+ student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
310
+ if student_info_match:
311
+ parsed_data['student_info']['id'] = student_info_match.group(1)
312
+ parsed_data['student_info']['name'] = student_info_match.group(2).strip()
313
+
314
+ current_grade_match = re.search(r"Current Grade: (\d+)", text)
315
+ if current_grade_match:
316
+ parsed_data['student_info']['grade'] = current_grade_match.group(1)
317
+
318
+ yog_match = re.search(r"YOG (\d{4})", text)
319
+ if yog_match:
320
+ parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
321
+
322
+ unweighted_gpa_match = re.search(r"Un-weighted GPA (\d+\.\d+)", text)
323
+ if unweighted_gpa_match:
324
+ parsed_data['student_info']['unweighted_gpa'] = float(unweighted_gpa_match.group(1))
325
+
326
+ weighted_gpa_match = re.search(r"Weighted GPA (\d+\.\d+)", text)
327
+ if weighted_gpa_match:
328
+ parsed_data['student_info']['weighted_gpa'] = float(weighted_gpa_match.group(1))
329
+
330
+ service_hours_match = re.search(r"Comm Serv Hours (\d+)", text)
331
+ if service_hours_match:
332
+ parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
333
 
334
+ service_date_match = re.search(r"Comm Serv Date (\d{2}/\d{2}/\d{4})", text)
335
+ if service_date_match:
336
+ parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
337
 
338
+ credits_match = re.search(r"Total Credits Earned (\d+\.\d+)", text)
339
+ if credits_match:
340
+ parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
341
 
342
+ virtual_grade_match = re.search(r"Virtual Grade (\w+)", text)
343
+ if virtual_grade_match:
344
+ parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
345
+
346
+ # Extract requirements
347
+ req_pattern = re.compile(r"([A-Z]-.*?)\s*\|\s*(.*?)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+) %")
348
+ for match in req_pattern.finditer(text):
349
+ code = match.group(1).strip()
350
+ desc = match.group(2).strip()
351
+ required = float(match.group(3))
352
+ waived = float(match.group(4))
353
+ completed = float(match.group(5))
354
+ percent = float(match.group(6))
355
+ parsed_data['requirements'][code] = {
356
+ "description": desc,
357
+ "required": required,
358
+ "waived": waived,
359
+ "completed": completed,
360
+ "percent_complete": percent
361
+ }
362
+
363
+ # Extract assessments
364
+ assess_pattern = re.compile(r"Z-Assessment: (.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %")
365
+ for match in assess_pattern.finditer(text):
366
+ name = f"Assessment: {match.group(1)}"
367
+ status = match.group(3)
368
+ parsed_data['assessments'][name] = status
369
+
370
+ for z_item in ["Community Service Hours", "GPA"]:
371
+ if re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text):
372
+ status = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text).group(2)
373
+ parsed_data['assessments'][z_item] = status
374
+
375
+ # Extract courses (simplified for now - can be enhanced)
376
  course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
377
  courses = re.findall(course_pattern, text)
378
  for course in courses:
 
386
  return parsed_data
387
 
388
  except Exception as e:
389
+ logging.warning(f"Detailed transcript parsing failed, falling back to simple parser: {str(e)}")
390
+ return None
391
+
392
def _parse_simplified_transcript(self, text: str) -> Dict:
    """Fallback simplified transcript parser that extracts key information.

    Looks for labelled name, ID and GPA values and for course rows shaped
    like ``<CODE> <description> <letter grade> <credits>``.

    Args:
        text: Raw transcript text.

    Returns:
        A dict with two keys: ``'student_info'`` (any of ``'name'``,
        ``'id'``, ``'gpa'`` that were found) and ``'course_history'`` (a
        list of dicts with ``'course_code'``, ``'description'``, ``'grade'``
        and ``'credits'``).
    """
    student_info = {}

    # Name: accept "Name" / "Student Name" / "Student" as the label, but do
    # not treat "Student ID" as a name label. The captured value is limited
    # to one line so it cannot swallow the next label on the following line.
    name_match = re.search(
        r'\b(?:(?:Student\s+)?Name|Student(?!\s*(?:ID|Name)\b))[:\s]+'
        r'([A-Za-z,][A-Za-z, \t]*)',
        text,
        re.IGNORECASE,
    )
    if name_match:
        student_info['name'] = name_match.group(1).strip()

    # ID: word boundaries keep "id" inside ordinary words ("Sid", "valid")
    # from being mistaken for the label.
    id_match = re.search(
        r'\b(?:Student\s*)?ID\b[:\s]+([A-Za-z0-9-]+)',
        text,
        re.IGNORECASE,
    )
    if id_match:
        student_info['id'] = id_match.group(1).strip()

    # GPA: capture a well-formed number only, so trailing punctuation such
    # as "3.5." cannot make float() raise and abort the whole parse.
    gpa_match = re.search(
        r'(?:GPA|Grade\s*Point\s*Average)[:\s]+(\d*\.\d+|\d+)',
        text,
        re.IGNORECASE,
    )
    if gpa_match:
        student_info['gpa'] = float(gpa_match.group(1))

    # Courses: same row shape as before, with the credits column restricted
    # to a well-formed number for the same reason as the GPA.
    course_pattern = (
        r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+(\d*\.\d+|\d+)'
    )
    course_history = [
        {
            'course_code': code,
            'description': description,
            'grade': grade,
            'credits': float(credits),
        }
        for code, description, grade, credits in re.findall(course_pattern, text)
    ]

    return {
        'student_info': student_info,
        'course_history': course_history,
    }
424
 
425
  def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
426
  """Process transcript file and return simple confirmation"""