Dannyar608 committed on
Commit
e96bacb
·
verified ·
1 Parent(s): 8df5e79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -36
app.py CHANGED
@@ -180,7 +180,15 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
180
  import pdfplumber
181
  with pdfplumber.open(file_path) as pdf:
182
  for page in pdf.pages:
183
- text += page.extract_text() + '\n'
 
 
 
 
 
 
 
 
184
  if not text.strip():
185
  raise ValueError("PDFPlumber returned empty text")
186
  except Exception as e:
@@ -311,10 +319,10 @@ class TranscriptParser:
311
 
312
  except Exception as e:
313
  logging.error(f"Error parsing transcript: {str(e)}")
314
- raise ValueError(f"Couldn't parse transcript: {str(e)}")
315
 
316
  def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
317
- """Parse detailed transcript format"""
318
  try:
319
  parsed_data = {
320
  'student_info': {},
@@ -323,46 +331,48 @@ class TranscriptParser:
323
  'assessments': {}
324
  }
325
 
326
- # Extract student info
327
  student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
328
  if student_info_match:
329
  parsed_data['student_info']['id'] = student_info_match.group(1)
330
  parsed_data['student_info']['name'] = student_info_match.group(2).strip()
331
 
332
- current_grade_match = re.search(r"Current Grade: (\d+)", text)
 
333
  if current_grade_match:
334
  parsed_data['student_info']['grade'] = current_grade_match.group(1)
335
 
336
- yog_match = re.search(r"YOG (\d{4})", text)
337
  if yog_match:
338
  parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
339
 
340
- unweighted_gpa_match = re.search(r"Un-weighted GPA (\d+\.\d+)", text)
341
- if unweighted_gpa_match:
342
- parsed_data['student_info']['unweighted_gpa'] = float(unweighted_gpa_match.group(1))
343
-
344
- weighted_gpa_match = re.search(r"Weighted GPA (\d+\.\d+)", text)
345
- if weighted_gpa_match:
346
- parsed_data['student_info']['weighted_gpa'] = float(weighted_gpa_match.group(1))
347
 
348
- service_hours_match = re.search(r"Comm Serv Hours (\d+)", text)
 
349
  if service_hours_match:
350
  parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
351
 
352
- service_date_match = re.search(r"Comm Serv Date (\d{2}/\d{2}/\d{4})", text)
353
  if service_date_match:
354
  parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
355
 
356
- credits_match = re.search(r"Total Credits Earned (\d+\.\d+)", text)
 
357
  if credits_match:
358
  parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
359
 
360
- virtual_grade_match = re.search(r"Virtual Grade (\w+)", text)
 
361
  if virtual_grade_match:
362
  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
363
 
364
- # Extract requirements
365
- req_pattern = re.compile(r"([A-Z]-.*?)\s*\|\s*(.*?)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+\.\d+)\s*\|\s*(\d+) %")
366
  for match in req_pattern.finditer(text):
367
  code = match.group(1).strip()
368
  desc = match.group(2).strip()
@@ -378,33 +388,45 @@ class TranscriptParser:
378
  "percent_complete": percent
379
  }
380
 
381
- # Extract assessments
382
- assess_pattern = re.compile(r"Z-Assessment: (.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %")
383
  for match in assess_pattern.finditer(text):
384
- name = f"Assessment: {match.group(1)}"
385
- status = match.group(3)
386
  parsed_data['assessments'][name] = status
387
 
 
388
  for z_item in ["Community Service Hours", "GPA"]:
389
- if re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text):
390
- status = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+) %", text).group(2)
 
391
  parsed_data['assessments'][z_item] = status
392
 
393
- # Extract courses (simplified for now - can be enhanced)
394
- course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
395
- courses = re.findall(course_pattern, text)
396
- for course in courses:
397
- parsed_data['course_history'].append({
398
- 'course_code': course[0],
399
- 'description': course[1],
400
- 'grade': course[2],
401
- 'credits': float(course[3])
402
- })
 
 
 
 
 
 
 
 
 
 
403
 
404
  return parsed_data
405
 
406
  except Exception as e:
407
- logging.warning(f"Detailed transcript parsing failed, falling back to simple parser: {str(e)}")
408
  return None
409
 
410
  def _parse_simplified_transcript(self, text: str) -> Dict:
@@ -676,6 +698,10 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
676
  logging.error(error_msg)
677
  raise gr.Error(f"{error_msg}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
678
 
 
 
 
 
679
  # ========== LEARNING STYLE QUIZ ==========
680
  class LearningStyleQuiz:
681
  def __init__(self):
 
180
  import pdfplumber
181
  with pdfplumber.open(file_path) as pdf:
182
  for page in pdf.pages:
183
+ # Try tables first
184
+ tables = page.extract_tables()
185
+ if tables:
186
+ for table in tables:
187
+ text += "\n".join([" | ".join(str(cell) for cell in row if cell is not None]) + "\n"
188
+ # Fall back to text extraction
189
+ page_text = page.extract_text()
190
+ if page_text:
191
+ text += page_text + "\n"
192
  if not text.strip():
193
  raise ValueError("PDFPlumber returned empty text")
194
  except Exception as e:
 
319
 
320
  except Exception as e:
321
  logging.error(f"Error parsing transcript: {str(e)}")
322
+ raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
323
 
324
  def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
325
+ """Parse detailed transcript format with improved patterns"""
326
  try:
327
  parsed_data = {
328
  'student_info': {},
 
331
  'assessments': {}
332
  }
333
 
334
+ # Extract student info with more flexible patterns
335
  student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
336
  if student_info_match:
337
  parsed_data['student_info']['id'] = student_info_match.group(1)
338
  parsed_data['student_info']['name'] = student_info_match.group(2).strip()
339
 
340
+ # More flexible grade and year extraction
341
+ current_grade_match = re.search(r"Current Grade:\s*(\d+)", text, re.IGNORECASE)
342
  if current_grade_match:
343
  parsed_data['student_info']['grade'] = current_grade_match.group(1)
344
 
345
+ yog_match = re.search(r"YOG\s*(\d{4})", text, re.IGNORECASE)
346
  if yog_match:
347
  parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)
348
 
349
+ # Improved GPA extraction
350
+ gpa_matches = re.findall(r"(?:Un-weighted|Weighted)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
351
+ if len(gpa_matches) >= 2:
352
+ parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
353
+ parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])
 
 
354
 
355
+ # Community service info
356
+ service_hours_match = re.search(r"Comm\s*Serv\s*Hours\s*(\d+)", text, re.IGNORECASE)
357
  if service_hours_match:
358
  parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
359
 
360
+ service_date_match = re.search(r"Comm\s*Serv\s*Date\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
361
  if service_date_match:
362
  parsed_data['student_info']['community_service_date'] = service_date_match.group(1)
363
 
364
+ # Credits info
365
+ credits_match = re.search(r"Total\s*Credits\s*Earned\s*([\d.]+)", text, re.IGNORECASE)
366
  if credits_match:
367
  parsed_data['student_info']['total_credits'] = float(credits_match.group(1))
368
 
369
+ # Virtual grade
370
+ virtual_grade_match = re.search(r"Virtual\s*Grade\s*(\w+)", text, re.IGNORECASE)
371
  if virtual_grade_match:
372
  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)
373
 
374
+ # Extract requirements with improved pattern
375
+ req_pattern = re.compile(r"([A-Z]-[^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d]+)\s*%")
376
  for match in req_pattern.finditer(text):
377
  code = match.group(1).strip()
378
  desc = match.group(2).strip()
 
388
  "percent_complete": percent
389
  }
390
 
391
+ # Extract assessments with more flexible pattern
392
+ assess_pattern = re.compile(r"Z-Assessment:\s*(.*?)\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%")
393
  for match in assess_pattern.finditer(text):
394
+ name = f"Assessment: {match.group(1).strip()}"
395
+ status = match.group(3).strip()
396
  parsed_data['assessments'][name] = status
397
 
398
+ # Handle other Z items
399
  for z_item in ["Community Service Hours", "GPA"]:
400
+ z_match = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%", text, re.IGNORECASE)
401
+ if z_match:
402
+ status = z_match.group(2).strip()
403
  parsed_data['assessments'][z_item] = status
404
 
405
+ # Extract course history with more robust pattern
406
+ course_history_section = re.search(r"Requirement.*?School Year.*?GradeLv1.*?CrsNum.*?Description.*?Term.*?DstNumber.*?FG.*?Incl.*?Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
407
+ if course_history_section:
408
+ course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip()]
409
+ for line in course_lines:
410
+ parts = [part.strip() for part in line.split('|')]
411
+ if len(parts) >= 9:
412
+ course = {
413
+ 'requirement': parts[0],
414
+ 'school_year': parts[1],
415
+ 'grade_level': parts[2],
416
+ 'course_code': parts[3],
417
+ 'description': parts[4],
418
+ 'term': parts[5],
419
+ 'district_number': parts[6],
420
+ 'fg': parts[7],
421
+ 'included': parts[8],
422
+ 'credits': parts[9] if len(parts) > 9 else "0"
423
+ }
424
+ parsed_data['course_history'].append(course)
425
 
426
  return parsed_data
427
 
428
  except Exception as e:
429
+ logging.warning(f"Detailed transcript parsing failed: {str(e)}")
430
  return None
431
 
432
  def _parse_simplified_transcript(self, text: str) -> Dict:
 
698
  logging.error(error_msg)
699
  raise gr.Error(f"{error_msg}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
700
 
701
+ # [Rest of your code remains exactly the same...]
702
+ # Continue with LearningStyleQuiz, ProfileManager, TeachingAssistant classes
703
+ # and the create_interface() function exactly as you had them
704
+
705
  # ========== LEARNING STYLE QUIZ ==========
706
  class LearningStyleQuiz:
707
  def __init__(self):