Dannyar608 committed · verified
Commit df3101e · 1 Parent(s): e0ad8bb

Update app.py

Files changed (1): app.py (+122 -92)
app.py CHANGED
@@ -184,7 +184,6 @@ def validate_file(file_obj) -> None:
  def preprocess_text(text: str) -> str:
      """Normalize text for more reliable parsing"""
      text = re.sub(r'\s+', ' ', text) # Normalize whitespace
-     text = text.replace('|', ' ') # Handle common OCR errors
      text = text.upper() # Standardize case for certain fields
      return text

@@ -198,29 +197,32 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
                  import pdfplumber
                  with pdfplumber.open(file_path) as pdf:
                      for page in pdf.pages:
-                         # Try tables first
-                         tables = page.extract_tables()
                          if tables:
                              for table in tables:
-                                 text += "\n".join(
-                                     " | ".join(str(cell) for cell in row if cell is not None)
-                                     for row in table
-                                 ) + "\n"
-                         # Fall back to text extraction
                          page_text = page.extract_text()
                          if page_text:
                              text += page_text + "\n"
                      if not text.strip():
                          raise ValueError("PDFPlumber returned empty text")
              except Exception as e:
                  logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
                  doc = fitz.open(file_path)
                  for page in doc:
                      text += page.get_text("text") + '\n'
-                 if not text.strip():
-                     logging.warning("PyMuPDF returned empty text, trying OCR fallback...")
-                     text = extract_text_from_pdf_with_ocr(file_path)
-
          elif file_ext in ['.png', '.jpg', '.jpeg']:
              text = extract_text_with_ocr(file_path)

@@ -233,7 +235,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:

      except Exception as e:
          logging.error(f"Text extraction error: {str(e)}")
-         raise gr.Error(f"Failed to extract text: {str(e)}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")

  def extract_text_from_pdf_with_ocr(file_path: str) -> str:
      try:
@@ -271,18 +273,35 @@ def extract_text_with_ocr(file_path: str) -> str:
          raise ValueError(f"OCR processing failed: {str(e)}")

  def clean_extracted_text(text: str) -> str:
      text = re.sub(r'\s+', ' ', text).strip()
      replacements = {
-         '|': 'I',
-         '‘': "'",
-         '’': "'",
-         '“': '"',
-         '”': '"',
-         'ﬁ': 'fi',
-         'ﬂ': 'fl'
      }
-     for wrong, right in replacements.items():
-         text = text.replace(wrong, right)
      return text

  def remove_sensitive_info(text: str) -> str:
@@ -345,7 +364,7 @@ class TranscriptParser:
              raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")

      def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
-         """Parse detailed transcript format with improved patterns"""
          try:
              parsed_data = {
                  'student_info': {},
@@ -354,102 +373,113 @@ class TranscriptParser:
                  'assessments': {}
              }

-             # Extract student info with more flexible patterns
              student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
              if student_info_match:
                  parsed_data['student_info']['id'] = student_info_match.group(1)
                  parsed_data['student_info']['name'] = student_info_match.group(2).strip()

-             # More flexible grade and year extraction
-             current_grade_match = re.search(r"Current Grade:\s*(\d+)", text, re.IGNORECASE)
-             if current_grade_match:
-                 parsed_data['student_info']['grade'] = current_grade_match.group(1)
-
-             yog_match = re.search(r"YOG\s*(\d{4})", text, re.IGNORECASE)
              if yog_match:
                  parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)

-             # Improved GPA extraction with more flexible patterns
-             gpa_matches = re.findall(r"(?:UN.?WEIGHTED|WEIGHTED)\s*GPA\s*([\d.]+)", text, re.IGNORECASE)
              if len(gpa_matches) >= 1:
                  parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
              if len(gpa_matches) >= 2:
                  parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])

-             # Community service info
-             service_hours_match = re.search(r"COMM\s*SERV\s*HOURS\s*(\d+)", text, re.IGNORECASE)
              if service_hours_match:
                  parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
-
-             service_date_match = re.search(r"COMM\s*SERV\s*DATE\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
              if service_date_match:
                  parsed_data['student_info']['community_service_date'] = service_date_match.group(1)

-             # Credits info
-             credits_match = re.search(r"TOTAL\s*CREDITS\s*EARNED\s*([\d.]+)", text, re.IGNORECASE)
              if credits_match:
                  parsed_data['student_info']['total_credits'] = float(credits_match.group(1))

-             # Virtual grade
-             virtual_grade_match = re.search(r"VIRTUAL\s*GRADE\s*(\w+)", text, re.IGNORECASE)
              if virtual_grade_match:
                  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)

-             # Extract requirements with improved pattern
-             req_pattern = re.compile(r"([A-Z]-[^\|]+)\s*\|\s*([^\|]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d.]+)\s*\|\s*([\d]+)\s*%")
-             for match in req_pattern.finditer(text):
-                 code = match.group(1).strip()
-                 desc = match.group(2).strip()
-                 required = float(match.group(3)) if match.group(3) else 0.0
-                 waived = float(match.group(4)) if match.group(4) else 0.0
-                 completed = float(match.group(5)) if match.group(5) else 0.0
-                 percent = float(match.group(6)) if match.group(6) else 0.0
-                 parsed_data['requirements'][code] = {
-                     "description": desc,
-                     "required": required,
-                     "waived": waived,
-                     "completed": completed,
-                     "percent_complete": percent
-                 }
-
-             # Extract assessments with more flexible pattern
-             assess_pattern = re.compile(r"Z-([^\|]+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]*)\s*\|\s*([^\|]*)\s*%", re.IGNORECASE)
-             for match in assess_pattern.finditer(text):
-                 name = f"Assessment: {match.group(1).strip()}"
-                 status = match.group(3).strip() if match.group(3) else ""
-                 if status:
-                     parsed_data['assessments'][name] = status

-             # Handle other Z items
-             for z_item in ["Community Service Hours", "GPA"]:
-                 z_match = re.search(fr"Z-{z_item.replace(' ', '.*?')}\s*\|\s*(.*?)\s*\|\s*(\w+)\s*\|\s*(\d+)\s*%", text, re.IGNORECASE)
-                 if z_match:
-                     status = z_match.group(2).strip()
-                     parsed_data['assessments'][z_item] = status

-             # Extract course history with more robust pattern
-             course_history_section = re.search(r"Requirement.*?School Year.*?GradeLv1.*?CrsNum.*?Description.*?Term.*?DstNumber.*?FG.*?Incl.*?Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL | re.IGNORECASE)
-             if course_history_section:
-                 course_lines = [line.strip() for line in course_history_section.group(1).split('\n') if line.strip() and '|' in line]
                  for line in course_lines:
-                     parts = [part.strip() for part in line.split('|')]
-                     if len(parts) >= 9:
-                         course = {
-                             'requirement': parts[0] if len(parts) > 0 else "",
-                             'school_year': parts[1] if len(parts) > 1 else "",
-                             'grade_level': parts[2] if len(parts) > 2 else "",
-                             'course_code': parts[3] if len(parts) > 3 else "",
-                             'description': parts[4] if len(parts) > 4 else "",
-                             'term': parts[5] if len(parts) > 5 else "",
-                             'district_number': parts[6] if len(parts) > 6 else "",
-                             'fg': parts[7] if len(parts) > 7 else "",
-                             'included': parts[8] if len(parts) > 8 else "",
-                             'credits': parts[9] if len(parts) > 9 else "0"
-                         }
-                         # Handle "inProgress" credits
-                         if "inProgress" in course['credits'].lower():
-                             course['credits'] = "0"
-                         parsed_data['course_history'].append(course)

              return parsed_data

 
app.py (after change)

  def preprocess_text(text: str) -> str:
      """Normalize text for more reliable parsing"""
      text = re.sub(r'\s+', ' ', text) # Normalize whitespace
      text = text.upper() # Standardize case for certain fields
      return text
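In the updated preprocess_text only whitespace collapsing and upper-casing remain; the old '|' stripping was dropped, presumably because the pipe characters now serve as table delimiters for the parsers further down. A quick standalone check of the surviving behavior (the sample string is invented):

import re

def preprocess_text(text: str) -> str:
    # Same two steps as the committed function above
    text = re.sub(r'\s+', ' ', text)
    return text.upper()

sample = "Current   Grade: 11\n  Comm Serv Hours   40"  # invented OCR-style input
print(preprocess_text(sample))  # CURRENT GRADE: 11 COMM SERV HOURS 40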
 
 
                  import pdfplumber
                  with pdfplumber.open(file_path) as pdf:
                      for page in pdf.pages:
+                         # Try to extract tables first
+                         tables = page.extract_tables({
+                             "vertical_strategy": "text",
+                             "horizontal_strategy": "text",
+                             "intersection_y_tolerance": 10
+                         })
+
                          if tables:
                              for table in tables:
+                                 for row in table:
+                                     text += " | ".join(str(cell).strip() for cell in row if cell) + "\n"
+
+                         # Fall back to text extraction if tables are empty
                          page_text = page.extract_text()
                          if page_text:
                              text += page_text + "\n"
+
                      if not text.strip():
                          raise ValueError("PDFPlumber returned empty text")
+
              except Exception as e:
                  logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
                  doc = fitz.open(file_path)
                  for page in doc:
                      text += page.get_text("text") + '\n'
+
          elif file_ext in ['.png', '.jpg', '.jpeg']:
              text = extract_text_with_ocr(file_path)
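The new extract_tables call opts into pdfplumber's text-based detection ("text" strategies plus a looser intersection_y_tolerance), so transcripts without ruled cell borders still yield rows, and each row is re-joined with " | " so the downstream parsers can split on pipes. A minimal standalone sketch of the same settings, assuming pdfplumber is installed; "transcript.pdf" is a placeholder path:

import pdfplumber

TABLE_SETTINGS = {
    "vertical_strategy": "text",     # infer column edges from text, not drawn lines
    "horizontal_strategy": "text",   # infer row edges from text as well
    "intersection_y_tolerance": 10,  # tolerate slightly uneven baselines
}

with pdfplumber.open("transcript.pdf") as pdf:  # placeholder file name
    for page in pdf.pages:
        for table in page.extract_tables(TABLE_SETTINGS) or []:
            for row in table:
                # Same joining rule as the committed code: skip empty cells
                print(" | ".join(str(cell).strip() for cell in row if cell))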
 
 

      except Exception as e:
          logging.error(f"Text extraction error: {str(e)}")
+         raise ValueError(f"Failed to extract text: {str(e)}")

  def extract_text_from_pdf_with_ocr(file_path: str) -> str:
      try:
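Since extract_text_from_file now raises a plain ValueError instead of gr.Error, the user-facing message has to be produced by whatever calls it from the UI. One possible wiring, assuming the Gradio handler re-raises it (upload_handler is a hypothetical name, not taken from app.py, and extract_text_from_file is the function patched above):

import gradio as gr

def upload_handler(file_path: str, file_ext: str) -> str:
    try:
        return extract_text_from_file(file_path, file_ext)  # the function patched above
    except ValueError as e:
        # Surface the failure in the Gradio UI instead of a silent traceback
        raise gr.Error(f"Could not read the transcript: {e}")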
 
          raise ValueError(f"OCR processing failed: {str(e)}")

  def clean_extracted_text(text: str) -> str:
+     """Special cleaning for Miami-Dade transcripts"""
+     # Normalize whitespace
      text = re.sub(r'\s+', ' ', text).strip()
+
+     # Fix common OCR errors
      replacements = {
+         'GradeLv1': 'GradeLvl',
+         'CrsNu m': 'CrsNum',
+         'YOG': 'Year of Graduation',
+         'Comm Serv': 'Community Service',
+         r'\bA\s*-\s*': 'A-', # Fix requirement codes
+         r'\bB\s*-\s*': 'B-',
+         r'\bC\s*-\s*': 'C-',
+         r'\bD\s*-\s*': 'D-',
+         r'\bE\s*-\s*': 'E-',
+         r'\bF\s*-\s*': 'F-',
+         r'\bG\s*-\s*': 'G-',
+         r'\bZ\s*-\s*': 'Z-'
      }
+
+     for pattern, replacement in replacements.items():
+         text = re.sub(pattern, replacement, text)
+
+     # Fix course codes with spaces
+     text = re.sub(r'(\b[A-Z]{2,4})\s(\d{3}[A-Z]?\b)', r'\1\2', text)
+
+     # Fix common OCR errors in credits
+     text = re.sub(r'in\s*Progress', 'inProgress', text, flags=re.IGNORECASE)
+
      return text

  def remove_sensitive_info(text: str) -> str:
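Note that clean_extracted_text now runs every key of replacements through re.sub, so the plain-text keys and the r'\b...' patterns are all treated as regular expressions. A standalone check of a few of those fixes on a made-up OCR line:

import re

replacements = {
    'GradeLv1': 'GradeLvl',
    'CrsNu m': 'CrsNum',
    r'\bA\s*-\s*': 'A-',
}

line = "School Year GradeLv1 CrsNu m | A - ELECTIVES | ENG 101"  # invented OCR output
for pattern, replacement in replacements.items():
    line = re.sub(pattern, replacement, line)
line = re.sub(r'(\b[A-Z]{2,4})\s(\d{3}[A-Z]?\b)', r'\1\2', line)  # same course-code fix
print(line)  # School Year GradeLvl CrsNum | A-ELECTIVES | ENG101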
 
              raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")

      def _parse_detailed_transcript(self, text: str) -> Optional[Dict]:
+         """Parse detailed transcript format with improved patterns for Miami-Dade format"""
          try:
              parsed_data = {
                  'student_info': {},
 
                  'assessments': {}
              }

+             # Extract student info
              student_info_match = re.search(r"(\d{7}) - (.*?)\n", text)
              if student_info_match:
                  parsed_data['student_info']['id'] = student_info_match.group(1)
                  parsed_data['student_info']['name'] = student_info_match.group(2).strip()

+             # Extract grade and year info
+             grade_match = re.search(r"Current Grade:\s*(\d+)", text)
+             if grade_match:
+                 parsed_data['student_info']['grade'] = grade_match.group(1)
+
+             yog_match = re.search(r"YOG\s*(\d{4})", text)
              if yog_match:
                  parsed_data['student_info']['year_of_graduation'] = yog_match.group(1)

+             # Extract GPA information
+             gpa_matches = re.findall(r"(?:Un-weighted|Weighted)\s*GPA\s*([\d.]+)", text)
              if len(gpa_matches) >= 1:
                  parsed_data['student_info']['unweighted_gpa'] = float(gpa_matches[0])
              if len(gpa_matches) >= 2:
                  parsed_data['student_info']['weighted_gpa'] = float(gpa_matches[1])

+             # Extract community service info
+             service_hours_match = re.search(r"Comm\s*Serv\s*Hours\s*(\d+)", text, re.IGNORECASE)
              if service_hours_match:
                  parsed_data['student_info']['community_service_hours'] = int(service_hours_match.group(1))
+
+             service_date_match = re.search(r"Comm\s*Serv\s*Date\s*(\d{2}/\d{2}/\d{4})", text, re.IGNORECASE)
              if service_date_match:
                  parsed_data['student_info']['community_service_date'] = service_date_match.group(1)

+             # Extract credits info
+             credits_match = re.search(r"Total\s*Credits\s*Earned\s*([\d.]+)", text, re.IGNORECASE)
              if credits_match:
                  parsed_data['student_info']['total_credits'] = float(credits_match.group(1))

+             # Extract virtual grade
+             virtual_grade_match = re.search(r"Virtual\s*Grade\s*([A-Z])", text, re.IGNORECASE)
              if virtual_grade_match:
                  parsed_data['student_info']['virtual_grade'] = virtual_grade_match.group(1)

+             # Extract requirements - specific to this format
+             req_section = re.search(r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
+             if req_section:
+                 req_lines = [line.strip() for line in req_section.group(1).split('\n') if line.strip()]
+                 for line in req_lines:
+                     if '|' in line: # Table format
+                         parts = [part.strip() for part in line.split('|')]
+                         if len(parts) >= 6:
+                             code = parts[0]
+                             description = parts[1]
+                             required = float(parts[2]) if parts[2] and parts[2].replace('.','').isdigit() else 0.0
+                             waived = float(parts[3]) if parts[3] and parts[3].replace('.','').isdigit() else 0.0
+                             completed = float(parts[4]) if parts[4] and parts[4].replace('.','').isdigit() else 0.0
+                             status = parts[5]
+
+                             # Extract percentage if available
+                             percent = 0.0
+                             percent_match = re.search(r"(\d+)%", status)
+                             if percent_match:
+                                 percent = float(percent_match.group(1))
+
+                             parsed_data['requirements'][code] = {
+                                 "description": description,
+                                 "required": required,
+                                 "waived": waived,
+                                 "completed": completed,
+                                 "percent_complete": percent,
+                                 "status": status
+                             }

+             # Extract assessments
+             assess_section = re.search(r"Z-Assessment.*?\n(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
+             if assess_section:
+                 assess_lines = [line.strip() for line in assess_section.group(1).split('\n') if line.strip()]
+                 for line in assess_lines:
+                     if '|' in line:
+                         parts = [part.strip() for part in line.split('|')]
+                         if len(parts) >= 5 and parts[0].startswith('Z-'):
+                             name = parts[0].replace('Z-', '').strip()
+                             status = parts[4]
+                             parsed_data['assessments'][name] = status

+             # Extract course history - specific to this format
+             course_section = re.search(r"Requirement\s*School Year\s*GradeLv1\s*CrsNum\s*Description\s*Term\s*DstNumber\s*FG\s*Incl\s*Credits(.*?)(?:\n\s*\n|$)", text, re.DOTALL)
+             if course_section:
+                 course_lines = [line.strip() for line in course_section.group(1).split('\n') if line.strip()]
                  for line in course_lines:
+                     if '|' in line:
+                         parts = [part.strip() for part in line.split('|')]
+                         if len(parts) >= 9:
+                             course = {
+                                 'requirement': parts[0],
+                                 'school_year': parts[1],
+                                 'grade_level': parts[2],
+                                 'course_code': parts[3],
+                                 'description': parts[4],
+                                 'term': parts[5],
+                                 'district_number': parts[6],
+                                 'fg': parts[7],
+                                 'included': parts[8],
+                                 'credits': parts[9] if len(parts) > 9 else "0"
+                             }
+                             # Handle inProgress credits
+                             if "inProgress" in course['credits'].lower():
+                                 course['credits'] = "0"
+                             parsed_data['course_history'].append(course)

              return parsed_data
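For reference, the course-history loop above expects each course as one pipe-delimited row in the order Requirement | School Year | GradeLvl | CrsNum | Description | Term | DstNumber | FG | Incl | Credits. A standalone illustration with an invented row:

sample_row = "A-ELECTIVES | 2023-2024 | 11 | ENG101 | ENGLISH 3 | S1 | 1234567 | A | Y | 0.50"

parts = [part.strip() for part in sample_row.split('|')]
course = {
    'requirement': parts[0],
    'school_year': parts[1],
    'grade_level': parts[2],
    'course_code': parts[3],
    'description': parts[4],
    'term': parts[5],
    'district_number': parts[6],
    'fg': parts[7],
    'included': parts[8],
    'credits': parts[9] if len(parts) > 9 else "0",
}
if "inProgress" in course['credits'].lower():  # mirrors the committed in-progress check
    course['credits'] = "0"
print(course['course_code'], course['credits'])  # ENG101 0.50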