Dannyar608 committed
Commit ed548e3 · verified · 1 Parent(s): 0d7fd90

Update app.py

Files changed (1)
  1. app.py +104 -49
app.py CHANGED
@@ -182,7 +182,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
             for page in doc:
                 text += page.get_text("text") + '\n'
             if not text.strip():
-                raise ValueError("PyMuPDF returned empty text")
+                raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
         except Exception as e:
             logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
             text = extract_text_from_pdf_with_ocr(file_path)
@@ -194,13 +194,13 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
         text = clean_extracted_text(text)
 
         if not text.strip():
-            raise ValueError("No text could be extracted from the file")
+            raise ValueError("No text could be extracted from the file. Please ensure the file is clear and readable.")
 
         return text
 
     except Exception as e:
         logging.error(f"Text extraction error: {str(e)}")
-        raise gr.Error(f"Text extraction error: {str(e)}\nTips: Use high-quality images/PDFs with clear text.")
+        raise gr.Error(f"Failed to extract text: {str(e)}\n\nTIPS:\n1. For PDFs, try saving as a different PDF format\n2. For images, ensure they are high-quality and well-lit\n3. Try cropping to just the transcript area")
 
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
     """Fallback PDF text extraction using OCR."""
@@ -215,7 +215,7 @@ def extract_text_from_pdf_with_ocr(file_path: str) -> str:
             img = img.point(lambda x: 0 if x < 128 else 255) # Binarize
             text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
     except Exception as e:
-        raise ValueError(f"PDF OCR failed: {str(e)}")
+        raise ValueError(f"PDF OCR failed: {str(e)}. The PDF may be password protected or corrupted.")
     return text
 
 def extract_text_with_ocr(file_path: str) -> str:
@@ -232,7 +232,7 @@ def extract_text_with_ocr(file_path: str) -> str:
         text = pytesseract.image_to_string(image, config=custom_config)
         return text
     except Exception as e:
-        raise ValueError(f"OCR processing failed: {str(e)}")
+        raise ValueError(f"OCR processing failed: {str(e)}. Please ensure the image is clear and not blurry.")
 
 def clean_extracted_text(text: str) -> str:
     """Clean and normalize the extracted text."""
@@ -305,8 +305,8 @@ class TranscriptParser:
 
         except Exception as e:
             logging.error(f"Error parsing transcript: {str(e)}")
-            raise gr.Error(f"Error parsing transcript: {str(e)}")
-
+            raise gr.Error(f"Error parsing transcript: {str(e)}\n\nThis may be due to an unsupported transcript format. Please ensure you're uploading an official Miami-Dade transcript or contact support.")
+
     def _extract_student_info(self, text: str):
         """Enhanced student info extraction for Miami-Dade format"""
         # Extract basic student info
@@ -320,6 +320,17 @@ class TranscriptParser:
                 "current_grade": student_match.group(3),
                 "graduation_year": student_match.group(4)
             }
+        else:
+            # Fallback pattern for alternative formats
+            fallback_pattern = r"Student:\s*([^\n]+)\s*ID:\s*(\d+)\s*Grade:\s*(\d+)"
+            fallback_match = re.search(fallback_pattern, text, re.IGNORECASE)
+            if fallback_match:
+                self.student_data = {
+                    "name": fallback_match.group(1).strip(),
+                    "id": fallback_match.group(2),
+                    "current_grade": fallback_match.group(3),
+                    "graduation_year": "Unknown"
+                }
 
         # Extract GPA info
         gpa_pattern = r"Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
@@ -330,6 +341,15 @@ class TranscriptParser:
                 "unweighted_gpa": float(gpa_match.group(1)),
                 "weighted_gpa": float(gpa_match.group(2))
             })
+        else:
+            # Try alternative GPA patterns
+            alt_gpa_pattern = r"GPA\s*([\d.]+)\s*/\s*([\d.]+)"
+            alt_match = re.search(alt_gpa_pattern, text)
+            if alt_match:
+                self.student_data.update({
+                    "unweighted_gpa": float(alt_match.group(1)),
+                    "weighted_gpa": float(alt_match.group(2))
+                })
 
         # Extract credits and service hours
         credits_pattern = r"Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)"
@@ -346,7 +366,10 @@ class TranscriptParser:
         # Find the requirements table
         req_table_start = re.search(r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status", text)
         if not req_table_start:
-            raise ValueError("Could not find requirements table header")
+            # Try alternative table headers
+            req_table_start = re.search(r"Requirement\s*Req\s*Comp\s*Status", text)
+            if not req_table_start:
+                raise ValueError("Could not find requirements table header")
 
         req_text = text[req_table_start.start():]
 
@@ -389,7 +412,10 @@ class TranscriptParser:
         # Find the course history table
         course_header = re.search(r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description\s*Term\s*DstNumber\s*FG\s*Incl\s*Credits", text)
         if not course_header:
-            raise ValueError("Could not find course history table header")
+            # Try alternative course history headers
+            course_header = re.search(r"Course\s*Grade\s*Credit\s*Year", text)
+            if not course_header:
+                raise ValueError("Could not find course history table header")
 
         course_text = text[course_header.start():]
 
@@ -615,7 +641,7 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
         raise gr.Error("The model ran out of memory. Try with a smaller transcript.")
     except Exception as e:
         logging.error(f"AI parsing error: {str(e)}")
-        raise gr.Error(f"Error processing transcript: {str(e)}")
+        raise gr.Error(f"Error processing transcript: {str(e)}\n\nPlease try again or contact support with this error message.")
 
 def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
     """Main function to parse transcript files with better error handling"""
@@ -633,7 +659,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
         text = extract_text_from_file(file_obj.name, file_ext)
 
         if not text.strip():
-            raise ValueError("No text could be extracted from the file")
+            raise ValueError("No text could be extracted from the file. The file may be corrupted or in an unsupported format.")
 
         # Use AI for parsing with progress updates
         if progress:
@@ -650,15 +676,13 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
         return output_text, parsed_data
 
     except Exception as e:
-        error_msg = f"Error processing transcript: {str(e)}"
-        logging.error(error_msg)
-
-        # Provide helpful tips based on error type
-        if "No text could be extracted" in str(e):
-            error_msg += "\n\nTips: Please ensure your file is clear and readable. Try scanning at a higher resolution if it's an image."
-        elif "requirements table header" in str(e):
-            error_msg += "\n\nTips: This appears to be an unsupported transcript format. Please contact support."
+        error_msg = f"Error processing transcript: {str(e)}"
+        if "PDF" in str(e):
+            error_msg += "\n\nTIPS FOR PDF FILES:\n1. Try opening and re-saving the PDF in a different format\n2. Ensure the PDF isn't password protected\n3. Try taking a screenshot of the transcript and uploading as an image"
+        elif "image" in str(e).lower():
+            error_msg += "\n\nTIPS FOR IMAGE FILES:\n1. Ensure the image is clear and well-lit\n2. Try cropping to just the transcript area\n3. Avoid blurry or low-resolution images"
 
+        logging.error(error_msg)
         return error_msg, None
 
 # ========== LEARNING STYLE QUIZ ==========
@@ -773,7 +797,7 @@ class LearningStyleQuiz:
         """Evaluate quiz answers and generate enhanced results."""
         answers = list(answers) # Convert tuple to list
         if len(answers) != len(self.questions):
-            raise gr.Error("Not all questions were answered")
+            raise gr.Error("Please answer all questions before submitting")
 
         scores = {style: 0 for style in self.learning_styles}
 
@@ -1309,6 +1333,8 @@ def create_interface():
     .file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; }
    .progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
    .quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
+    .quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
+    .error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
     """
 
     # Header
@@ -1347,6 +1373,7 @@ def create_interface():
                         type="filepath"
                     )
                     upload_btn = gr.Button("Analyze Transcript", variant="primary")
+                    file_error = gr.HTML(visible=False)
 
                 with gr.Column(scale=2):
                     transcript_output = gr.Textbox(
@@ -1358,6 +1385,9 @@ def create_interface():
 
            def process_transcript(file_obj, current_tab_status):
                try:
+                    if not file_obj:
+                        raise ValueError("Please upload a file first")
+
                    output_text, data = parse_transcript(file_obj)
                    if "Error" not in output_text:
                        new_status = current_tab_status.copy()
@@ -1368,49 +1398,66 @@ def create_interface():
                            new_status,
                            gr.update(elem_classes="completed-tab"),
                            gr.update(interactive=True),
+                            gr.update(visible=False),
+                            gr.update(visible=False)
+                        )
+                    else:
+                        return (
+                            output_text,
+                            None,
+                            current_tab_status,
+                            gr.update(),
+                            gr.update(),
+                            gr.update(visible=True, value=f"<div class='error-message'>{output_text}</div>"),
                            gr.update(visible=False)
                        )
                except Exception as e:
+                    error_msg = f"❌ Error: {str(e)}"
+                    if "PDF" in str(e):
+                        error_msg += "\n\nTIPS FOR PDF FILES:\n1. Try opening and re-saving the PDF\n2. Ensure it's not password protected\n3. Try converting to an image"
                    return (
-                        f"Error: {str(e)}",
-                        None,
+                        error_msg,
+                        None,
                        current_tab_status,
                        gr.update(),
                        gr.update(),
-                        gr.update(visible=True, value=f"<div class='nav-message'>Error: {str(e)}</div>")
+                        gr.update(visible=True, value=f"<div class='error-message'>{error_msg}</div>"),
+                        gr.update(visible=False)
                    )
 
            upload_btn.click(
                process_transcript,
                inputs=[file_input, tab_completed],
-                outputs=[transcript_output, transcript_data, tab_completed, step1, step2, nav_message]
+                outputs=[transcript_output, transcript_data, tab_completed, step1, step2, file_error, nav_message]
            )
 
        # ===== TAB 2: LEARNING STYLE QUIZ =====
        with gr.Tab("Learning Style Quiz", id=1):
-            with gr.Row():
-                with gr.Column(scale=1):
-                    gr.Markdown("### Step 2: Discover Your Learning Style")
-                    progress = gr.HTML("<div class='progress-bar' style='width: 0%'></div>")
+            with gr.Column():
+                gr.Markdown("### Step 2: Discover Your Learning Style")
+                progress = gr.HTML("<div class='progress-bar' style='width: 0%'></div>")
+
+                quiz_components = []
+                with gr.Accordion("Quiz Questions", open=True):
+                    for i, (question, options) in enumerate(zip(learning_style_quiz.questions, learning_style_quiz.options)):
+                        with gr.Group(elem_classes="quiz-question"):
+                            q = gr.Radio(
+                                options,
+                                label=f"{i+1}. {question}",
+                                show_label=True
+                            )
+                            quiz_components.append(q)
+
+                with gr.Row():
                    quiz_submit = gr.Button("Submit Quiz", variant="primary")
-                    quiz_alert = gr.HTML(visible=False)
+                    quiz_clear = gr.Button("Clear Answers")
 
-                with gr.Column(scale=2):
-                    quiz_components = []
-                    with gr.Accordion("Quiz Questions", open=True):
-                        for i, (question, options) in enumerate(zip(learning_style_quiz.questions, learning_style_quiz.options)):
-                            with gr.Group(elem_classes="quiz-question"):
-                                q = gr.Radio(
-                                    options,
-                                    label=f"{i+1}. {question}",
-                                    show_label=True
-                                )
-                                quiz_components.append(q)
-
-                    learning_output = gr.Markdown(
-                        label="Your Learning Style Results",
-                        visible=False
-                    )
+                quiz_alert = gr.HTML(visible=False)
+                learning_output = gr.Markdown(
+                    label="Your Learning Style Results",
+                    visible=False,
+                    elem_classes="quiz-results"
+                )
 
            # Update progress bar as questions are answered
            for component in quiz_components:
@@ -1448,7 +1495,7 @@ def create_interface():
                        current_tab_status,
                        gr.update(),
                        gr.update(),
-                        gr.update(value=f"<div class='nav-message'>Error: {str(e)}</div>", visible=True),
+                        gr.update(value=f"<div class='error-message'>Error: {str(e)}</div>", visible=True),
                        gr.update(visible=False)
                    )
 
@@ -1457,6 +1504,14 @@ def create_interface():
                inputs=[tab_completed] + quiz_components,
                outputs=[learning_output, learning_output, tab_completed, step2, step3, quiz_alert, nav_message]
            )
+
+            quiz_clear.click(
+                fn=lambda: [None] * len(quiz_components),
+                outputs=quiz_components
+            ).then(
+                fn=lambda: gr.HTML("<div class='progress-bar' style='width: 0%'></div>"),
+                outputs=progress
+            )
 
        # ===== TAB 3: PERSONAL QUESTIONS =====
        with gr.Tab("Personal Profile", id=2):
@@ -1507,7 +1562,7 @@ def create_interface():
                        gr.update(),
                        gr.update(),
                        gr.update(visible=False),
-                        gr.update(visible=True, value=f"<div class='nav-message'>Error: {str(e)}</div>")
+                        gr.update(visible=True, value=f"<div class='error-message'>Error: {str(e)}</div>")
                    )
 
            save_personal_btn.click(
@@ -1566,7 +1621,7 @@ def create_interface():
                        current_tab_status,
                        gr.update(),
                        gr.update(),
-                        gr.update(visible=True, value=f"<div class='nav-message'>Error: {str(e)}</div>")
+                        gr.update(visible=True, value=f"<div class='error-message'>Error: {str(e)}</div>")
                    )
 
            save_btn.click(
@@ -1654,7 +1709,7 @@ def create_interface():
        if not tab_completed_status.get(current_tab, False):
            return (
                gr.Tabs(selected=current_tab),
-                gr.update(value=f"⚠️ Complete Step {current_tab+1} first!", visible=True)
+                gr.update(value=f"<div class='error-message'>⚠️ Please complete Step {current_tab+1} first!</div>", visible=True)
            )
 
        return gr.Tabs(selected=tab_index), gr.update(visible=False)