Dannyar608 committed on
Commit
f8e1794
·
verified ·
1 Parent(s): a0e5ea9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -68
app.py CHANGED
@@ -188,7 +188,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
188
  for page in doc:
189
  text += page.get_text("text") + '\n'
190
  if not text.strip():
191
- raise ValueError("PyMuPDF returned empty text, trying OCR fallback...")
192
  text = extract_text_from_pdf_with_ocr(file_path)
193
 
194
  elif file_ext in ['.png', '.jpg', '.jpeg']:
@@ -203,30 +203,31 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
203
 
204
  except Exception as e:
205
  logging.error(f"Text extraction error: {str(e)}")
206
- raise gr.Error(f"Failed to extract text: {str(e)}")
207
 
208
  def extract_text_from_pdf_with_ocr(file_path: str) -> str:
209
- text = ""
210
  try:
211
- # Try pdf2image first if available
212
- try:
213
- from pdf2image import convert_from_path
214
- images = convert_from_path(file_path)
215
- for img in images:
216
- text += pytesseract.image_to_string(img) + '\n'
217
- return text
218
- except ImportError:
219
- # Fallback to PyMuPDF for image extraction
220
- doc = fitz.open(file_path)
221
- for page in doc:
222
- pix = page.get_pixmap()
223
- img = Image.open(io.BytesIO(pix.tobytes()))
224
- img = img.convert('L')
225
- img = img.point(lambda x: 0 if x < 128 else 255)
226
- text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
 
 
 
227
  except Exception as e:
228
- raise ValueError(f"PDF OCR failed: {str(e)}")
229
- return text
230
 
231
  def extract_text_with_ocr(file_path: str) -> str:
232
  try:
@@ -406,47 +407,56 @@ class TranscriptParser:
406
  return None
407
 
408
  def _parse_simplified_transcript(self, text: str) -> Dict:
409
- """Fallback simplified transcript parser that extracts key information"""
410
- parsed_data = {
411
- 'student_info': {},
412
- 'course_history': []
413
- }
414
-
415
- # Extract student information
416
- name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
417
- if name_match:
418
- parsed_data['student_info']['name'] = name_match.group(1).strip()
419
-
420
- id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
421
- if id_match:
422
- parsed_data['student_info']['id'] = id_match.group(1).strip()
423
-
424
- gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
425
- if gpa_match:
426
- parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
427
 
428
- # Extract courses (simplified pattern)
429
- course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
430
- courses = re.findall(course_pattern, text)
431
- for course in courses:
432
- parsed_data['course_history'].append({
433
- 'course_code': course[0],
434
- 'description': course[1],
435
- 'grade': course[2],
436
- 'credits': float(course[3])
437
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
438
 
439
- return parsed_data
440
 
441
  def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
442
  """Process transcript file and return simple confirmation"""
443
  try:
444
  if not file_obj:
445
- raise ValueError("Please upload a file first")
446
 
447
  validate_file(file_obj)
448
  file_ext = os.path.splitext(file_obj.name)[1].lower()
449
 
 
 
 
 
 
 
 
 
450
  if progress:
451
  progress(0.2, desc="Extracting text from file...")
452
 
@@ -476,8 +486,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
476
  except Exception as e:
477
  error_msg = f"Error processing transcript: {str(e)}"
478
  logging.error(error_msg)
479
- # Return more detailed error to user
480
- return error_msg, None
481
 
482
  # ========== LEARNING STYLE QUIZ ==========
483
  class LearningStyleQuiz:
@@ -866,11 +875,14 @@ def create_interface():
866
  .completed-tab { background: #4CAF50 !important; color: white !important; }
867
  .incomplete-tab { background: #E0E0E0 !important; }
868
  .nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
869
- .file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; }
 
870
  .progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
871
  .quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
872
  .quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
873
  .error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
 
 
874
 
875
  .dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
876
  .dark .quiz-question { background-color: #3d3d3d !important; }
@@ -927,10 +939,21 @@ def create_interface():
927
  transcript_output = gr.Textbox(
928
  label="Analysis Results",
929
  lines=5,
930
- interactive=False
 
931
  )
932
  transcript_data = gr.State()
933
 
 
 
 
 
 
 
 
 
 
 
934
  upload_btn.click(
935
  fn=parse_transcript,
936
  inputs=[file_input, tab_completed],
@@ -1143,20 +1166,22 @@ def create_interface():
1143
  if tab_index <= current_tab:
1144
  return gr.Tabs(selected=tab_index), gr.update(visible=False)
1145
 
1146
- if not tab_completed_status.get(current_tab, False):
1147
- messages = {
1148
- 0: "Please complete the transcript analysis first.",
1149
- 1: "Please complete the learning style quiz first.",
1150
- 2: "Please fill out your personal information first.",
1151
- 3: "Please save your profile first."
1152
- }
1153
- return (
1154
- gr.Tabs(selected=current_tab),
1155
- gr.update(
1156
- value=f"<div class='error-message'>⚠️ {messages.get(current_tab, 'Please complete this step first')}</div>",
1157
- visible=True
 
 
 
1158
  )
1159
- )
1160
 
1161
  return gr.Tabs(selected=tab_index), gr.update(visible=False)
1162
 
 
188
  for page in doc:
189
  text += page.get_text("text") + '\n'
190
  if not text.strip():
191
+ logging.warning("PyMuPDF returned empty text, trying OCR fallback...")
192
  text = extract_text_from_pdf_with_ocr(file_path)
193
 
194
  elif file_ext in ['.png', '.jpg', '.jpeg']:
 
203
 
204
  except Exception as e:
205
  logging.error(f"Text extraction error: {str(e)}")
206
+ raise gr.Error(f"Failed to extract text: {str(e)}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
207
 
208
  def extract_text_from_pdf_with_ocr(file_path: str) -> str:
 
209
  try:
210
+ import pdf2image
211
+ images = pdf2image.convert_from_path(file_path, dpi=300)
212
+ custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;()-/ '
213
+
214
+ text = ""
215
+ for i, img in enumerate(images):
216
+ # Pre-process image
217
+ img = img.convert('L') # Grayscale
218
+ img = img.point(lambda x: 0 if x < 140 else 255) # Increase contrast
219
+
220
+ # OCR with retry logic
221
+ try:
222
+ page_text = pytesseract.image_to_string(img, config=custom_config)
223
+ if len(page_text.strip()) > 20: # Minimum viable text
224
+ text += f"PAGE {i+1}:\n{page_text}\n\n"
225
+ except Exception as e:
226
+ logging.warning(f"OCR failed on page {i+1}: {str(e)}")
227
+
228
+ return text if text else "No readable text found"
229
  except Exception as e:
230
+ raise ValueError(f"OCR processing failed: {str(e)}")
 
231
 
232
  def extract_text_with_ocr(file_path: str) -> str:
233
  try:
 
407
  return None
408
 
409
  def _parse_simplified_transcript(self, text: str) -> Dict:
410
+ """Fallback simplified transcript parser with multiple pattern attempts"""
411
+ patterns = [
412
+ (r'(?:Course|Subject)\s*Code.*?Grade.*?Credits(.*?)(?:\n\s*\n|\Z)', 'table'),
413
+ (r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'line'),
414
+ (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal')
415
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
416
 
417
+ for pattern, pattern_type in patterns:
418
+ try:
419
+ if pattern_type == 'table':
420
+ # Parse tabular data
421
+ courses = re.findall(r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)',
422
+ re.search(pattern, text, re.DOTALL).group(1))
423
+ elif pattern_type == 'line':
424
+ courses = re.findall(pattern, text)
425
+ else:
426
+ courses = re.findall(pattern, text)
427
+
428
+ if courses:
429
+ parsed_data = {'course_history': []}
430
+ for course in courses:
431
+ parsed_data['course_history'].append({
432
+ 'course_code': course[0].strip(),
433
+ 'description': course[1].strip() if len(course) > 1 else '',
434
+ 'grade': course[2].strip() if len(course) > 2 else '',
435
+ 'credits': float(course[3]) if len(course) > 3 else 0.0
436
+ })
437
+ return parsed_data
438
+ except:
439
+ continue
440
 
441
+ raise ValueError("Could not identify course information in transcript")
442
 
443
  def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
444
  """Process transcript file and return simple confirmation"""
445
  try:
446
  if not file_obj:
447
+ raise gr.Error("Please upload a transcript file first (PDF or image)")
448
 
449
  validate_file(file_obj)
450
  file_ext = os.path.splitext(file_obj.name)[1].lower()
451
 
452
+ # Additional PDF validation
453
+ if file_ext == '.pdf':
454
+ try:
455
+ with open(file_obj.name, 'rb') as f:
456
+ PdfReader(f) # Test if PDF is readable
457
+ except Exception as e:
458
+ raise gr.Error(f"Invalid PDF file: {str(e)}. Please upload a non-corrupted PDF.")
459
+
460
  if progress:
461
  progress(0.2, desc="Extracting text from file...")
462
 
 
486
  except Exception as e:
487
  error_msg = f"Error processing transcript: {str(e)}"
488
  logging.error(error_msg)
489
+ raise gr.Error(f"{error_msg}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
 
490
 
491
  # ========== LEARNING STYLE QUIZ ==========
492
  class LearningStyleQuiz:
 
875
  .completed-tab { background: #4CAF50 !important; color: white !important; }
876
  .incomplete-tab { background: #E0E0E0 !important; }
877
  .nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
878
+ .file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; text-align: center; }
879
+ .file-upload:hover { background: #f5f5f5; }
880
  .progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
881
  .quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
882
  .quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
883
  .error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
884
+ .transcript-results { border-left: 4px solid #4CAF50 !important; padding: 15px !important; background: #f8f8f8 !important; }
885
+ .error-box { border: 1px solid #ff4444 !important; background: #fff8f8 !important; }
886
 
887
  .dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
888
  .dark .quiz-question { background-color: #3d3d3d !important; }
 
939
  transcript_output = gr.Textbox(
940
  label="Analysis Results",
941
  lines=5,
942
+ interactive=False,
943
+ elem_classes="transcript-results"
944
  )
945
  transcript_data = gr.State()
946
 
947
+ file_input.change(
948
+ fn=lambda f: (
949
+ gr.update(visible=False),
950
+ gr.update(value="File ready for analysis!", visible=True) if f
951
+ else gr.update(value="Please upload a file", visible=False)
952
+ ),
953
+ inputs=file_input,
954
+ outputs=[file_error, transcript_output]
955
+ )
956
+
957
  upload_btn.click(
958
  fn=parse_transcript,
959
  inputs=[file_input, tab_completed],
 
1166
  if tab_index <= current_tab:
1167
  return gr.Tabs(selected=tab_index), gr.update(visible=False)
1168
 
1169
+ # Check all previous tabs are completed
1170
+ for i in range(tab_index):
1171
+ if not tab_completed_status.get(i, False):
1172
+ messages = [
1173
+ "Please complete the transcript analysis first",
1174
+ "Please complete the learning style quiz first",
1175
+ "Please fill out your personal information first",
1176
+ "Please save your profile first"
1177
+ ]
1178
+ return (
1179
+ gr.Tabs(selected=i),
1180
+ gr.update(
1181
+ value=f"<div class='error-message'>⛔ {messages[i]}</div>",
1182
+ visible=True
1183
+ )
1184
  )
 
1185
 
1186
  return gr.Tabs(selected=tab_index), gr.update(visible=False)
1187