Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -188,7 +188,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
188 |
for page in doc:
|
189 |
text += page.get_text("text") + '\n'
|
190 |
if not text.strip():
|
191 |
-
|
192 |
text = extract_text_from_pdf_with_ocr(file_path)
|
193 |
|
194 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
@@ -203,30 +203,31 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
|
|
203 |
|
204 |
except Exception as e:
|
205 |
logging.error(f"Text extraction error: {str(e)}")
|
206 |
-
raise gr.Error(f"Failed to extract text: {str(e)}")
|
207 |
|
208 |
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
|
209 |
-
text = ""
|
210 |
try:
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
|
|
|
|
|
|
227 |
except Exception as e:
|
228 |
-
raise ValueError(f"
|
229 |
-
return text
|
230 |
|
231 |
def extract_text_with_ocr(file_path: str) -> str:
|
232 |
try:
|
@@ -406,47 +407,56 @@ class TranscriptParser:
|
|
406 |
return None
|
407 |
|
408 |
def _parse_simplified_transcript(self, text: str) -> Dict:
|
409 |
-
"""Fallback simplified transcript parser
|
410 |
-
|
411 |
-
'
|
412 |
-
'
|
413 |
-
|
414 |
-
|
415 |
-
# Extract student information
|
416 |
-
name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
|
417 |
-
if name_match:
|
418 |
-
parsed_data['student_info']['name'] = name_match.group(1).strip()
|
419 |
-
|
420 |
-
id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
|
421 |
-
if id_match:
|
422 |
-
parsed_data['student_info']['id'] = id_match.group(1).strip()
|
423 |
-
|
424 |
-
gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
|
425 |
-
if gpa_match:
|
426 |
-
parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
|
427 |
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
'
|
435 |
-
|
436 |
-
|
437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
438 |
|
439 |
-
|
440 |
|
441 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
442 |
"""Process transcript file and return simple confirmation"""
|
443 |
try:
|
444 |
if not file_obj:
|
445 |
-
raise
|
446 |
|
447 |
validate_file(file_obj)
|
448 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
if progress:
|
451 |
progress(0.2, desc="Extracting text from file...")
|
452 |
|
@@ -476,8 +486,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
|
|
476 |
except Exception as e:
|
477 |
error_msg = f"Error processing transcript: {str(e)}"
|
478 |
logging.error(error_msg)
|
479 |
-
|
480 |
-
return error_msg, None
|
481 |
|
482 |
# ========== LEARNING STYLE QUIZ ==========
|
483 |
class LearningStyleQuiz:
|
@@ -866,11 +875,14 @@ def create_interface():
|
|
866 |
.completed-tab { background: #4CAF50 !important; color: white !important; }
|
867 |
.incomplete-tab { background: #E0E0E0 !important; }
|
868 |
.nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
|
869 |
-
.file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; }
|
|
|
870 |
.progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
|
871 |
.quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
|
872 |
.quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
|
873 |
.error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
|
|
|
|
|
874 |
|
875 |
.dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
|
876 |
.dark .quiz-question { background-color: #3d3d3d !important; }
|
@@ -927,10 +939,21 @@ def create_interface():
|
|
927 |
transcript_output = gr.Textbox(
|
928 |
label="Analysis Results",
|
929 |
lines=5,
|
930 |
-
interactive=False
|
|
|
931 |
)
|
932 |
transcript_data = gr.State()
|
933 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
934 |
upload_btn.click(
|
935 |
fn=parse_transcript,
|
936 |
inputs=[file_input, tab_completed],
|
@@ -1143,20 +1166,22 @@ def create_interface():
|
|
1143 |
if tab_index <= current_tab:
|
1144 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
1145 |
|
1146 |
-
|
1147 |
-
|
1148 |
-
|
1149 |
-
|
1150 |
-
|
1151 |
-
|
1152 |
-
|
1153 |
-
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
|
|
|
|
|
|
1158 |
)
|
1159 |
-
)
|
1160 |
|
1161 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
1162 |
|
|
|
188 |
for page in doc:
|
189 |
text += page.get_text("text") + '\n'
|
190 |
if not text.strip():
|
191 |
+
logging.warning("PyMuPDF returned empty text, trying OCR fallback...")
|
192 |
text = extract_text_from_pdf_with_ocr(file_path)
|
193 |
|
194 |
elif file_ext in ['.png', '.jpg', '.jpeg']:
|
|
|
203 |
|
204 |
except Exception as e:
|
205 |
logging.error(f"Text extraction error: {str(e)}")
|
206 |
+
raise gr.Error(f"Failed to extract text: {str(e)}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
|
207 |
|
208 |
def extract_text_from_pdf_with_ocr(file_path: str) -> str:
    """Extract text from a PDF by rendering each page to an image and running OCR.

    Every page is rendered at 300 DPI with pdf2image, converted to grayscale,
    binarised at a fixed threshold of 140, and passed to Tesseract. A page is
    kept only when its stripped OCR output is longer than 20 characters. A page
    whose OCR call raises is logged as a warning and skipped, so the remaining
    pages are still processed (there is no retry).

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The kept page texts concatenated in page order, each prefixed with
        ``PAGE <n>:``, or the literal string ``"No readable text found"`` when
        no page produced usable text.

    Raises:
        ValueError: If pdf2image cannot be imported or the PDF cannot be
            converted to images. The underlying exception is attached as
            ``__cause__``.
    """
    try:
        # Local import inside the try: a missing pdf2image surfaces as the
        # ValueError below instead of an ImportError.
        import pdf2image

        images = pdf2image.convert_from_path(file_path, dpi=300)

        # '+' is whitelisted so grades such as "B+" survive OCR; the transcript
        # parser's grade pattern ([A-F][+-]?) expects it.
        custom_config = (
            r'--oem 3 --psm 6 -c tessedit_char_whitelist='
            r'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;()-/+ '
        )

        page_chunks = []
        for page_number, img in enumerate(images, start=1):
            # Pre-process image: grayscale, then hard black/white threshold.
            img = img.convert('L')
            img = img.point(lambda x: 0 if x < 140 else 255)

            # Best-effort OCR: one failing page must not abort the whole document.
            try:
                page_text = pytesseract.image_to_string(img, config=custom_config)
            except Exception as e:
                logging.warning(f"OCR failed on page {page_number}: {str(e)}")
                continue

            if len(page_text.strip()) > 20:  # Minimum viable text
                page_chunks.append(f"PAGE {page_number}:\n{page_text}\n\n")

        text = "".join(page_chunks)
        return text if text else "No readable text found"
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise ValueError(f"OCR processing failed: {str(e)}") from e
|
|
|
231 |
|
232 |
def extract_text_with_ocr(file_path: str) -> str:
|
233 |
try:
|
|
|
407 |
return None
|
408 |
|
409 |
def _parse_simplified_transcript(self, text: str) -> Dict:
    """Fallback transcript parser that tries progressively looser patterns.

    Three strategies are attempted in order, and the first one that yields at
    least one course wins:

    1. table   - locate a "Course/Subject Code ... Grade ... Credits" header and
                 parse course rows from the text that follows it, up to the
                 next blank line or the end of the text.
    2. line    - scan the whole text for "<code> <description> <grade> <credits>".
    3. minimal - scan the whole text for "<label> <grade> <credits>", where the
                 label is whatever precedes the grade.

    Args:
        text: Raw transcript text.

    Returns:
        ``{'course_history': [...]}`` where each entry has the keys
        ``course_code``, ``description``, ``grade`` and ``credits`` (float).
        For the minimal strategy the label is stored in ``course_code`` and
        ``description`` is an empty string.

    Raises:
        ValueError: If ``text`` is not a string or no strategy finds a course.
    """
    not_found = "Could not identify course information in transcript"
    if not isinstance(text, str):
        # Non-string input cannot match any pattern; report it the same way.
        raise ValueError(not_found)

    table_block = r'(?:Course|Subject)\s*Code.*?Grade.*?Credits(.*?)(?:\n\s*\n|\Z)'
    course_line = r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)'
    minimal_line = r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)'

    # Strategy 1: rows inside a recognised table section.
    table_match = re.search(table_block, text, re.DOTALL)
    rows = re.findall(course_line, table_match.group(1)) if table_match else []

    # Strategy 2: the same row shape anywhere in the text.
    if not rows:
        rows = re.findall(course_line, text)

    if rows:
        return {
            'course_history': [
                {
                    'course_code': code.strip(),
                    'description': description.strip(),
                    'grade': grade.strip(),
                    'credits': float(credit_value),
                }
                for code, description, grade, credit_value in rows
            ]
        }

    # Strategy 3: three-field rows. These have their own layout, so the grade
    # and credits are unpacked by name instead of reusing the four-field mapping.
    minimal_rows = re.findall(minimal_line, text)
    if minimal_rows:
        return {
            'course_history': [
                {
                    'course_code': label.strip(),
                    'description': '',
                    'grade': grade.strip(),
                    'credits': float(credit_value),
                }
                for label, grade, credit_value in minimal_rows
            ]
        }

    raise ValueError(not_found)
|
442 |
|
443 |
def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
|
444 |
"""Process transcript file and return simple confirmation"""
|
445 |
try:
|
446 |
if not file_obj:
|
447 |
+
raise gr.Error("Please upload a transcript file first (PDF or image)")
|
448 |
|
449 |
validate_file(file_obj)
|
450 |
file_ext = os.path.splitext(file_obj.name)[1].lower()
|
451 |
|
452 |
+
# Additional PDF validation
|
453 |
+
if file_ext == '.pdf':
|
454 |
+
try:
|
455 |
+
with open(file_obj.name, 'rb') as f:
|
456 |
+
PdfReader(f) # Test if PDF is readable
|
457 |
+
except Exception as e:
|
458 |
+
raise gr.Error(f"Invalid PDF file: {str(e)}. Please upload a non-corrupted PDF.")
|
459 |
+
|
460 |
if progress:
|
461 |
progress(0.2, desc="Extracting text from file...")
|
462 |
|
|
|
486 |
except Exception as e:
|
487 |
error_msg = f"Error processing transcript: {str(e)}"
|
488 |
logging.error(error_msg)
|
489 |
+
raise gr.Error(f"{error_msg}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
|
|
|
490 |
|
491 |
# ========== LEARNING STYLE QUIZ ==========
|
492 |
class LearningStyleQuiz:
|
|
|
875 |
.completed-tab { background: #4CAF50 !important; color: white !important; }
|
876 |
.incomplete-tab { background: #E0E0E0 !important; }
|
877 |
.nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
|
878 |
+
.file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; text-align: center; }
|
879 |
+
.file-upload:hover { background: #f5f5f5; }
|
880 |
.progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
|
881 |
.quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
|
882 |
.quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
|
883 |
.error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
|
884 |
+
.transcript-results { border-left: 4px solid #4CAF50 !important; padding: 15px !important; background: #f8f8f8 !important; }
|
885 |
+
.error-box { border: 1px solid #ff4444 !important; background: #fff8f8 !important; }
|
886 |
|
887 |
.dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
|
888 |
.dark .quiz-question { background-color: #3d3d3d !important; }
|
|
|
939 |
transcript_output = gr.Textbox(
|
940 |
label="Analysis Results",
|
941 |
lines=5,
|
942 |
+
interactive=False,
|
943 |
+
elem_classes="transcript-results"
|
944 |
)
|
945 |
transcript_data = gr.State()
|
946 |
|
947 |
+
file_input.change(
|
948 |
+
fn=lambda f: (
|
949 |
+
gr.update(visible=False),
|
950 |
+
gr.update(value="File ready for analysis!", visible=True) if f
|
951 |
+
else gr.update(value="Please upload a file", visible=False)
|
952 |
+
),
|
953 |
+
inputs=file_input,
|
954 |
+
outputs=[file_error, transcript_output]
|
955 |
+
)
|
956 |
+
|
957 |
upload_btn.click(
|
958 |
fn=parse_transcript,
|
959 |
inputs=[file_input, tab_completed],
|
|
|
1166 |
if tab_index <= current_tab:
|
1167 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
1168 |
|
1169 |
+
# Check all previous tabs are completed
|
1170 |
+
for i in range(tab_index):
|
1171 |
+
if not tab_completed_status.get(i, False):
|
1172 |
+
messages = [
|
1173 |
+
"Please complete the transcript analysis first",
|
1174 |
+
"Please complete the learning style quiz first",
|
1175 |
+
"Please fill out your personal information first",
|
1176 |
+
"Please save your profile first"
|
1177 |
+
]
|
1178 |
+
return (
|
1179 |
+
gr.Tabs(selected=i),
|
1180 |
+
gr.update(
|
1181 |
+
value=f"<div class='error-message'>⛔ {messages[i]}</div>",
|
1182 |
+
visible=True
|
1183 |
+
)
|
1184 |
)
|
|
|
1185 |
|
1186 |
return gr.Tabs(selected=tab_index), gr.update(visible=False)
|
1187 |
|