Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on May 18

Commit

f8e1794

verified ·

1 Parent(s): a0e5ea9

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -68

app.py CHANGED Viewed

@@ -188,7 +188,7 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
-                    raise ValueError("PyMuPDF returned empty text, trying OCR fallback...")
                     text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
@@ -203,30 +203,31 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
     except Exception as e:
         logging.error(f"Text extraction error: {str(e)}")
-        raise gr.Error(f"Failed to extract text: {str(e)}")
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
-    text = ""
     try:
-        # Try pdf2image first if available
-        try:
-            from pdf2image import convert_from_path
-            images = convert_from_path(file_path)
-            for img in images:
-                text += pytesseract.image_to_string(img) + '\n'
-            return text
-        except ImportError:
-            # Fallback to PyMuPDF for image extraction
-            doc = fitz.open(file_path)
-            for page in doc:
-                pix = page.get_pixmap()
-                img = Image.open(io.BytesIO(pix.tobytes()))
-                img = img.convert('L')
-                img = img.point(lambda x: 0 if x < 128 else 255)
-                text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
     except Exception as e:
-        raise ValueError(f"PDF OCR failed: {str(e)}")
-    return text
 def extract_text_with_ocr(file_path: str) -> str:
     try:
@@ -406,47 +407,56 @@ class TranscriptParser:
             return None
     def _parse_simplified_transcript(self, text: str) -> Dict:
-        """Fallback simplified transcript parser that extracts key information"""
-        parsed_data = {
-            'student_info': {},
-            'course_history': []
-        }
-        # Extract student information
-        name_match = re.search(r'(?:Name|Student)[:\s]+([A-Za-z,\s]+)', text, re.IGNORECASE)
-        if name_match:
-            parsed_data['student_info']['name'] = name_match.group(1).strip()
-        id_match = re.search(r'(?:ID|Student\s*ID)[:\s]+([A-Za-z0-9-]+)', text, re.IGNORECASE)
-        if id_match:
-            parsed_data['student_info']['id'] = id_match.group(1).strip()
-        gpa_match = re.search(r'(?:GPA|Grade\s*Point\s*Average)[:\s]+([0-9.]+)', text, re.IGNORECASE)
-        if gpa_match:
-            parsed_data['student_info']['gpa'] = float(gpa_match.group(1))
-        # Extract courses (simplified pattern)
-        course_pattern = r'([A-Z]{2,4}\s?\d{3})\s+(.*?)\s+([A-F][+-]?)\s+([0-9.]+)'
-        courses = re.findall(course_pattern, text)
-        for course in courses:
-            parsed_data['course_history'].append({
-                'course_code': course[0],
-                'description': course[1],
-                'grade': course[2],
-                'credits': float(course[3])
-            })
-        return parsed_data
 def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
     """Process transcript file and return simple confirmation"""
     try:
         if not file_obj:
-            raise ValueError("Please upload a file first")
         validate_file(file_obj)
         file_ext = os.path.splitext(file_obj.name)[1].lower()
         if progress:
             progress(0.2, desc="Extracting text from file...")
@@ -476,8 +486,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
     except Exception as e:
         error_msg = f"Error processing transcript: {str(e)}"
         logging.error(error_msg)
-        # Return more detailed error to user
-        return error_msg, None
 # ========== LEARNING STYLE QUIZ ==========
 class LearningStyleQuiz:
@@ -866,11 +875,14 @@ def create_interface():
         .completed-tab { background: #4CAF50 !important; color: white !important; }
         .incomplete-tab { background: #E0E0E0 !important; }
         .nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
-        .file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; }
         .progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
         .quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
         .quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
         .error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
         .dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
         .dark .quiz-question { background-color: #3d3d3d !important; }
@@ -927,10 +939,21 @@ def create_interface():
                         transcript_output = gr.Textbox(
                             label="Analysis Results",
                             lines=5,
-                            interactive=False
                         )
                         transcript_data = gr.State()
                 upload_btn.click(
                     fn=parse_transcript,
                     inputs=[file_input, tab_completed],
@@ -1143,20 +1166,22 @@ def create_interface():
             if tab_index <= current_tab:
                 return gr.Tabs(selected=tab_index), gr.update(visible=False)
-            if not tab_completed_status.get(current_tab, False):
-                messages = {
-                    0: "Please complete the transcript analysis first.",
-                    1: "Please complete the learning style quiz first.",
-                    2: "Please fill out your personal information first.",
-                    3: "Please save your profile first."
-                }
-                return (
-                    gr.Tabs(selected=current_tab),
-                    gr.update(
-                        value=f"<div class='error-message'>⚠️ {messages.get(current_tab, 'Please complete this step first')}</div>",
-                        visible=True
                     )
-                )
             return gr.Tabs(selected=tab_index), gr.update(visible=False)

                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
+                    logging.warning("PyMuPDF returned empty text, trying OCR fallback...")
                     text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
     except Exception as e:
         logging.error(f"Text extraction error: {str(e)}")
+        raise gr.Error(f"Failed to extract text: {str(e)}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
     """Rasterise a PDF with pdf2image and OCR every page with Tesseract.

     Each page is rendered at 300 dpi, converted to grayscale and binarised
     at a threshold of 140 before being passed to ``pytesseract``. Pages
     whose stripped OCR output is 20 characters or shorter are skipped.
     A page on which OCR raises is logged as a warning and skipped; it does
     not abort the remaining pages.

     Args:
         file_path: Path of the PDF file to process.

     Returns:
         The accepted pages concatenated, each prefixed with ``PAGE <n>:``.
         If no page yields usable text, the literal string
         ``"No readable text found"`` is returned.
         NOTE(review): callers cannot tell this sentinel apart from real
         OCR output -- confirm downstream code checks for it.

     Raises:
         ValueError: If pdf2image cannot be imported, the PDF cannot be
             rasterised, or any other unexpected error occurs. The
             original exception is attached as ``__cause__``.
     """
     try:
         # Imported lazily so a missing pdf2image surfaces as the ValueError below.
         import pdf2image

         images = pdf2image.convert_from_path(file_path, dpi=300)
         custom_config = r'--oem 3 --psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;()-/ '
         page_chunks = []
         for page_number, img in enumerate(images, start=1):
             # Pre-process image: grayscale, then hard threshold to raise contrast.
             img = img.convert('L')
             img = img.point(lambda x: 0 if x < 140 else 255)
             # OCR is attempted once per page; a failure skips that page only.
             try:
                 page_text = pytesseract.image_to_string(img, config=custom_config)
             except Exception as e:
                 logging.warning(f"OCR failed on page {page_number}: {str(e)}")
                 continue
             if len(page_text.strip()) > 20:  # Minimum viable text
                 page_chunks.append(f"PAGE {page_number}:\n{page_text}\n\n")
         text = "".join(page_chunks)
         return text if text else "No readable text found"
     except Exception as e:
         # Chain the cause so the underlying failure stays visible in tracebacks.
         raise ValueError(f"OCR processing failed: {str(e)}") from e
 def extract_text_with_ocr(file_path: str) -> str:
     try:
             return None
     def _parse_simplified_transcript(self, text: str) -> Dict:
+        """Fallback simplified transcript parser with multiple pattern attempts"""
+        patterns = [
+            (r'(?:Course|Subject)\s*Code.*?Grade.*?Credits(.*?)(?:\n\s*\n|\Z)', 'table'),
+            (r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'line'),
+            (r'(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)', 'minimal')
+        ]
+        for pattern, pattern_type in patterns:
+            try:
+                if pattern_type == 'table':
+                    # Parse tabular data
+                    courses = re.findall(r'([A-Z]{2,4}\s?\d{3}[A-Z]?)\s+(.*?)\s+([A-F][+-]?)\s+(\d+\.?\d*)',
+                                       re.search(pattern, text, re.DOTALL).group(1))
+                elif pattern_type == 'line':
+                    courses = re.findall(pattern, text)
+                else:
+                    courses = re.findall(pattern, text)
+                if courses:
+                    parsed_data = {'course_history': []}
+                    for course in courses:
+                        parsed_data['course_history'].append({
+                            'course_code': course[0].strip(),
+                            'description': course[1].strip() if len(course) > 1 else '',
+                            'grade': course[2].strip() if len(course) > 2 else '',
+                            'credits': float(course[3]) if len(course) > 3 else 0.0
+                        })
+                    return parsed_data
+            except:
+                continue
+        raise ValueError("Could not identify course information in transcript")
 def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
     """Process transcript file and return simple confirmation"""
     try:
         if not file_obj:
+            raise gr.Error("Please upload a transcript file first (PDF or image)")
         validate_file(file_obj)
         file_ext = os.path.splitext(file_obj.name)[1].lower()
+        # Additional PDF validation
+        if file_ext == '.pdf':
+            try:
+                with open(file_obj.name, 'rb') as f:
+                    PdfReader(f)  # Test if PDF is readable
+            except Exception as e:
+                raise gr.Error(f"Invalid PDF file: {str(e)}. Please upload a non-corrupted PDF.")
         if progress:
             progress(0.2, desc="Extracting text from file...")
     except Exception as e:
         error_msg = f"Error processing transcript: {str(e)}"
         logging.error(error_msg)
+        raise gr.Error(f"{error_msg}\n\nPossible solutions:\n1. Try a different file format\n2. Ensure text is clear and not handwritten\n3. Check file size (<5MB)")
 # ========== LEARNING STYLE QUIZ ==========
 class LearningStyleQuiz:
         .completed-tab { background: #4CAF50 !important; color: white !important; }
         .incomplete-tab { background: #E0E0E0 !important; }
         .nav-message { padding: 10px; margin: 10px 0; border-radius: 4px; background-color: #ffebee; color: #c62828; }
+        .file-upload { border: 2px dashed #4CAF50 !important; padding: 20px !important; border-radius: 8px !important; text-align: center; }
+        .file-upload:hover { background: #f5f5f5; }
         .progress-bar { height: 5px; background: linear-gradient(to right, #4CAF50, #8BC34A); margin-bottom: 15px; border-radius: 3px; }
         .quiz-question { margin-bottom: 15px; padding: 15px; background: #f5f5f5; border-radius: 5px; }
         .quiz-results { margin-top: 20px; padding: 20px; background: #e8f5e9; border-radius: 8px; }
         .error-message { color: #d32f2f; background-color: #ffebee; padding: 10px; border-radius: 4px; margin: 10px 0; }
+        .transcript-results { border-left: 4px solid #4CAF50 !important; padding: 15px !important; background: #f8f8f8 !important; }
+        .error-box { border: 1px solid #ff4444 !important; background: #fff8f8 !important; }
         .dark .tab-content { background-color: #2d2d2d !important; border-color: #444 !important; }
         .dark .quiz-question { background-color: #3d3d3d !important; }
                         transcript_output = gr.Textbox(
                             label="Analysis Results",
                             lines=5,
+                            interactive=False,
+                            elem_classes="transcript-results"
                         )
                         transcript_data = gr.State()
+                file_input.change(
+                    fn=lambda f: (
+                        gr.update(visible=False),
+                        gr.update(value="File ready for analysis!", visible=True) if f
+                        else gr.update(value="Please upload a file", visible=False)
+                    ),
+                    inputs=file_input,
+                    outputs=[file_error, transcript_output]
+                )
                 upload_btn.click(
                     fn=parse_transcript,
                     inputs=[file_input, tab_completed],
             if tab_index <= current_tab:
                 return gr.Tabs(selected=tab_index), gr.update(visible=False)
+            # Check all previous tabs are completed
+            for i in range(tab_index):
+                if not tab_completed_status.get(i, False):
+                    messages = [
+                        "Please complete the transcript analysis first",
+                        "Please complete the learning style quiz first",
+                        "Please fill out your personal information first",
+                        "Please save your profile first"
+                    ]
+                    return (
+                        gr.Tabs(selected=i),
+                        gr.update(
+                            value=f"<div class='error-message'>⛔ {messages[i]}</div>",
+                            visible=True
+                        )
                     )
             return gr.Tabs(selected=tab_index), gr.update(visible=False)