Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on May 18

Commit

a0e5ea9

verified ·

1 Parent(s): 5c437e2

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -15

app.py CHANGED Viewed

@@ -175,14 +175,21 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
     try:
         if file_ext == '.pdf':
             try:
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
-                    raise ValueError("PyMuPDF returned empty text")
-            except Exception as e:
-                logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
-                text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)
@@ -201,13 +208,22 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
     text = ""
     try:
-        doc = fitz.open(file_path)
-        for page in doc:
-            pix = page.get_pixmap()
-            img = Image.open(io.BytesIO(pix.tobytes()))
-            img = img.convert('L')
-            img = img.point(lambda x: 0 if x < 128 else 255)
-            text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
     except Exception as e:
         raise ValueError(f"PDF OCR failed: {str(e)}")
     return text
@@ -434,18 +450,23 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
         if progress:
             progress(0.2, desc="Extracting text from file...")
-        text = extract_text_from_file(file_obj.name, file_ext)
         if not text.strip():
-            raise ValueError("No text could be extracted from the file.")
         if progress:
             progress(0.5, desc="Parsing transcript...")
         parser = TranscriptParser()
-        parsed_data = parser.parse_transcript(text)
-        # Return simple confirmation message
         confirmation = "Transcript processed successfully."
         if 'gpa' in parsed_data.get('student_info', {}):
             confirmation += f"\nGPA detected: {parsed_data['student_info']['gpa']}"
@@ -455,6 +476,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
     except Exception as e:
         error_msg = f"Error processing transcript: {str(e)}"
         logging.error(error_msg)
         return error_msg, None
 # ========== LEARNING STYLE QUIZ ==========

     try:
         if file_ext == '.pdf':
             try:
+                # First try pdfplumber for better table extraction
+                import pdfplumber
+                with pdfplumber.open(file_path) as pdf:
+                    for page in pdf.pages:
+                        text += page.extract_text() + '\n'
+                if not text.strip():
+                    raise ValueError("PDFPlumber returned empty text")
+            except Exception as e:
+                logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
+                    raise ValueError("PyMuPDF returned empty text, trying OCR fallback...")
+                    text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)
 def extract_text_from_pdf_with_ocr(file_path: str) -> str:
     text = ""
     try:
+        # Try pdf2image first if available
+        try:
+            from pdf2image import convert_from_path
+            images = convert_from_path(file_path)
+            for img in images:
+                text += pytesseract.image_to_string(img) + '\n'
+            return text
+        except ImportError:
+            # Fallback to PyMuPDF for image extraction
+            doc = fitz.open(file_path)
+            for page in doc:
+                pix = page.get_pixmap()
+                img = Image.open(io.BytesIO(pix.tobytes()))
+                img = img.convert('L')
+                img = img.point(lambda x: 0 if x < 128 else 255)
+                text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
     except Exception as e:
         raise ValueError(f"PDF OCR failed: {str(e)}")
     return text
         if progress:
             progress(0.2, desc="Extracting text from file...")
+        try:
+            text = extract_text_from_file(file_obj.name, file_ext)
+        except Exception as e:
+            raise ValueError(f"Failed to extract text: {str(e)}. The file may be corrupted or in an unsupported format.")
         if not text.strip():
+            raise ValueError("The file appears to be empty or contains no readable text.")
         if progress:
             progress(0.5, desc="Parsing transcript...")
         parser = TranscriptParser()
+        try:
+            parsed_data = parser.parse_transcript(text)
+        except Exception as e:
+            raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
         confirmation = "Transcript processed successfully."
         if 'gpa' in parsed_data.get('student_info', {}):
             confirmation += f"\nGPA detected: {parsed_data['student_info']['gpa']}"
     except Exception as e:
         error_msg = f"Error processing transcript: {str(e)}"
         logging.error(error_msg)
+        # Return more detailed error to user
         return error_msg, None
 # ========== LEARNING STYLE QUIZ ==========