Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Community

Dannyar608 committed on May 16

Commit

4ed126e

verified ·

1 Parent(s): 7c4445e

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -188

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ import asyncio
 from functools import lru_cache
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
@@ -196,16 +197,20 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
     try:
         if file_ext == '.pdf':
-            # First try PyMuPDF for better text extraction
             try:
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
-                    raise ValueError("PyMuPDF returned empty text - the PDF may be image-based")
-            except Exception as e:
-                logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
-                text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)
@@ -293,58 +298,20 @@ class TranscriptParser:
         self.current_courses = []
         self.course_history = []
         self.graduation_status = {}
-        self.supported_formats = {
-            'miami_dade': self.parse_miami_dade,
-            'standard': self.parse_standard,
-            'homeschool': self.parse_homeschool
-        }
     def parse_transcript(self, text: str) -> Dict:
-        """Enhanced parsing method with format detection"""
         try:
-            # First normalize the text (replace multiple spaces, normalize line breaks)
-            text = re.sub(r'\s+', ' ', text)
-            # Detect transcript format
-            format_type = self.detect_format(text)
-            # Parse based on detected format
-            if format_type in self.supported_formats:
-                return self.supported_formats[format_type](text)
-            else:
-                # Fallback to standard parsing
-                return self.parse_standard(text)
-        except Exception as e:
-            logging.error(f"Error parsing transcript: {str(e)}")
-            raise gr.Error(f"Error parsing transcript: {str(e)}\n\nThis may be due to an unsupported transcript format. Please ensure you're uploading an official transcript or contact support.")
-    def detect_format(self, text: str) -> str:
-        """Detect the transcript format"""
-        # Check for Miami-Dade specific patterns
-        if re.search(r'MIAMI-DADE (COUNTY|COUNTRY) PUBLIC SCHOOLS', text, re.IGNORECASE):
-            return 'miami_dade'
-        # Check for homeschool patterns
-        elif re.search(r'homeschool|home education|parent signature', text, re.IGNORECASE):
-            return 'homeschool'
-        # Default to standard format
-        return 'standard'
-    def parse_miami_dade(self, text: str) -> Dict:
-        """Parse Miami-Dade formatted transcripts with enhanced error handling"""
-        try:
-            # Extract student info with more robust patterns
             student_match = re.search(
-                r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+).*?Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
-                r".*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
-                r".*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
                 text, re.DOTALL
             )
             if student_match:
                 self.student_data = {
-                    "id": student_match.group(1),
-                    "name": student_match.group(2).replace(",", ", "),
                     "current_grade": student_match.group(3),
                     "graduation_year": student_match.group(4),
                     "unweighted_gpa": float(student_match.group(5)),
@@ -352,59 +319,51 @@ class TranscriptParser:
                     "total_credits": float(student_match.group(7)),
                     "community_service_hours": int(student_match.group(8))
                 }
-            # Extract requirements with better table parsing
             req_section = re.search(
-                r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status(.*?)Total\s*[\d.]+\s*[\d.]+\s*[\d.]+\s*[\d.]+%",
                 text, re.DOTALL
             )
             if req_section:
-                req_matches = re.finditer(
-                    r"([A-Z]-[\w\s\(\)&]+)\s*([^\n]+?)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%",
-                    req_section.group(1)
-                )
-                for match in req_matches:
-                    req_code = match.group(1).strip()
-                    self.requirements[req_code] = {
-                        "description": match.group(2).strip(),
-                        "required": float(match.group(3)),
-                        "waived": float(match.group(4)),
-                        "completed": float(match.group(5)),
-                        "status": f"{match.group(6)}%"
-                    }
-            # Extract course history with more flexible parsing
-            course_section = re.search(
-                r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description(.*?)(?=Legend for Incl:|$)",
-                text, re.DOTALL
             )
-            if course_section:
-                course_matches = re.finditer(
-                    r"([A-Z]-[\w\s\(\)&-]+)\s*(\d{4}-\d{4}|\d{1,2})\s*(\d{2})\s*([A-Z0-9]+)\s*([^\n]+?)\s*([A-Z0-9]+)\s*([A-Z0-9]+)\s*([A-Z])\s*([A-Z])\s*([\d.]+|inProgress)",
-                    course_section.group(1)
-                )
-                for match in course_matches:
-                    self.course_history.append({
-                        "requirement_category": match.group(1).strip(),
-                        "school_year": match.group(2),
-                        "grade_level": match.group(3),
-                        "course_code": match.group(4),
-                        "description": match.group(5).strip(),
-                        "term": match.group(6),
-                        "district_number": match.group(7),
-                        "grade": match.group(8),
-                        "inclusion_status": match.group(9),
-                        "credits": match.group(10)
-                    })
-            # Identify current courses
             self._extract_current_courses()
             self._calculate_completion()
             return {
                 "student_info": self.student_data,
                 "requirements": self.requirements,
@@ -413,85 +372,13 @@ class TranscriptParser:
                 "graduation_status": self.graduation_status,
                 "format": "miami_dade"
             }
-        except Exception as e:
-            logging.error(f"Error parsing Miami-Dade transcript: {str(e)}")
-            raise ValueError(f"Couldn't parse transcript. Please ensure it's a valid Miami-Dade transcript. Error: {str(e)}")
-    def parse_standard(self, text: str) -> Dict:
-        """Parse standard formatted transcripts"""
-        # Extract student info
-        student_match = re.search(r"Student:\s*([^\n]+)", text, re.IGNORECASE)
-        if student_match:
-            self.student_data["name"] = student_match.group(1).strip()
-        # Extract courses - looking for a table-like structure
-        course_pattern = r"(?P<year>\d{4}-\d{4}|\d{1,2})\s+(?P<subject>\w+)\s+(?P<code>\w+)\s+(?P<title>[^\n]+)\s+(?P<grade>[A-F][+-]?)\s+(?P<credit>\d\.\d)"
-        course_matches = re.finditer(course_pattern, text)
-        for match in course_matches:
-            self.course_history.append({
-                "school_year": match.group("year"),
-                "subject": match.group("subject"),
-                "course_code": match.group("code"),
-                "description": match.group("title").strip(),
-                "grade": match.group("grade"),
-                "credits": match.group("credit")
-            })
-        # Extract GPA info
-        gpa_pattern = r"GPA\s*([\d.]+)\s*/\s*([\d.]+)"
-        gpa_match = re.search(gpa_pattern, text)
-        if gpa_match:
-            self.student_data.update({
-                "unweighted_gpa": float(gpa_match.group(1)),
-                "weighted_gpa": float(gpa_match.group(2))
-            })
-        return {
-            "student_info": self.student_data,
-            "course_history": self.course_history,
-            "format": "standard"
-        }
-    def parse_homeschool(self, text: str) -> Dict:
-        """Parse homeschool formatted transcripts"""
-        # Extract student info
-        name_match = re.search(r"Student:\s*([^\n]+)", text, re.IGNORECASE)
-        if name_match:
-            self.student_data["name"] = name_match.group(1).strip()
-        # Extract homeschool-specific info
-        parent_match = re.search(r"Parent:\s*([^\n]+)", text, re.IGNORECASE)
-        if parent_match:
-            self.student_data["parent"] = parent_match.group(1).strip()
-        # Extract courses - homeschool format often has simpler tables
-        course_pattern = r"(?P<subject>\w+)\s+(?P<title>[^\n]+?)\s+(?P<date>\w+-\d{4})\s+(?P<grade>[A-F][+-]?)\s+(?P<credit>\d\.\d)"
-        course_matches = re.finditer(course_pattern, text)
-        for match in course_matches:
-            self.course_history.append({
-                "subject": match.group("subject"),
-                "description": match.group("title").strip(),
-                "completion_date": match.group("date"),
-                "grade": match.group("grade"),
-                "credits": match.group("credit")
-            })
-        # Extract GPA info
-        gpa_match = re.search(r"Cumulative GPA:\s*([\d.]+)", text, re.IGNORECASE)
-        if gpa_match:
-            self.student_data["gpa"] = float(gpa_match.group(1))
-        return {
-            "student_info": self.student_data,
-            "course_history": self.course_history,
-            "format": "homeschool"
-        }
     def _extract_current_courses(self):
-        """Identify courses currently in progress"""
         self.current_courses = [
             {
                 "course": c["description"],
@@ -501,32 +388,21 @@ class TranscriptParser:
                 "credits": c["credits"],
                 "grade_level": c["grade_level"]
             }
-            for c in self.course_history
-            if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
         ]
     def _calculate_completion(self):
-        """Calculate overall completion status with more detailed info"""
         total_required = sum(req["required"] for req in self.requirements.values())
         total_completed = sum(req["completed"] for req in self.requirements.values())
         self.graduation_status.update({
             "total_required_credits": total_required,
             "total_completed_credits": total_completed,
-            "percent_complete": round((total_completed / total_required) * 100, 1),
             "remaining_credits": total_required - total_completed,
-            "on_track": (total_completed / total_required) >= 0.75  # 75% completion considered on track
         })
-    def to_json(self) -> str:
-        """Export parsed data as JSON"""
-        return json.dumps({
-            "student_info": self.student_data,
-            "requirements": self.requirements,
-            "current_courses": self.current_courses,
-            "course_history": self.course_history,
-            "graduation_status": self.graduation_status
-        }, indent=2)
 def format_transcript_output(data: Dict) -> str:
     """Enhanced formatting for transcript output with format awareness"""

 from functools import lru_cache
 import hashlib
 from concurrent.futures import ThreadPoolExecutor
+import pdfplumber
 # ========== CONFIGURATION ==========
 PROFILES_DIR = "student_profiles"
     try:
         if file_ext == '.pdf':
+            # First try pdfplumber for better text extraction
             try:
+                with pdfplumber.open(file_path) as pdf:
+                    text = "\n".join([page.extract_text() for page in pdf.pages])
+                if not text.strip():
+                    raise ValueError("pdfplumber returned empty text - the PDF may be image-based")
+            except Exception as e:
+                logging.warning(f"pdfplumber failed: {str(e)}. Trying PyMuPDF fallback...")
                 doc = fitz.open(file_path)
                 for page in doc:
                     text += page.get_text("text") + '\n'
                 if not text.strip():
+                    raise ValueError("PyMuPDF returned empty text - trying OCR fallback...")
+                    text = extract_text_from_pdf_with_ocr(file_path)
         elif file_ext in ['.png', '.jpg', '.jpeg']:
             text = extract_text_with_ocr(file_path)
         self.current_courses = []
         self.course_history = []
         self.graduation_status = {}
     def parse_transcript(self, text: str) -> Dict:
+        """Parse Miami-Dade formatted transcripts with updated regex patterns."""
         try:
+            # Extract student info
             student_match = re.search(
+                r"(\d{7})\s*-\s*([A-Z\s,]+).*?Current Grade:\s*(\d+).*?YOG\s*(\d{4}).*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+).*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
                 text, re.DOTALL
             )
             if student_match:
                 self.student_data = {
+                    "id": student_match.group(1).strip(),
+                    "name": student_match.group(2).replace(",", ", ").strip(),
                     "current_grade": student_match.group(3),
                     "graduation_year": student_match.group(4),
                     "unweighted_gpa": float(student_match.group(5)),
                     "total_credits": float(student_match.group(7)),
                     "community_service_hours": int(student_match.group(8))
                 }
+            # Extract requirements
+            self.requirements = {}
             req_section = re.search(
+                r"Code Description Required Waived Completed Status(.*?)Total\s+\d+\.\d+\s+\d+\.\d+\s+\d+\.\d+\s+\d+%",
                 text, re.DOTALL
             )
             if req_section:
+                req_lines = req_section.group(1).strip().splitlines()
+                for line in req_lines:
+                    req_match = re.match(r"([A-Z]-[^\s]+)\s+(.+?)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+)%", line.strip())
+                    if req_match:
+                        code = req_match.group(1).strip()
+                        self.requirements[code] = {
+                            "description": req_match.group(2).strip(),
+                            "required": float(req_match.group(3)),
+                            "waived": float(req_match.group(4)),
+                            "completed": float(req_match.group(5)),
+                            "status": f"{req_match.group(6)}%"
+                        }
+            # Extract course history (simplified for now)
+            self.course_history = []
+            course_pattern = re.compile(
+                r"([A-Z]-[^\s]+)\s+(\d{4}-\d{4}|\d{4})\s+(\d{2})\s+([A-Z0-9]+)\s+(.+?)\s+([AT12]+)\s+([A-Z0-9]+)?\s+([A-Z])?\s+([A-Z])?\s+(inProgress|\d+\.\d+)",
+                re.DOTALL
             )
+            for match in course_pattern.finditer(text):
+                self.course_history.append({
+                    "requirement_category": match.group(1),
+                    "school_year": match.group(2),
+                    "grade_level": match.group(3),
+                    "course_code": match.group(4),
+                    "description": match.group(5).strip(),
+                    "term": match.group(6),
+                    "district_number": match.group(7),
+                    "grade": match.group(8),
+                    "inclusion_status": match.group(9),
+                    "credits": match.group(10)
+                })
+            # Extract in-progress
             self._extract_current_courses()
             self._calculate_completion()
             return {
                 "student_info": self.student_data,
                 "requirements": self.requirements,
                 "graduation_status": self.graduation_status,
                 "format": "miami_dade"
             }
+        except Exception as e:
+            logging.error(f"Error parsing transcript: {str(e)}")
+            raise ValueError(f"Couldn't parse transcript: {str(e)}")
     def _extract_current_courses(self):
+        """Identify in-progress courses."""
         self.current_courses = [
             {
                 "course": c["description"],
                 "credits": c["credits"],
                 "grade_level": c["grade_level"]
             }
+            for c in self.course_history if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
         ]
     def _calculate_completion(self):
+        """Compute graduation readiness."""
         total_required = sum(req["required"] for req in self.requirements.values())
         total_completed = sum(req["completed"] for req in self.requirements.values())
         self.graduation_status.update({
             "total_required_credits": total_required,
             "total_completed_credits": total_completed,
+            "percent_complete": round((total_completed / total_required) * 100, 1) if total_required > 0 else 0,
             "remaining_credits": total_required - total_completed,
+            "on_track": (total_completed / total_required) >= 0.75 if total_required > 0 else False
         })
 def format_transcript_output(data: Dict) -> str:
     """Enhanced formatting for transcript output with format awareness"""