Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Community

Dannyar608 committed on May 17

Commit

d3a1938

verified ·

1 Parent(s): 058e198

Update app.py

Browse files

Files changed (1) hide show

app.py +218 -129

app.py CHANGED Viewed

@@ -297,80 +297,107 @@ class TranscriptParser:
     def parse_transcript(self, text: str) -> Dict:
         """Parse Miami-Dade formatted transcripts with updated regex patterns."""
         try:
-            # Extract student info
-            student_match = re.search(
-                r"(\d{7})\s*-\s*([A-Z\s,]+).*?Current Grade:\s*(\d+).*?YOG\s*(\d{4}).*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+).*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
-                text, re.DOTALL
-            )
-            if student_match:
-                self.student_data = {
-                    "id": student_match.group(1).strip(),
-                    "name": student_match.group(2).replace(",", ", ").strip(),
-                    "current_grade": student_match.group(3),
-                    "graduation_year": student_match.group(4),
-                    "unweighted_gpa": float(student_match.group(5)),
-                    "weighted_gpa": float(student_match.group(6)),
-                    "total_credits": float(student_match.group(7)),
-                    "community_service_hours": int(student_match.group(8))
-                }
-            # Extract requirements
-            self.requirements = {}
-            req_section = re.search(
-                r"Code Description Required Waived Completed Status(.*?)Total\s+\d+\.\d+\s+\d+\.\d+\s+\d+\.\d+\s+\d+%",
-                text, re.DOTALL
-            )
-            if req_section:
-                req_lines = req_section.group(1).strip().splitlines()
-                for line in req_lines:
-                    req_match = re.match(r"([A-Z]-[^\s]+)\s+(.+?)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+)%", line.strip())
-                    if req_match:
-                        code = req_match.group(1).strip()
-                        self.requirements[code] = {
-                            "description": req_match.group(2).strip(),
-                            "required": float(req_match.group(3)),
-                            "waived": float(req_match.group(4)),
-                            "completed": float(req_match.group(5)),
-                            "status": f"{req_match.group(6)}%"
-                        }
-            # Extract course history (simplified for now)
-            self.course_history = []
-            course_pattern = re.compile(
-                r"([A-Z]-[^\s]+)\s+(\d{4}-\d{4}|\d{4})\s+(\d{2})\s+([A-Z0-9]+)\s+(.+?)\s+([AT12]+)\s+([A-Z0-9]+)?\s+([A-Z])?\s+([A-Z])?\s+(inProgress|\d+\.\d+)",
-                re.DOTALL
-            )
-            for match in course_pattern.finditer(text):
-                self.course_history.append({
-                    "requirement_category": match.group(1),
-                    "school_year": match.group(2),
-                    "grade_level": match.group(3),
-                    "course_code": match.group(4),
-                    "description": match.group(5).strip(),
-                    "term": match.group(6),
-                    "district_number": match.group(7),
-                    "grade": match.group(8),
-                    "inclusion_status": match.group(9),
-                    "credits": match.group(10)
-                })
-            # Extract in-progress
-            self._extract_current_courses()
-            self._calculate_completion()
-            return {
-                "student_info": self.student_data,
-                "requirements": self.requirements,
-                "current_courses": self.current_courses,
-                "course_history": self.course_history,
-                "graduation_status": self.graduation_status,
-                "format": "miami_dade"
-            }
-        except Exception as e:
-            logging.error(f"Error parsing transcript: {str(e)}")
-            raise ValueError(f"Couldn't parse transcript: {str(e)}")
     def _extract_current_courses(self):
         """Identify in-progress courses."""
@@ -383,7 +410,8 @@ class TranscriptParser:
                 "credits": c["credits"],
                 "grade_level": c["grade_level"]
             }
-            for c in self.course_history if isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
         ]
     def _calculate_completion(self):
@@ -421,8 +449,6 @@ def format_transcript_output(data: Dict) -> str:
         output.append(f"**Total Credits Earned:** {student['total_credits']}")
     if 'community_service_hours' in student:
         output.append(f"**Community Service Hours:** {student['community_service_hours']}")
-    if 'parent' in student:
-        output.append(f"**Parent/Guardian:** {student['parent']}")
     output.append("")
@@ -461,8 +487,8 @@ def format_transcript_output(data: Dict) -> str:
     # Course History by Year
     courses_by_year = defaultdict(list)
     for course in data.get("course_history", []):
-        year_key = course.get("school_year", course.get("completion_date", "Unknown"))
-        courses_by_year[year_key].append(course)
     if courses_by_year:
         output.append("## Course History\n" + '='*50)
@@ -471,7 +497,7 @@ def format_transcript_output(data: Dict) -> str:
             for course in courses_by_year[year]:
                 output.append(
                     f"- **{course.get('course_code', '')} {course.get('description', 'Unnamed course')}**\n"
-                    f"  Subject: {course.get('subject', 'N/A')} | "
                     f"Grade: {course.get('grade', 'N/A')} | "
                     f"Credits: {course.get('credits', 'N/A')}"
                 )
@@ -500,26 +526,63 @@ def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
         return parse_transcript_with_ai_fallback(text, progress)
 def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
-    """Fallback AI parsing method"""
     # Pre-process the text
     text = remove_sensitive_info(text[:15000])  # Limit input size
     prompt = f"""
-    Analyze this academic transcript and extract structured information:
-    - Current grade level
-    - Weighted GPA (if available)
-    - Unweighted GPA (if available)
-    - Total credits earned
-    - Community service hours (if available)
-    - List of all courses with:
-      * Course code
-      * Course name
-      * Grade received
-      * Credits earned
-      * Year/semester taken
-      * Grade level when taken
-    - Graduation requirements status
-    Return the data in JSON format.
     Transcript Text:
     {text}
@@ -534,7 +597,7 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
             raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
         # Tokenize and generate response
-        inputs = tokenizer(prompt, return_tensors="pt").to(model_loader.device)
         if progress:
             progress(0.4)
@@ -542,7 +605,9 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
             **inputs,
             max_new_tokens=2000,
             temperature=0.1,
-            do_sample=True
         )
         if progress:
             progress(0.8)
@@ -555,9 +620,17 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
             json_str = response.split('```json')[1].split('```')[0].strip()
             parsed_data = json.loads(json_str)
         except (IndexError, json.JSONDecodeError):
-            # Fallback: Extract JSON-like substring
-            json_str = re.search(r'\{.*\}', response, re.DOTALL).group()
-            parsed_data = json.loads(json_str)
         if progress:
             progress(1.0)
@@ -1003,28 +1076,40 @@ class ProfileManager:
     def _format_transcript(self, transcript: Dict) -> str:
         """Format transcript data for display."""
-        if not transcript or "courses" not in transcript:
             return "_No transcript information available_"
         display = "#### Course History\n"
-        courses_by_grade = transcript["courses"]
-        if isinstance(courses_by_grade, dict):
-            for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
-                display += f"\n**Grade {grade}**\n"
-                for course in courses_by_grade[grade]:
-                    display += f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
                     if 'grade' in course and course['grade']:
                         display += f" (Grade: {course['grade']})"
                     if 'credits' in course:
                         display += f" | Credits: {course['credits']}"
-                    display += f" | Year: {course.get('year', 'N/A')}\n"
-        if 'gpa' in transcript:
-            gpa = transcript['gpa']
-            display += "\n**GPA**\n"
-            display += f"- Unweighted: {gpa.get('unweighted', 'N/A')}\n"
-            display += f"- Weighted: {gpa.get('weighted', 'N/A')}\n"
         return display
@@ -1051,10 +1136,10 @@ class TeachingAssistant:
             # Extract profile information
             name = profile.get("name", "there")
             learning_style = profile.get("learning_style", "")
-            grade_level = profile.get("transcript", {}).get("grade_level", "unknown")
-            gpa = profile.get("transcript", {}).get("gpa", {})
             interests = profile.get("interests", "")
-            courses = profile.get("transcript", {}).get("courses", {})
             favorites = profile.get("favorites", {})
             # Process message with context
@@ -1174,19 +1259,18 @@ class TeachingAssistant:
     def _generate_grade_advice(self, profile: Dict) -> str:
         """Generate response about grades and GPA."""
-        gpa = profile.get("transcript", {}).get("gpa", {})
-        courses = profile.get("transcript", {}).get("courses", {})
         response = (f"Your GPA information:\n"
-                   f"- Unweighted: {gpa.get('unweighted', 'N/A')}\n"
-                   f"- Weighted: {gpa.get('weighted', 'N/A')}\n\n")
         # Identify any failing grades
         weak_subjects = []
-        for grade_level, course_list in courses.items():
-            for course in course_list:
-                if course.get('grade', '').upper() in ['D', 'F']:
-                    weak_subjects.append(f"{course.get('code', '')} {course.get('name', 'Unknown course')}")
         if weak_subjects:
             response += ("**Areas for Improvement**:\n"
@@ -1215,14 +1299,19 @@ class TeachingAssistant:
     def _generate_course_advice(self, profile: Dict) -> str:
         """Generate response about courses."""
-        courses = profile.get("transcript", {}).get("courses", {})
-        grade_level = profile.get("transcript", {}).get("grade_level", "unknown")
-        response = "Here's a summary of your courses:\n"
-        for grade in sorted(courses.keys(), key=lambda x: int(x) if x.isdigit() else x):
-            response += f"\n**Grade {grade}**:\n"
-            for course in courses[grade]:
-                response += f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
                 if 'grade' in course:
                     response += f" (Grade: {course['grade']})"
                 response += "\n"

     def parse_transcript(self, text: str) -> Dict:
         """Parse Miami-Dade formatted transcripts with updated regex patterns."""
         try:
+            # First try structured parsing for Miami-Dade format
+            if "Graduation Progress Summary" in text and "Miami-Dade" in text:
+                return self._parse_miami_dade_format(text)
+            else:
+                # Fall back to AI parsing if not Miami-Dade format
+                return parse_transcript_with_ai_fallback(text)
+        except Exception as e:
+            logging.error(f"Error parsing transcript: {str(e)}")
+            raise ValueError(f"Couldn't parse transcript: {str(e)}")
+    def _parse_miami_dade_format(self, text: str) -> Dict:
+        """Specialized parser for Miami-Dade County Public Schools transcripts."""
+        # Extract student info
+        student_match = re.search(
+            r"(\d{7})\s*-\s*([A-Z\s,]+).*?Current Grade:\s*(\d+).*?YOG\s*(\d{4}).*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+).*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
+            text, re.DOTALL
+        )
+        if student_match:
+            self.student_data = {
+                "id": student_match.group(1).strip(),
+                "name": student_match.group(2).replace(",", ", ").strip(),
+                "current_grade": student_match.group(3),
+                "graduation_year": student_match.group(4),
+                "unweighted_gpa": float(student_match.group(5)),
+                "weighted_gpa": float(student_match.group(6)),
+                "total_credits": float(student_match.group(7)),
+                "community_service_hours": int(student_match.group(8))
+            }
+        # Extract requirements
+        self.requirements = {}
+        req_section = re.search(
+            r"Code\s+Description\s+Required\s+Waived\s+Completed\s+Status(.*?)Total\s+\d+\.\d+\s+\d+\.\d+\s+\d+\.\d+\s+\d+%",
+            text, re.DOTALL
+        )
+        if req_section:
+            req_lines = req_section.group(1).strip().split('\n')
+            for line in req_lines:
+                line = line.strip()
+                if not line:
+                    continue
+                req_match = re.match(r"([A-Z]-[^\s]+)\s+(.+?)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+)%", line)
+                if req_match:
+                    code = req_match.group(1).strip()
+                    self.requirements[code] = {
+                        "description": req_match.group(2).strip(),
+                        "required": float(req_match.group(3)),
+                        "waived": float(req_match.group(4)),
+                        "completed": float(req_match.group(5)),
+                        "status": f"{req_match.group(6)}%"
+                    }
+        # Extract course history
+        self.course_history = []
+        course_section = re.search(
+            r"Requirement\s+School Year\s+GradeLv1\s+CrsNum\s+Description\s+Term\s+DstNumber\s+FG\s+Incl\s+Credits(.*?)Legend for Incl",
+            text, re.DOTALL
+        )
+        if course_section:
+            course_lines = course_section.group(1).strip().split('\n')
+            for line in course_lines:
+                line = line.strip()
+                if not line or line.startswith('='):
+                    continue
+                # Handle both regular and in-progress courses
+                course_match = re.match(
+                    r"([A-Z]-[^\s]+)?\s*(\d{4}-\d{4}|\d{4})?\s*(\d{2})?\s*([A-Z0-9]+)?\s*(.+?)\s+([AT12]+)?\s*([A-Z0-9]+)?\s*([A-Z])?\s*([A-Z])?\s*(inProgress|\d+\.\d+)?",
+                    line
+                )
+                if course_match:
+                    self.course_history.append({
+                        "requirement_category": course_match.group(1) if course_match.group(1) else None,
+                        "school_year": course_match.group(2) if course_match.group(2) else None,
+                        "grade_level": course_match.group(3) if course_match.group(3) else None,
+                        "course_code": course_match.group(4) if course_match.group(4) else None,
+                        "description": course_match.group(5).strip() if course_match.group(5) else None,
+                        "term": course_match.group(6) if course_match.group(6) else None,
+                        "district_number": course_match.group(7) if course_match.group(7) else None,
+                        "grade": course_match.group(8) if course_match.group(8) else None,
+                        "inclusion_status": course_match.group(9) if course_match.group(9) else None,
+                        "credits": course_match.group(10) if course_match.group(10) else None
+                    })
+        # Extract in-progress courses
+        self._extract_current_courses()
+        self._calculate_completion()
+        return {
+            "student_info": self.student_data,
+            "requirements": self.requirements,
+            "current_courses": self.current_courses,
+            "course_history": self.course_history,
+            "graduation_status": self.graduation_status,
+            "format": "miami_dade"
+        }
     def _extract_current_courses(self):
         """Identify in-progress courses."""
                 "credits": c["credits"],
                 "grade_level": c["grade_level"]
             }
+            for c in self.course_history
+            if c.get("credits") and isinstance(c["credits"], str) and c["credits"].lower() == "inprogress"
         ]
     def _calculate_completion(self):
         output.append(f"**Total Credits Earned:** {student['total_credits']}")
     if 'community_service_hours' in student:
         output.append(f"**Community Service Hours:** {student['community_service_hours']}")
     output.append("")
     # Course History by Year
     courses_by_year = defaultdict(list)
     for course in data.get("course_history", []):
+        if course.get("school_year"):
+            courses_by_year[course["school_year"]].append(course)
     if courses_by_year:
         output.append("## Course History\n" + '='*50)
             for course in courses_by_year[year]:
                 output.append(
                     f"- **{course.get('course_code', '')} {course.get('description', 'Unnamed course')}**\n"
+                    f"  Subject: {course.get('requirement_category', 'N/A')} | "
                     f"Grade: {course.get('grade', 'N/A')} | "
                     f"Credits: {course.get('credits', 'N/A')}"
                 )
         return parse_transcript_with_ai_fallback(text, progress)
 def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
+    """Fallback AI parsing method with improved prompt engineering"""
     # Pre-process the text
     text = remove_sensitive_info(text[:15000])  # Limit input size
     prompt = f"""
+    Analyze this academic transcript and extract structured information in JSON format. Follow this exact structure:
+    {{
+        "student_info": {{
+            "name": "Full Name",
+            "id": "Student ID",
+            "current_grade": "Grade Level",
+            "graduation_year": "Year of Graduation",
+            "unweighted_gpa": 0.0,
+            "weighted_gpa": 0.0,
+            "total_credits": 0.0,
+            "community_service_hours": 0
+        }},
+        "requirements": {{
+            "A-English": {{
+                "description": "English requirement description",
+                "required": 4.0,
+                "completed": 4.0,
+                "status": "100%"
+            }}
+        }},
+        "current_courses": [
+            {{
+                "course": "Course Name",
+                "code": "Course Code",
+                "category": "Requirement Category",
+                "term": "Term",
+                "credits": "inProgress or credit value",
+                "grade_level": "Grade Level"
+            }}
+        ],
+        "course_history": [
+            {{
+                "requirement_category": "Category Code",
+                "school_year": "Year Taken",
+                "grade_level": "Grade Level",
+                "course_code": "Course Code",
+                "description": "Course Description",
+                "term": "Term",
+                "grade": "Grade Received",
+                "credits": "Credits Earned"
+            }}
+        ],
+        "graduation_status": {{
+            "total_required_credits": 24.0,
+            "total_completed_credits": 24.0,
+            "percent_complete": 100.0,
+            "remaining_credits": 0.0,
+            "on_track": true
+        }},
+        "format": "miami_dade or standard"
+    }}
     Transcript Text:
     {text}
             raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
         # Tokenize and generate response
+        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True).to(model_loader.device)
         if progress:
             progress(0.4)
             **inputs,
             max_new_tokens=2000,
             temperature=0.1,
+            do_sample=True,
+            top_p=0.9,
+            repetition_penalty=1.1
         )
         if progress:
             progress(0.8)
             json_str = response.split('```json')[1].split('```')[0].strip()
             parsed_data = json.loads(json_str)
         except (IndexError, json.JSONDecodeError):
+            # Fallback: Try to find JSON in the response
+            json_match = re.search(r'\{.*\}', response, re.DOTALL)
+            if json_match:
+                parsed_data = json.loads(json_match.group())
+            else:
+                raise ValueError("Could not extract JSON from AI response")
+        # Validate the parsed data structure
+        required_keys = ["student_info", "requirements", "course_history"]
+        if not all(key in parsed_data for key in required_keys):
+            raise ValueError("AI returned incomplete data structure")
         if progress:
             progress(1.0)
     def _format_transcript(self, transcript: Dict) -> str:
         """Format transcript data for display."""
+        if not transcript or "course_history" not in transcript:
             return "_No transcript information available_"
         display = "#### Course History\n"
+        courses_by_year = defaultdict(list)
+        for course in transcript.get("course_history", []):
+            if course.get("school_year"):
+                courses_by_year[course["school_year"]].append(course)
+        if courses_by_year:
+            for year in sorted(courses_by_year.keys()):
+                display += f"\n**{year}**\n"
+                for course in courses_by_year[year]:
+                    display += f"- {course.get('course_code', '')} {course.get('description', 'Unnamed course')}"
                     if 'grade' in course and course['grade']:
                         display += f" (Grade: {course['grade']})"
                     if 'credits' in course:
                         display += f" | Credits: {course['credits']}"
+                    display += f" | Category: {course.get('requirement_category', 'N/A')}\n"
+        if 'student_info' in transcript:
+            student = transcript['student_info']
+            display += "\n**Academic Summary**\n"
+            display += f"- Unweighted GPA: {student.get('unweighted_gpa', 'N/A')}\n"
+            display += f"- Weighted GPA: {student.get('weighted_gpa', 'N/A')}\n"
+            display += f"- Total Credits: {student.get('total_credits', 'N/A')}\n"
+        if 'graduation_status' in transcript:
+            status = transcript['graduation_status']
+            display += "\n**Graduation Progress**\n"
+            display += f"- Completion: {status.get('percent_complete', 0)}%\n"
+            display += f"- Credits Required: {status.get('total_required_credits', 0)}\n"
+            display += f"- Credits Completed: {status.get('total_completed_credits', 0)}\n"
+            display += f"- On Track: {'Yes' if status.get('on_track', False) else 'No'}\n"
         return display
             # Extract profile information
             name = profile.get("name", "there")
             learning_style = profile.get("learning_style", "")
+            grade_level = profile.get("transcript", {}).get("student_info", {}).get("current_grade", "unknown")
+            gpa = profile.get("transcript", {}).get("student_info", {})
             interests = profile.get("interests", "")
+            courses = profile.get("transcript", {}).get("course_history", [])
             favorites = profile.get("favorites", {})
             # Process message with context
     def _generate_grade_advice(self, profile: Dict) -> str:
         """Generate response about grades and GPA."""
+        gpa = profile.get("transcript", {}).get("student_info", {})
+        courses = profile.get("transcript", {}).get("course_history", [])
         response = (f"Your GPA information:\n"
+                   f"- Unweighted: {gpa.get('unweighted_gpa', 'N/A')}\n"
+                   f"- Weighted: {gpa.get('weighted_gpa', 'N/A')}\n\n")
         # Identify any failing grades
         weak_subjects = []
+        for course in courses:
+            if course.get('grade', '').upper() in ['D', 'F']:
+                weak_subjects.append(f"{course.get('course_code', '')} {course.get('description', 'Unknown course')}")
         if weak_subjects:
             response += ("**Areas for Improvement**:\n"
     def _generate_course_advice(self, profile: Dict) -> str:
         """Generate response about courses."""
+        courses = profile.get("transcript", {}).get("course_history", [])
+        grade_level = profile.get("transcript", {}).get("student_info", {}).get("current_grade", "unknown")
+        response = "Here's a summary of your courses by year:\n"
+        courses_by_year = defaultdict(list)
+        for course in courses:
+            if course.get("school_year"):
+                courses_by_year[course["school_year"]].append(course)
+        for year in sorted(courses_by_year.keys()):
+            response += f"\n**{year}**:\n"
+            for course in courses_by_year[year]:
+                response += f"- {course.get('course_code', '')} {course.get('description', 'Unnamed course')}"
                 if 'grade' in course:
                     response += f" (Grade: {course['grade']})"
                 response += "\n"