Spaces:

Dannyar608
/

Final_project

Runtime error

App Files Files Community

Dannyar608 committed on May 10

Commit

0d7fd90

verified ·

1 Parent(s): 8dff1ad

Update app.py

Browse files

Files changed (1) hide show

app.py +251 -160

app.py CHANGED Viewed

@@ -272,115 +272,184 @@ class TranscriptParser:
         self.requirements = {}
         self.current_courses = []
         self.course_history = []
     def parse_transcript(self, text: str) -> Dict:
-        """Main method to parse transcript text"""
-        self._extract_student_info(text)
-        self._extract_requirements(text)
-        self._extract_course_history(text)
-        self._extract_current_courses(text)
-        return {
-            "student_info": self.student_data,
-            "requirements": self.requirements,
-            "current_courses": self.current_courses,
-            "course_history": self.course_history,
-            "completion_status": self._calculate_completion()
-        }
     def _extract_student_info(self, text: str):
-        """Enhanced student info extraction with more robust regex"""
-        # Unified pattern that handles variations in transcript formats
-        header_pattern = (
-            r"(?:Student\s*[:]?\s*|Name\s*[:]?\s*)?"
-            r"(\d{7})\s*[-]?\s*([\w\s,]+?)\s*"
-            r"(?:\||Cohort\s*\w+\s*\||Un-weighted\s*GPA\s*([\d.]+)\s*\||Comm\s*Serv\s*Hours\s*(\d+))?"
-        )
-        header_match = re.search(header_pattern, text, re.IGNORECASE)
-        if header_match:
             self.student_data = {
-                "id": header_match.group(1) if header_match.group(1) else "Unknown",
-                "name": header_match.group(2).strip() if header_match.group(2) else "Unknown",
-                "unweighted_gpa": float(header_match.group(3)) if header_match.group(3) else 0.0,
-                "community_service_hours": int(header_match.group(4)) if header_match.group(4) else 0
             }
-        # More flexible grade info pattern
-        grade_pattern = (
-            r"(?:Grade|Level)\s*[:]?\s*(\d+)\s*"
-            r"(?:\||YOG\s*[:]?\s*(\d{4})\s*\||Weighted\s*GPA\s*([\d.]+)\s*\||Total\s*Credits\s*Earned\s*([\d.]+))?"
-        )
-        grade_match = re.search(grade_pattern, text, re.IGNORECASE)
-        if grade_match:
             self.student_data.update({
-                "current_grade": grade_match.group(1) if grade_match.group(1) else "Unknown",
-                "graduation_year": grade_match.group(2) if grade_match.group(2) else "Unknown",
-                "weighted_gpa": float(grade_match.group(3)) if grade_match.group(3) else 0.0,
-                "total_credits": float(grade_match.group(4)) if grade_match.group(4) else 0.0
             })
     def _extract_requirements(self, text: str):
-        """Parse the graduation requirements section"""
-        req_table = re.findall(
-            r"\|([A-Z]-[\w\s]+)\s*\|([^\|]+)\|([\d.]+)\s*\|([\d.]+)\s*\|([\d.]+)\s*\|([^\|]+)\|",
-            text
         )
-        for row in req_table:
-            req_name = row[0].strip()
-            self.requirements[req_name] = {
-                "required": float(row[2]),
-                "completed": float(row[4]),
-                "status": f"{row[5].strip()}%"
             }
     def _extract_course_history(self, text: str):
-        """Parse the detailed course history"""
-        course_lines = re.findall(
-            r"\|([A-Z]-[\w\s&\(\)]+)\s*\|(\d{4}-\d{4})\s*\|(\d{2})\s*\|([A-Z0-9]+)\s*\|([^\|]+)\|([^\|]+)\|([^\|]+)\|([A-Z])\s*\|([YRXW]?)\s*\|([^\|]+)\|",
-            text
         )
-        for course in course_lines:
             self.course_history.append({
-                "requirement_category": course[0].strip(),
-                "school_year": course[1],
-                "grade_level": course[2],
-                "course_code": course[3],
-                "description": course[4].strip(),
-                "term": course[5].strip(),
-                "district_number": course[6].strip(),
-                "grade": course[7],
-                "inclusion_status": course[8],
-                "credits": course[9].strip()
             })
     def _extract_current_courses(self, text: str):
         """Identify courses currently in progress"""
-        in_progress = [c for c in self.course_history if "inProgress" in c["credits"]]
         self.current_courses = [
             {
                 "course": c["description"],
                 "category": c["requirement_category"],
                 "term": c["term"],
-                "credits": c["credits"]
             }
-            for c in in_progress
         ]
-    def _calculate_completion(self) -> Dict:
-        """Calculate overall completion status"""
         total_required = sum(req["required"] for req in self.requirements.values())
         total_completed = sum(req["completed"] for req in self.requirements.values())
-        return {
-            "total_required": total_required,
-            "total_completed": total_completed,
             "percent_complete": round((total_completed / total_required) * 100, 1),
-            "remaining_credits": total_required - total_completed
-        }
     def to_json(self) -> str:
         """Export parsed data as JSON"""
@@ -389,51 +458,92 @@ class TranscriptParser:
             "requirements": self.requirements,
             "current_courses": self.current_courses,
             "course_history": self.course_history,
-            "completion_status": self._calculate_completion()
         }, indent=2)
-def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
-    """Use AI model to parse transcript text with progress feedback"""
-    model, tokenizer = model_loader.load_model(progress)
-    if model is None or tokenizer is None:
-        raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
-    # First try the structured parser
     try:
         if progress:
-            progress(0.1, desc="Parsing transcript structure...")
         parser = TranscriptParser()
         parsed_data = parser.parse_transcript(text)
         if progress:
-            progress(0.9, desc="Formatting results...")
-        # Convert to expected format
-        formatted_data = {
-            "grade_level": parsed_data["student_info"].get("current_grade", "Unknown"),
-            "gpa": {
-                "weighted": parsed_data["student_info"].get("weighted_gpa", "N/A"),
-                "unweighted": parsed_data["student_info"].get("unweighted_gpa", "N/A")
-            },
-            "courses": []
-        }
-        # Add courses
-        for course in parsed_data["course_history"]:
-            formatted_data["courses"].append({
-                "code": course["course_code"],
-                "name": course["description"],
-                "grade": course["grade"],
-                "credits": course["credits"],
-                "year": course["school_year"],
-                "grade_level": course["grade_level"]
-            })
-        if progress:
-            progress(1.0)
-        return formatted_data
     except Exception as e:
         logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
         # Fall back to AI parsing if structured parsing fails
         return parse_transcript_with_ai_fallback(text, progress)
@@ -447,6 +557,8 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
     - Current grade level
     - Weighted GPA (if available)
     - Unweighted GPA (if available)
     - List of all courses with:
       * Course code
       * Course name
@@ -454,6 +566,7 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
       * Credits earned
       * Year/semester taken
       * Grade level when taken
     Return the data in JSON format.
     Transcript Text:
@@ -464,6 +577,10 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
         if progress:
             progress(0.1, desc="Processing transcript with AI...")
         # Tokenize and generate response
         inputs = tokenizer(prompt, return_tensors="pt").to(model_loader.device)
         if progress:
@@ -471,7 +588,7 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
         outputs = model.generate(
             **inputs,
-            max_new_tokens=1500,
             temperature=0.1,
             do_sample=True
         )
@@ -500,43 +617,8 @@ def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict
         logging.error(f"AI parsing error: {str(e)}")
         raise gr.Error(f"Error processing transcript: {str(e)}")
-def format_transcript_output(data: Dict) -> str:
-    """Format the parsed data into human-readable text."""
-    output = []
-    output.append(f"Student Transcript Summary\n{'='*40}")
-    output.append(f"Current Grade Level: {data.get('grade_level', 'Unknown')}")
-    if 'gpa' in data:
-        output.append(f"\nGPA:")
-        output.append(f"- Weighted: {data['gpa'].get('weighted', 'N/A')}")
-        output.append(f"- Unweighted: {data['gpa'].get('unweighted', 'N/A')}")
-    if 'courses' in data:
-        output.append("\nCourse History:\n" + '='*40)
-        # Group courses by grade level
-        courses_by_grade = defaultdict(list)
-        for course in data['courses']:
-            grade_level = course.get('grade_level', 'Unknown')
-            courses_by_grade[grade_level].append(course)
-        # Sort grades numerically
-        for grade in sorted(courses_by_grade.keys(), key=lambda x: int(x) if x.isdigit() else x):
-            output.append(f"\nGrade {grade}:\n{'-'*30}")
-            for course in courses_by_grade[grade]:
-                course_str = f"- {course.get('code', '')} {course.get('name', 'Unnamed course')}"
-                if 'grade' in course:
-                    course_str += f" (Grade: {course['grade']})"
-                if 'credits' in course:
-                    course_str += f" | Credits: {course['credits']}"
-                if 'year' in course:
-                    course_str += f" | Year: {course['year']}"
-                output.append(course_str)
-    return '\n'.join(output)
 def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
-    """Main function to parse transcript files."""
     try:
         if not file_obj:
             raise ValueError("Please upload a file first")
@@ -544,32 +626,40 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
         validate_file(file_obj)
         file_ext = os.path.splitext(file_obj.name)[1].lower()
-        # Extract text from file
         text = extract_text_from_file(file_obj.name, file_ext)
-        # Use AI for parsing
         parsed_data = parse_transcript_with_ai(text, progress)
         # Format output text
-        output_text = format_transcript_output(parsed_data)
-        # Prepare the data structure for saving
-        transcript_data = {
-            "grade_level": parsed_data.get('grade_level', 'Unknown'),
-            "gpa": parsed_data.get('gpa', {}),
-            "courses": defaultdict(list)
-        }
-        # Organize courses by grade level
-        for course in parsed_data.get('courses', []):
-            grade_level = course.get('grade_level', 'Unknown')
-            transcript_data["courses"][grade_level].append(course)
-        return output_text, transcript_data
     except Exception as e:
-        logging.error(f"Transcript processing error: {str(e)}")
-        return f"Error processing transcript: {str(e)}", None
 # ========== LEARNING STYLE QUIZ ==========
 class LearningStyleQuiz:
@@ -1449,7 +1539,7 @@ def create_interface():
                             "Your profile summary will appear here after saving.",
                             label="Profile Summary"
                         )
-                        blog = gr.Textbox(label="Personal Blog", visible=False)  # Added blog component
                 def save_profile_and_update(name, age, interests, transcript_data, learning_style,
                                          movie, movie_reason, show, show_reason,
@@ -1606,3 +1696,4 @@ app = create_interface()
 if __name__ == "__main__":
     app.launch()

         self.requirements = {}
         self.current_courses = []
         self.course_history = []
+        self.graduation_status = {}
     def parse_transcript(self, text: str) -> Dict:
+        """Enhanced parsing method for Miami-Dade format"""
+        try:
+            # First normalize the text (replace multiple spaces, normalize line breaks)
+            text = re.sub(r'\s+', ' ', text)
+            # Extract student info with more flexible patterns
+            self._extract_student_info(text)
+            # Extract requirements with better table parsing
+            self._extract_requirements(text)
+            # Extract course history with improved pattern matching
+            self._extract_course_history(text)
+            # Identify current courses
+            self._extract_current_courses(text)
+            # Calculate completion status
+            self._calculate_completion()
+            return {
+                "student_info": self.student_data,
+                "requirements": self.requirements,
+                "current_courses": self.current_courses,
+                "course_history": self.course_history,
+                "graduation_status": self.graduation_status
+            }
+        except Exception as e:
+            logging.error(f"Error parsing transcript: {str(e)}")
+            raise gr.Error(f"Error parsing transcript: {str(e)}")
     def _extract_student_info(self, text: str):
+        """Enhanced student info extraction for Miami-Dade format"""
+        # Extract basic student info
+        student_pattern = r"(\d{7})\s*-\s*([A-Z]+,\s*[A-Z]+)\s*Current Grade:\s*(\d+)\s*YOG\s*(\d{4})"
+        student_match = re.search(student_pattern, text, re.IGNORECASE)
+        if student_match:
             self.student_data = {
+                "id": student_match.group(1),
+                "name": student_match.group(2).replace(",", ", "),
+                "current_grade": student_match.group(3),
+                "graduation_year": student_match.group(4)
             }
+        # Extract GPA info
+        gpa_pattern = r"Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+)"
+        gpa_match = re.search(gpa_pattern, text, re.IGNORECASE)
+        if gpa_match:
             self.student_data.update({
+                "unweighted_gpa": float(gpa_match.group(1)),
+                "weighted_gpa": float(gpa_match.group(2))
+            })
+        # Extract credits and service hours
+        credits_pattern = r"Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)"
+        credits_match = re.search(credits_pattern, text, re.IGNORECASE)
+        if credits_match:
+            self.student_data.update({
+                "total_credits": float(credits_match.group(1)),
+                "community_service_hours": int(credits_match.group(2))
             })
     def _extract_requirements(self, text: str):
+        """Parse the graduation requirements section with improved table parsing"""
+        # Find the requirements table
+        req_table_start = re.search(r"Code\s*Description\s*Required\s*Waived\s*Completed\s*Status", text)
+        if not req_table_start:
+            raise ValueError("Could not find requirements table header")
+        req_text = text[req_table_start.start():]
+        # Extract individual requirements
+        req_pattern = (
+            r"([A-Z]-[\w\s\(\)&]+)\s*"  # Code
+            r"([^\|]+)\s*"  # Description
+            r"([\d.]+)\s*"  # Required
+            r"([\d.]+)\s*"  # Waived
+            r"([\d.]+)\s*"  # Completed
+            r"([\d.]+)\s*%"  # Status
         )
+        req_matches = re.finditer(req_pattern, req_text)
+        for match in req_matches:
+            req_code = match.group(1).strip()
+            self.requirements[req_code] = {
+                "description": match.group(2).strip(),
+                "required": float(match.group(3)),
+                "waived": float(match.group(4)),
+                "completed": float(match.group(5)),
+                "status": f"{match.group(6)}%"
+            }
+        # Extract total requirements
+        total_pattern = r"Total\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)\s*([\d.]+)%"
+        total_match = re.search(total_pattern, req_text)
+        if total_match:
+            self.graduation_status["total_requirements"] = {
+                "required": float(total_match.group(1)),
+                "waived": float(total_match.group(2)),
+                "completed": float(total_match.group(3)),
+                "percent_complete": float(total_match.group(4))
             }
     def _extract_course_history(self, text: str):
+        """Parse the detailed course history with improved pattern matching"""
+        # Find the course history table
+        course_header = re.search(r"Requirement\s*School Year\s*GradeLv1\s*CrsNu m\s*Description\s*Term\s*DstNumber\s*FG\s*Incl\s*Credits", text)
+        if not course_header:
+            raise ValueError("Could not find course history table header")
+        course_text = text[course_header.start():]
+        # Extract individual courses
+        course_pattern = (
+            r"([A-Z]-[\w\s\(\)&-]+)\s*"  # Requirement
+            r"(\d{4}-\d{4})\s*"  # School Year
+            r"(\d{2})\s*"  # Grade Level
+            r"([A-Z0-9]+)\s*"  # Course Number
+            r"([^\|]+)\s*"  # Description
+            r"([A-Z0-9]+)\s*"  # Term
+            r"([A-Z0-9]+)\s*"  # District Number
+            r"([A-Z])\s*"  # Final Grade
+            r"([A-Z])\s*"  # Inclusion Status
+            r"([\d.]+|inProgress)"  # Credits
         )
+        course_matches = re.finditer(course_pattern, course_text)
+        for match in course_matches:
             self.course_history.append({
+                "requirement_category": match.group(1).strip(),
+                "school_year": match.group(2),
+                "grade_level": match.group(3),
+                "course_code": match.group(4),
+                "description": match.group(5).strip(),
+                "term": match.group(6),
+                "district_number": match.group(7),
+                "grade": match.group(8),
+                "inclusion_status": match.group(9),
+                "credits": match.group(10)
             })
     def _extract_current_courses(self, text: str):
         """Identify courses currently in progress"""
         self.current_courses = [
             {
                 "course": c["description"],
+                "code": c["course_code"],
                 "category": c["requirement_category"],
                 "term": c["term"],
+                "credits": c["credits"],
+                "grade_level": c["grade_level"]
             }
+            for c in self.course_history
+            if c["credits"].lower() == "inprogress"
         ]
+    def _calculate_completion(self):
+        """Calculate overall completion status with more detailed info"""
         total_required = sum(req["required"] for req in self.requirements.values())
         total_completed = sum(req["completed"] for req in self.requirements.values())
+        self.graduation_status.update({
+            "total_required_credits": total_required,
+            "total_completed_credits": total_completed,
             "percent_complete": round((total_completed / total_required) * 100, 1),
+            "remaining_credits": total_required - total_completed,
+            "on_track": (total_completed / total_required) >= 0.75  # 75% completion considered on track
+        })
     def to_json(self) -> str:
         """Export parsed data as JSON"""
             "requirements": self.requirements,
             "current_courses": self.current_courses,
             "course_history": self.course_history,
+            "graduation_status": self.graduation_status
         }, indent=2)
+def format_transcript_output(data: Dict) -> str:
+    """Enhanced formatting for Miami-Dade transcript output"""
+    output = []
+    # Student Info Section
+    student = data.get("student_info", {})
+    output.append(f"## Student Transcript Summary\n{'='*50}")
+    output.append(f"**Name:** {student.get('name', 'Unknown')}")
+    output.append(f"**Student ID:** {student.get('id', 'Unknown')}")
+    output.append(f"**Current Grade:** {student.get('current_grade', 'Unknown')}")
+    output.append(f"**Graduation Year:** {student.get('graduation_year', 'Unknown')}")
+    output.append(f"**Unweighted GPA:** {student.get('unweighted_gpa', 'N/A')}")
+    output.append(f"**Weighted GPA:** {student.get('weighted_gpa', 'N/A')}")
+    output.append(f"**Total Credits Earned:** {student.get('total_credits', 'N/A')}")
+    output.append(f"**Community Service Hours:** {student.get('community_service_hours', 'N/A')}\n")
+    # Graduation Requirements Section
+    grad_status = data.get("graduation_status", {})
+    output.append(f"## Graduation Progress\n{'='*50}")
+    output.append(f"**Overall Completion:** {grad_status.get('percent_complete', 0)}%")
+    output.append(f"**Credits Required:** {grad_status.get('total_required_credits', 0)}")
+    output.append(f"**Credits Completed:** {grad_status.get('total_completed_credits', 0)}")
+    output.append(f"**Credits Remaining:** {grad_status.get('remaining_credits', 0)}")
+    output.append(f"**On Track to Graduate:** {'Yes' if grad_status.get('on_track', False) else 'No'}\n")
+    # Detailed Requirements
+    output.append("### Detailed Requirements:")
+    for code, req in data.get("requirements", {}).items():
+        output.append(
+            f"- **{code}**: {req.get('description', '')}\n"
+            f"  Required: {req['required']} | Completed: {req['completed']} | "
+            f"Status: {req['status']}"
+        )
+    output.append("")
+    # Current Courses
+    if data.get("current_courses"):
+        output.append("## Current Courses (In Progress)\n" + '='*50)
+        for course in data["current_courses"]:
+            output.append(
+                f"- **{course['code']} {course['course']}**\n"
+                f"  Category: {course['category']} | "
+                f"Grade Level: {course['grade_level']} | "
+                f"Term: {course['term']} | Credits: {course['credits']}"
+            )
+        output.append("")
+    # Course History by Year
+    courses_by_year = defaultdict(list)
+    for course in data.get("course_history", []):
+        courses_by_year[course["school_year"]].append(course)
+    if courses_by_year:
+        output.append("## Course History\n" + '='*50)
+        for year in sorted(courses_by_year.keys()):
+            output.append(f"\n### {year}")
+            for course in courses_by_year[year]:
+                output.append(
+                    f"- **{course['course_code']} {course['description']}**\n"
+                    f"  Grade: {course['grade']} | Credits: {course['credits']} | "
+                    f"Category: {course['requirement_category']} | Term: {course['term']}"
+                )
+    return '\n'.join(output)
+def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
+    """Enhanced AI parsing with fallback to structured parsing"""
     try:
+        # First try structured parsing
         if progress:
+            progress(0.1, desc="Attempting structured parsing...")
         parser = TranscriptParser()
         parsed_data = parser.parse_transcript(text)
         if progress:
+            progress(0.8, desc="Formatting results...")
+        return parsed_data
     except Exception as e:
         logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
         # Fall back to AI parsing if structured parsing fails
         return parse_transcript_with_ai_fallback(text, progress)
     - Current grade level
     - Weighted GPA (if available)
     - Unweighted GPA (if available)
+    - Total credits earned
+    - Community service hours (if available)
     - List of all courses with:
       * Course code
       * Course name
       * Credits earned
       * Year/semester taken
       * Grade level when taken
+    - Graduation requirements status
     Return the data in JSON format.
     Transcript Text:
         if progress:
             progress(0.1, desc="Processing transcript with AI...")
+        model, tokenizer = model_loader.load_model(progress)
+        if model is None or tokenizer is None:
+            raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
         # Tokenize and generate response
         inputs = tokenizer(prompt, return_tensors="pt").to(model_loader.device)
         if progress:
         outputs = model.generate(
             **inputs,
+            max_new_tokens=2000,
             temperature=0.1,
             do_sample=True
         )
         logging.error(f"AI parsing error: {str(e)}")
         raise gr.Error(f"Error processing transcript: {str(e)}")
 def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
+    """Main function to parse transcript files with better error handling"""
     try:
         if not file_obj:
             raise ValueError("Please upload a file first")
         validate_file(file_obj)
         file_ext = os.path.splitext(file_obj.name)[1].lower()
+        # Extract text from file with better error reporting
+        if progress:
+            progress(0.2, desc="Extracting text from file...")
         text = extract_text_from_file(file_obj.name, file_ext)
+        if not text.strip():
+            raise ValueError("No text could be extracted from the file")
+        # Use AI for parsing with progress updates
+        if progress:
+            progress(0.4, desc="Analyzing transcript content...")
         parsed_data = parse_transcript_with_ai(text, progress)
         # Format output text
+        if progress:
+            progress(0.9, desc="Generating report...")
+        output_text = format_transcript_output(parsed_data)
+        return output_text, parsed_data
     except Exception as e:
+        error_msg = f"Error processing transcript: {str(e)}"
+        logging.error(error_msg)
+        # Provide helpful tips based on error type
+        if "No text could be extracted" in str(e):
+            error_msg += "\n\nTips: Please ensure your file is clear and readable. Try scanning at a higher resolution if it's an image."
+        elif "requirements table header" in str(e):
+            error_msg += "\n\nTips: This appears to be an unsupported transcript format. Please contact support."
+        return error_msg, None
 # ========== LEARNING STYLE QUIZ ==========
 class LearningStyleQuiz:
                             "Your profile summary will appear here after saving.",
                             label="Profile Summary"
                         )
+                        blog = gr.Textbox(label="Personal Blog", visible=False)
                 def save_profile_and_update(name, age, interests, transcript_data, learning_style,
                                          movie, movie_reason, show, show_reason,
 if __name__ == "__main__":
     app.launch()