Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -36,9 +36,9 @@ SESSION_TIMEOUT = 3600  # 1 hour session timeout
 
 # Initialize logging
 logging.basicConfig(
-    filename='
-    level=logging.
-    format='%(asctime)s - %(levelname)s - %(message)s'
+    filename='transcript_parser.log',
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 
 # Model configuration - Only DeepSeek
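
The new configuration switches the root logger to DEBUG and adds the logger name to each record. For reference, a minimal standalone sketch of what transcript_parser.log then receives (the logger name "transcript" is invented for illustration):

    import logging

    logging.basicConfig(
        filename='transcript_parser.log',
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logging.getLogger("transcript").debug("parser initialized")
    # transcript_parser.log now contains lines like:
    # 2024-01-01 12:00:00,000 - transcript - DEBUG - parser initialized

Note that DEBUG on the root logger also captures verbose records from any library that propagates to it; INFO is often enough in a deployed Space.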
@@ -72,67 +72,61 @@ class ModelLoader:
 
     def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
         """Lazy load the model with progress feedback"""
-        if self.loaded:
-            return self.model, self.tokenizer
-
-        self.loading = True
-        self.error = None
-
         try:
             if progress:
-                progress(0.1, desc="
+                progress(0.1, desc="Checking GPU availability...")
 
-            # Clear
-            del self.model
-            del self.tokenizer
-            torch.cuda.empty_cache()
-            time.sleep(2)  # Allow CUDA cleanup
-
-            # Load with optimized settings
-            model_kwargs = {
-                "trust_remote_code": True,
-                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
-                "device_map": "auto" if self.device == "cuda" else None,
-                "low_cpu_mem_usage": True
-            }
+            # Clear CUDA cache first
+            torch.cuda.empty_cache()
 
             if progress:
-                progress(0.
+                progress(0.2, desc="Loading tokenizer...")
+
+            tokenizer = AutoTokenizer.from_pretrained(
                 MODEL_NAME,
                 trust_remote_code=True
             )
 
             if progress:
-                progress(0.
+                progress(0.5, desc="Loading model (this may take a few minutes)...")
+
+            # More robust model loading
+            model_kwargs = {
+                "trust_remote_code": True,
+                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
+                "device_map": "auto" if self.device == "cuda" else None,
+                "low_cpu_mem_usage": True,
+                "offload_folder": "offload"  # For handling large models
+            }
+
+            try:
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    **model_kwargs
+                )
+            except torch.cuda.OutOfMemoryError:
+                # Fallback to CPU if GPU OOM
+                model_kwargs["device_map"] = None
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    **model_kwargs
+                ).to('cpu')
+                self.device = 'cpu'
+
+            # Verify model is responsive
+            test_input = tokenizer("Test", return_tensors="pt").to(self.device)
+            _ = model.generate(**test_input, max_new_tokens=1)
 
-            self.model.eval()
-
-            progress(0.9, desc="Finalizing...")
+            self.model = model.eval()
+            self.tokenizer = tokenizer
             self.loaded = True
-            return self.model, self.tokenizer
 
-            logging.error(self.error)
-            return None, None
+            return model, tokenizer
+
         except Exception as e:
-            self.error = f"Model loading
+            self.error = f"Model loading failed: {str(e)}"
            logging.error(self.error)
            return None, None
-        finally:
-            self.loading = False
 
 # Initialize model loader
 model_loader = ModelLoader()
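
A minimal smoke test of the reworked loader, assuming the app.py definitions above (MODEL_NAME, model_loader, the transformers imports) are in scope; this mirrors how the rest of the app consumes the loader rather than prescribing a new API:

    model, tokenizer = model_loader.load_model()
    if model is None:
        print(f"Load failed: {model_loader.error}")
    else:
        inputs = tokenizer("Hello", return_tensors="pt").to(model_loader.device)
        output = model.generate(**inputs, max_new_tokens=8)
        print(tokenizer.decode(output[0], skip_special_tokens=True))

The one-token generate probe added at the end of load_model is a cheap way to surface device-placement errors at load time instead of on the first user request.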
@@ -285,6 +279,22 @@ def remove_sensitive_info(text: str) -> str:
     text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
     return text
 
+def validate_parsed_data(data: Dict) -> bool:
+    """Validate the structure of parsed transcript data"""
+    required_student_fields = ['name', 'current_grade']
+    required_course_fields = ['description', 'credits']
+
+    if 'student_info' not in data:
+        return False
+    if not all(field in data['student_info'] for field in required_student_fields):
+        return False
+    if 'course_history' not in data or not isinstance(data['course_history'], list):
+        return False
+    if len(data['course_history']) > 0:
+        if not all(field in data['course_history'][0] for field in required_course_fields):
+            return False
+    return True
+
 # ========== TRANSCRIPT PARSING ==========
 class TranscriptParser:
     def __init__(self):
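
The unchanged context line above still carries two typos in the e-mail pattern: `[A-Za-z9.-]` drops digits 0-8 from the domain, and `[A-Z|a-z]` matches a literal `|` in the TLD; `[A-Za-z0-9.-]` and `[A-Za-z]{2,}` would be the usual form. A quick check of the new validator (sample data invented):

    sample = {
        "student_info": {"name": "John Doe", "current_grade": "12"},
        "course_history": [{"description": "Algebra I", "credits": 1.0}],
    }
    assert validate_parsed_data(sample)                               # complete -> True
    assert not validate_parsed_data({"student_info": {"name": "X"}})  # missing fields -> False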
@@ -298,7 +308,7 @@ class TranscriptParser:
         """Parse Miami-Dade formatted transcripts with updated regex patterns."""
         try:
             # First try structured parsing for Miami-Dade format
-            if "Graduation Progress Summary" in text
+            if "Graduation Progress Summary" in text or "Miami-Dade" in text:
                 return self._parse_miami_dade_format(text)
             else:
                 # Fall back to AI parsing if not Miami-Dade format
@@ -309,17 +319,26 @@ class TranscriptParser:
             raise ValueError(f"Couldn't parse transcript: {str(e)}")
 
     def _parse_miami_dade_format(self, text: str) -> Dict:
-        """
-        #
+        """More flexible parser for Miami-Dade County Public Schools transcripts."""
+        # Normalize text first
+        text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
+
+        # More flexible student info extraction
         student_match = re.search(
-            r
+            r'(?:Student\s*ID)[:]?\s*(\d+).*?Name[:]?\s*([A-Za-z\s,]+).*?'
+            r'(?:Grade|Level)[:]?\s*(\d+).*?'
+            r'(?:Grad|YOG)[:]?\s*(\d{4}).*?'
+            r'(?:Unweighted\s*GPA)[:]?\s*([\d.]+).*?'
+            r'(?:Weighted\s*GPA)[:]?\s*([\d.]+).*?'
+            r'(?:Total\s*Credits)[:]?\s*([\d.]+).*?'
+            r'(?:Comm\s*Serv|Service\s*Hours)[:]?\s*(\d+)',
+            text, re.IGNORECASE | re.DOTALL
         )
 
         if student_match:
             self.student_data = {
                 "id": student_match.group(1).strip(),
-                "name": student_match.group(2).replace(",", ", ").strip(),
+                "name": student_match.group(2).replace(",", ", ").strip().title(),
                 "current_grade": student_match.group(3),
                 "graduation_year": student_match.group(4),
                 "unweighted_gpa": float(student_match.group(5)),
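
A quick way to sanity-check the new pattern outside the app (the header string is invented; note that group 2 keeps a trailing space, which the parser's .strip() removes):

    import re

    PATTERN = (
        r'(?:Student\s*ID)[:]?\s*(\d+).*?Name[:]?\s*([A-Za-z\s,]+).*?'
        r'(?:Grade|Level)[:]?\s*(\d+).*?(?:Grad|YOG)[:]?\s*(\d{4}).*?'
        r'(?:Unweighted\s*GPA)[:]?\s*([\d.]+).*?(?:Weighted\s*GPA)[:]?\s*([\d.]+).*?'
        r'(?:Total\s*Credits)[:]?\s*([\d.]+).*?(?:Comm\s*Serv|Service\s*Hours)[:]?\s*(\d+)'
    )
    header = ("Student ID: 1234567 Name: DOE, JOHN Grade: 12 YOG: 2024 "
              "Unweighted GPA: 3.5 Weighted GPA: 4.2 Total Credits: 24.5 Comm Serv: 100")
    m = re.search(PATTERN, header, re.IGNORECASE | re.DOTALL)
    print(m.groups())
    # ('1234567', 'DOE, JOHN ', '12', '2024', '3.5', '4.2', '24.5', '100')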
@@ -327,6 +346,23 @@ class TranscriptParser:
                 "total_credits": float(student_match.group(7)),
                 "community_service_hours": int(student_match.group(8))
             }
+        else:
+            # Fallback pattern if first one fails
+            student_match = re.search(
+                r'(\d{7})\s*(.*?)\s*(?:Grade|Grd)[:]?\s*(\d+)',
+                text, re.IGNORECASE
+            )
+            if student_match:
+                self.student_data = {
+                    "id": student_match.group(1).strip(),
+                    "name": student_match.group(2).strip().title(),
+                    "current_grade": student_match.group(3),
+                    "graduation_year": "",
+                    "unweighted_gpa": 0.0,
+                    "weighted_gpa": 0.0,
+                    "total_credits": 0.0,
+                    "community_service_hours": 0
+                }
 
         # Extract requirements
         self.requirements = {}
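
The fallback assumes a bare seven-digit student ID followed by the name; for example (input invented):

    import re

    m = re.search(r'(\d{7})\s*(.*?)\s*(?:Grade|Grd)[:]?\s*(\d+)',
                  "1234567 DOE, JOHN Grade: 11", re.IGNORECASE)
    print(m.groups())  # ('1234567', 'DOE, JOHN', '11')

Since most numeric fields are zeroed in this branch, downstream code should not treat a 0.0 GPA from a fallback parse as a real value.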
@@ -504,143 +540,112 @@ def format_transcript_output(data: Dict) -> str:
 
     return '\n'.join(output)
 
-def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
-    """Enhanced AI parsing with fallback to structured parsing"""
-    try:
-        # First try structured parsing
-        if progress:
-            progress(0.1, desc="Attempting structured parsing...")
-
-        parser = TranscriptParser()
-        parsed_data = parser.parse_transcript(text)
-
-        if progress:
-            progress(0.8, desc="Formatting results...")
-
-        return parsed_data
-
-    except Exception as e:
-        logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
-
-        # Fall back to AI parsing if structured parsing fails
-        return parse_transcript_with_ai_fallback(text, progress)
-
 def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
-    """
-    # Pre-process the text
-    text = remove_sensitive_info(text[:15000])  # Limit input size
-
-    prompt = f"""
-    Analyze this academic transcript and extract structured information in JSON format. Follow this exact structure:
-
-    {{
-        "student_info": {{
-            "name": "Full Name",
-            "id": "Student ID",
-            "current_grade": "Grade Level",
-            "graduation_year": "Year of Graduation",
-            "unweighted_gpa": 0.0,
-            "weighted_gpa": 0.0,
-            "total_credits": 0.0,
-            "community_service_hours": 0
-        }},
-        "requirements": {{
-            "A-English": {{
-                "description": "English requirement description",
-                "required": 4.0,
-                "completed": 4.0,
-                "status": "100%"
-            }}
-        }},
-        "current_courses": [
-            {{
-                "course": "Course Name",
-                "code": "Course Code",
-                "category": "Requirement Category",
-                "term": "Term",
-                "credits": "inProgress or credit value",
-                "grade_level": "Grade Level"
-            }}
-        ],
-        "course_history": [
-            {{
-                "requirement_category": "Category Code",
-                "school_year": "Year Taken",
-                "grade_level": "Grade Level",
-                "course_code": "Course Code",
-                "description": "Course Description",
-                "term": "Term",
-                "grade": "Grade Received",
-                "credits": "Credits Earned"
-            }}
-        ],
-        "graduation_status": {{
-            "total_required_credits": 24.0,
-            "total_completed_credits": 24.0,
-            "percent_complete": 100.0,
-            "remaining_credits": 0.0,
-            "on_track": true
-        }},
-        "format": "miami_dade or standard"
-    }}
-
-    Transcript Text:
-    {text}
-    """
-
+    """More robust AI parsing with better error handling"""
     try:
+        text = remove_sensitive_info(text[:20000])  # Increased limit
+
+        # Improved prompt with examples
+        prompt = f"""Extract academic transcript data as JSON. Follow this structure:
+
+Example Input:
+Student ID: 1234567 Name: DOE, JOHN Current Grade: 12 YOG: 2024
+Unweighted GPA: 3.5 Weighted GPA: 4.2 Total Credits: 24.5
+
+Example Output:
+{{
+    "student_info": {{
+        "name": "John Doe",
+        "id": "1234567",
+        "current_grade": "12",
+        "graduation_year": "2024",
+        "unweighted_gpa": 3.5,
+        "weighted_gpa": 4.2,
+        "total_credits": 24.5
+    }},
+    "course_history": [
+        {{
+            "course_code": "MATH101",
+            "description": "Algebra I",
+            "grade": "A",
+            "credits": 1.0,
+            "school_year": "2022-2023"
+        }}
+    ]
+}}
+
+Actual Transcript:
+{text}
+"""
+
         if progress:
-            progress(0.
+            progress(0.3, desc="Processing with AI...")
 
         model, tokenizer = get_model_and_tokenizer()
-        if model is None
-            raise
-
-        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True).to(model_loader.device)
-        if progress:
-            progress(0.4)
+        if model is None:
+            raise ValueError("Model not loaded")
+
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model_loader.device)
 
         outputs = model.generate(
             **inputs,
-            max_new_tokens=
-            temperature=0.
+            max_new_tokens=2500,
+            temperature=0.3,  # Lower for more consistent results
             do_sample=True,
             top_p=0.9,
-            repetition_penalty=1.
+            repetition_penalty=1.2
         )
-        if progress:
-            progress(0.8)
 
-        # Decode the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        #
+        # More robust JSON extraction
        try:
-        except (IndexError, json.JSONDecodeError):
-            # Fallback: Try to find JSON in the response
-            json_match = re.search(r'\{.*\}', response, re.DOTALL)
-            if json_match:
-                parsed_data = json.loads(json_match.group())
+            if '```json' in response:
+                json_str = response.split('```json')[1].split('```')[0].strip()
             else:
+                json_str = response.split('{', 1)[1].rsplit('}', 1)[0]
+                json_str = '{' + json_str + '}'
+
+            parsed_data = json.loads(json_str)
+
+            # Validate required fields
+            if not all(k in parsed_data for k in ["student_info", "course_history"]):
+                raise ValueError("Missing required fields in AI response")
+
+            return parsed_data
+
+        except Exception as e:
+            logging.error(f"JSON parsing failed: {str(e)}")
+            raise ValueError(f"AI returned invalid format. Please try again.")
 
-        if not all(key in parsed_data for key in required_keys):
-            raise ValueError("AI returned incomplete data structure")
-
-        if progress:
-            progress(
-        return parsed_data
-
-    except torch.cuda.OutOfMemoryError:
-        raise gr.Error("The model ran out of memory. Try with a smaller transcript.")
     except Exception as e:
-        logging.
+        logging.error(f"AI parsing error: {str(e)}")
+        raise gr.Error(f"Failed to parse transcript: {str(e)}")
+
+def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
+    """Enhanced AI parsing with fallback to structured parsing"""
+    try:
+        # First try structured parsing
+        if progress:
+            progress(0.1, desc="Attempting structured parsing...")
+
+        parser = TranscriptParser()
+        parsed_data = parser.parse_transcript(text)
+
+        # Validate the parsed data
+        if not validate_parsed_data(parsed_data):
+            raise ValueError("Structured parsing returned incomplete data")
+
+        if progress:
+            progress(0.8, desc="Formatting results...")
+
+        return parsed_data
+
+    except Exception as e:
+        logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
+
+        # Fall back to AI parsing if structured parsing fails
+        return parse_transcript_with_ai_fallback(text, progress)
 
 async def parse_transcript_async(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
     """Async wrapper for transcript parsing"""
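
The JSON-extraction branch is easiest to verify in isolation. Below is a hypothetical standalone helper, extract_json, that mirrors the committed logic so it can be unit-tested; the commit itself inlines this code:

    import json

    def extract_json(response: str) -> dict:
        """Pull the first JSON object out of a model response (mirrors the diff's logic)."""
        if '```json' in response:
            json_str = response.split('```json')[1].split('```')[0].strip()
        else:
            # Take everything between the first '{' and the last '}'
            json_str = response.split('{', 1)[1].rsplit('}', 1)[0]
            json_str = '{' + json_str + '}'
        return json.loads(json_str)

    print(extract_json('Sure! {"student_info": {"name": "John Doe"}, "course_history": []}'))
    # {'student_info': {'name': 'John Doe'}, 'course_history': []}

One caveat: when the response contains no '{' at all, the else branch raises IndexError rather than ValueError; the enclosing except Exception still catches it, but the log will report "JSON parsing failed" for what is really a missing object.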
@@ -665,27 +670,37 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
         if not text.strip():
             raise ValueError("No text could be extracted from the file. The file may be corrupted or in an unsupported format.")
 
-        #
+        # Try structured parsing first
         if progress:
-            progress(0.4, desc="
-
+            progress(0.4, desc="Attempting structured parsing...")
+
+        parser = TranscriptParser()
+        try:
+            parsed_data = parser.parse_transcript(text)
+            if validate_parsed_data(parsed_data):
+                if progress:
+                    progress(0.9, desc="Formatting results...")
+                return format_transcript_output(parsed_data), parsed_data
+        except Exception as e:
+            logging.warning(f"Structured parsing failed: {str(e)}")
 
-        #
+        # Fall back to AI if structured fails
         if progress:
-            progress(0.
-
+            progress(0.5, desc="Using AI analysis...")
+
+        parsed_data = parse_transcript_with_ai_fallback(text, progress)
+        return format_transcript_output(parsed_data), parsed_data
 
-        return output_text, parsed_data
-
     except Exception as e:
-        error_msg = f"
+        error_msg = f"Error processing transcript: {str(e)}"
+        # Add specific troubleshooting tips
         if "PDF" in str(e):
-            error_msg += "\n\nTIPS
+            error_msg += "\n\nTIPS:\n1. Try converting to image (screenshot)\n2. Ensure text is selectable in PDF\n3. Try a different PDF reader"
         elif "image" in str(e).lower():
-            error_msg += "\n\nTIPS
+            error_msg += "\n\nTIPS:\n1. Use high contrast images\n2. Crop to just the transcript\n3. Ensure good lighting"
+        elif "AI" in str(e):
+            error_msg += "\n\nTIPS:\n1. Try a smaller section of the transcript\n2. Check for sensitive info that may be redacted\n3. Try again later"
+
         logging.error(error_msg)
         return error_msg, None
 
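
The trailing context references parse_transcript_async, but its body is not shown in this diff. One plausible shape for such a wrapper, if it simply offloads the blocking parser to a worker thread (a sketch, not the committed code):

    import asyncio

    async def parse_transcript_async(file_obj, progress=gr.Progress()):
        """Run the blocking parser off the event loop so the Gradio UI stays responsive."""
        return await asyncio.to_thread(parse_transcript, file_obj, progress)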
@@ -1484,7 +1499,7 @@ def create_interface():
         except Exception as e:
             error_msg = f"Error processing transcript: {str(e)}"
             if "PDF" in str(e):
-                error_msg += "\n\nTIPS:\n- Try
+                error_msg += "\n\nTIPS:\n- Try converting to image (screenshot)\n- Ensure text is selectable in PDF\n- Try a different PDF reader"
             return (
                 error_msg,
                 None,