Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -36,9 +36,9 @@ SESSION_TIMEOUT = 3600  # 1 hour session timeout
 
 # Initialize logging
 logging.basicConfig(
-    filename='
-    level=logging.
-    format='%(asctime)s - %(levelname)s - %(message)s'
+    filename='transcript_parser.log',
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 
 # Model configuration - Only DeepSeek
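
The new configuration switches the root logger to DEBUG and adds the logger name to each record. For reference, a minimal standalone sketch of what transcript_parser.log then receives (the logger name "transcript" is invented for illustration):

    import logging

    logging.basicConfig(
        filename='transcript_parser.log',
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    logging.getLogger("transcript").debug("parser initialized")
    # transcript_parser.log now contains lines like:
    # 2024-01-01 12:00:00,000 - transcript - DEBUG - parser initialized

Note that DEBUG on the root logger also captures verbose records from any library that propagates to it; INFO is often enough in a deployed Space.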
@@ -72,67 +72,61 @@ class ModelLoader:
 
     def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
         """Lazy load the model with progress feedback"""
-        if self.loaded:
-            return self.model, self.tokenizer
-
-        self.loading = True
-        self.error = None
-
         try:
             if progress:
-                progress(0.1, desc="
+                progress(0.1, desc="Checking GPU availability...")
 
-            # Clear
-            del self.model
-            del self.tokenizer
-            torch.cuda.empty_cache()
-            time.sleep(2)  # Allow CUDA cleanup
-
-            # Load with optimized settings
-            model_kwargs = {
-                "trust_remote_code": True,
-                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
-                "device_map": "auto" if self.device == "cuda" else None,
-                "low_cpu_mem_usage": True
-            }
+            # Clear CUDA cache first
+            torch.cuda.empty_cache()
 
             if progress:
-                progress(0.
+                progress(0.2, desc="Loading tokenizer...")
+
+            tokenizer = AutoTokenizer.from_pretrained(
                 MODEL_NAME,
                 trust_remote_code=True
             )
 
             if progress:
-                progress(0.
+                progress(0.5, desc="Loading model (this may take a few minutes)...")
+
+            # More robust model loading
+            model_kwargs = {
+                "trust_remote_code": True,
+                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
+                "device_map": "auto" if self.device == "cuda" else None,
+                "low_cpu_mem_usage": True,
+                "offload_folder": "offload"  # For handling large models
+            }
+
+            try:
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    **model_kwargs
+                )
+            except torch.cuda.OutOfMemoryError:
+                # Fallback to CPU if GPU OOM
+                model_kwargs["device_map"] = None
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    **model_kwargs
+                ).to('cpu')
+                self.device = 'cpu'
+
+            # Verify model is responsive
+            test_input = tokenizer("Test", return_tensors="pt").to(self.device)
+            _ = model.generate(**test_input, max_new_tokens=1)
 
-            self.model.eval()
-
-            progress(0.9, desc="Finalizing...")
+            self.model = model.eval()
+            self.tokenizer = tokenizer
             self.loaded = True
-            return self.model, self.tokenizer
 
-            logging.error(self.error)
-            return None, None
+            return model, tokenizer
+
         except Exception as e:
-            self.error = f"Model loading
+            self.error = f"Model loading failed: {str(e)}"
            logging.error(self.error)
            return None, None
-        finally:
-            self.loading = False
 
 # Initialize model loader
 model_loader = ModelLoader()
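
A minimal smoke test of the reworked loader, assuming the app.py definitions above (MODEL_NAME, model_loader, the transformers imports) are in scope; this mirrors how the rest of the app consumes the loader rather than prescribing a new API:

    model, tokenizer = model_loader.load_model()
    if model is None:
        print(f"Load failed: {model_loader.error}")
    else:
        inputs = tokenizer("Hello", return_tensors="pt").to(model_loader.device)
        output = model.generate(**inputs, max_new_tokens=8)
        print(tokenizer.decode(output[0], skip_special_tokens=True))

The one-token generate probe added at the end of load_model is a cheap way to surface device-placement errors at load time instead of on the first user request.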
@@ -285,6 +279,22 @@ def remove_sensitive_info(text: str) -> str:
     text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
     return text
 
+def validate_parsed_data(data: Dict) -> bool:
+    """Validate the structure of parsed transcript data"""
+    required_student_fields = ['name', 'current_grade']
+    required_course_fields = ['description', 'credits']
+
+    if 'student_info' not in data:
+        return False
+    if not all(field in data['student_info'] for field in required_student_fields):
+        return False
+    if 'course_history' not in data or not isinstance(data['course_history'], list):
+        return False
+    if len(data['course_history']) > 0:
+        if not all(field in data['course_history'][0] for field in required_course_fields):
+            return False
+    return True
+
 # ========== TRANSCRIPT PARSING ==========
 class TranscriptParser:
     def __init__(self):
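
The unchanged context line above still carries two typos in the e-mail pattern: `[A-Za-z9.-]` drops digits 0-8 from the domain, and `[A-Z|a-z]` matches a literal `|` in the TLD; `[A-Za-z0-9.-]` and `[A-Za-z]{2,}` would be the usual form. A quick check of the new validator (sample data invented):

    sample = {
        "student_info": {"name": "John Doe", "current_grade": "12"},
        "course_history": [{"description": "Algebra I", "credits": 1.0}],
    }
    assert validate_parsed_data(sample)                               # complete -> True
    assert not validate_parsed_data({"student_info": {"name": "X"}})  # missing fields -> False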
@@ -298,7 +308,7 @@ class TranscriptParser:
         """Parse Miami-Dade formatted transcripts with updated regex patterns."""
         try:
             # First try structured parsing for Miami-Dade format
-            if "Graduation Progress Summary" in text
+            if "Graduation Progress Summary" in text or "Miami-Dade" in text:
                 return self._parse_miami_dade_format(text)
             else:
                 # Fall back to AI parsing if not Miami-Dade format
@@ -309,17 +319,26 @@ class TranscriptParser:
             raise ValueError(f"Couldn't parse transcript: {str(e)}")
 
     def _parse_miami_dade_format(self, text: str) -> Dict:
-        """
-        #
+        """More flexible parser for Miami-Dade County Public Schools transcripts."""
+        # Normalize text first
+        text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
+
+        # More flexible student info extraction
         student_match = re.search(
-            r
+            r'(?:Student\s*ID)[:]?\s*(\d+).*?Name[:]?\s*([A-Za-z\s,]+).*?'
+            r'(?:Grade|Level)[:]?\s*(\d+).*?'
+            r'(?:Grad|YOG)[:]?\s*(\d{4}).*?'
+            r'(?:Unweighted\s*GPA)[:]?\s*([\d.]+).*?'
+            r'(?:Weighted\s*GPA)[:]?\s*([\d.]+).*?'
+            r'(?:Total\s*Credits)[:]?\s*([\d.]+).*?'
+            r'(?:Comm\s*Serv|Service\s*Hours)[:]?\s*(\d+)',
+            text, re.IGNORECASE | re.DOTALL
         )
 
         if student_match:
             self.student_data = {
                 "id": student_match.group(1).strip(),
-                "name": student_match.group(2).replace(",", ", ").strip(),
+                "name": student_match.group(2).replace(",", ", ").strip().title(),
                 "current_grade": student_match.group(3),
                 "graduation_year": student_match.group(4),
                 "unweighted_gpa": float(student_match.group(5)),
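
A quick way to sanity-check the new pattern outside the app (the header string is invented; note that group 2 keeps a trailing space, which the parser's .strip() removes):

    import re

    PATTERN = (
        r'(?:Student\s*ID)[:]?\s*(\d+).*?Name[:]?\s*([A-Za-z\s,]+).*?'
        r'(?:Grade|Level)[:]?\s*(\d+).*?(?:Grad|YOG)[:]?\s*(\d{4}).*?'
        r'(?:Unweighted\s*GPA)[:]?\s*([\d.]+).*?(?:Weighted\s*GPA)[:]?\s*([\d.]+).*?'
        r'(?:Total\s*Credits)[:]?\s*([\d.]+).*?(?:Comm\s*Serv|Service\s*Hours)[:]?\s*(\d+)'
    )
    header = ("Student ID: 1234567 Name: DOE, JOHN Grade: 12 YOG: 2024 "
              "Unweighted GPA: 3.5 Weighted GPA: 4.2 Total Credits: 24.5 Comm Serv: 100")
    m = re.search(PATTERN, header, re.IGNORECASE | re.DOTALL)
    print(m.groups())
    # ('1234567', 'DOE, JOHN ', '12', '2024', '3.5', '4.2', '24.5', '100')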
@@ -327,6 +346,23 @@ class TranscriptParser:
                 "total_credits": float(student_match.group(7)),
                 "community_service_hours": int(student_match.group(8))
             }
+        else:
+            # Fallback pattern if first one fails
+            student_match = re.search(
+                r'(\d{7})\s*(.*?)\s*(?:Grade|Grd)[:]?\s*(\d+)',
+                text, re.IGNORECASE
+            )
+            if student_match:
+                self.student_data = {
+                    "id": student_match.group(1).strip(),
+                    "name": student_match.group(2).strip().title(),
+                    "current_grade": student_match.group(3),
+                    "graduation_year": "",
+                    "unweighted_gpa": 0.0,
+                    "weighted_gpa": 0.0,
+                    "total_credits": 0.0,
+                    "community_service_hours": 0
+                }
 
         # Extract requirements
         self.requirements = {}
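
The fallback assumes a bare seven-digit student ID followed by the name; for example (input invented):

    import re

    m = re.search(r'(\d{7})\s*(.*?)\s*(?:Grade|Grd)[:]?\s*(\d+)',
                  "1234567 DOE, JOHN Grade: 11", re.IGNORECASE)
    print(m.groups())  # ('1234567', 'DOE, JOHN', '11')

Since most numeric fields are zeroed in this branch, downstream code should not treat a 0.0 GPA from a fallback parse as a real value.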
@@ -504,143 +540,112 @@ def format_transcript_output(data: Dict) -> str:
 
     return '\n'.join(output)
 
-def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
-    """Enhanced AI parsing with fallback to structured parsing"""
-    try:
-        # First try structured parsing
-        if progress:
-            progress(0.1, desc="Attempting structured parsing...")
-
-        parser = TranscriptParser()
-        parsed_data = parser.parse_transcript(text)
-
-        if progress:
-            progress(0.8, desc="Formatting results...")
-
-        return parsed_data
-
-    except Exception as e:
-        logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
-
-        # Fall back to AI parsing if structured parsing fails
-        return parse_transcript_with_ai_fallback(text, progress)
-
 def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
-    """
-    # Pre-process the text
-    text = remove_sensitive_info(text[:15000])  # Limit input size
-
-    prompt = f"""
-    Analyze this academic transcript and extract structured information in JSON format. Follow this exact structure:
-
-    {{
-        "student_info": {{
-            "name": "Full Name",
-            "id": "Student ID",
-            "current_grade": "Grade Level",
-            "graduation_year": "Year of Graduation",
-            "unweighted_gpa": 0.0,
-            "weighted_gpa": 0.0,
-            "total_credits": 0.0,
-            "community_service_hours": 0
-        }},
-        "requirements": {{
-            "A-English": {{
-                "description": "English requirement description",
-                "required": 4.0,
-                "completed": 4.0,
-                "status": "100%"
-            }}
-        }},
-        "current_courses": [
-            {{
-                "course": "Course Name",
-                "code": "Course Code",
-                "category": "Requirement Category",
-                "term": "Term",
-                "credits": "inProgress or credit value",
-                "grade_level": "Grade Level"
-            }}
-        ],
-        "course_history": [
-            {{
-                "requirement_category": "Category Code",
-                "school_year": "Year Taken",
-                "grade_level": "Grade Level",
-                "course_code": "Course Code",
-                "description": "Course Description",
-                "term": "Term",
-                "grade": "Grade Received",
-                "credits": "Credits Earned"
-            }}
-        ],
-        "graduation_status": {{
-            "total_required_credits": 24.0,
-            "total_completed_credits": 24.0,
-            "percent_complete": 100.0,
-            "remaining_credits": 0.0,
-            "on_track": true
-        }},
-        "format": "miami_dade or standard"
-    }}
-
-    Transcript Text:
-    {text}
-    """
-
+    """More robust AI parsing with better error handling"""
     try:
+        text = remove_sensitive_info(text[:20000])  # Increased limit
+
+        # Improved prompt with examples
+        prompt = f"""Extract academic transcript data as JSON. Follow this structure:
+
+Example Input:
+Student ID: 1234567 Name: DOE, JOHN Current Grade: 12 YOG: 2024
+Unweighted GPA: 3.5 Weighted GPA: 4.2 Total Credits: 24.5
+
+Example Output:
+{{
+    "student_info": {{
+        "name": "John Doe",
+        "id": "1234567",
+        "current_grade": "12",
+        "graduation_year": "2024",
+        "unweighted_gpa": 3.5,
+        "weighted_gpa": 4.2,
+        "total_credits": 24.5
+    }},
+    "course_history": [
+        {{
+            "course_code": "MATH101",
+            "description": "Algebra I",
+            "grade": "A",
+            "credits": 1.0,
+            "school_year": "2022-2023"
+        }}
+    ]
+}}
+
+Actual Transcript:
+{text}
+"""
+
         if progress:
-            progress(0.
+            progress(0.3, desc="Processing with AI...")
 
         model, tokenizer = get_model_and_tokenizer()
-        if model is None
-            raise
-
-        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True).to(model_loader.device)
-        if progress:
-            progress(0.4)
+        if model is None:
+            raise ValueError("Model not loaded")
+
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model_loader.device)
 
         outputs = model.generate(
             **inputs,
-            max_new_tokens=
-            temperature=0.
+            max_new_tokens=2500,
+            temperature=0.3,  # Lower for more consistent results
             do_sample=True,
             top_p=0.9,
-            repetition_penalty=1.
+            repetition_penalty=1.2
         )
-        if progress:
-            progress(0.8)
 
-        # Decode the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        #
+        # More robust JSON extraction
        try:
-        except (IndexError, json.JSONDecodeError):
-            # Fallback: Try to find JSON in the response
-            json_match = re.search(r'\{.*\}', response, re.DOTALL)
-            if json_match:
-                parsed_data = json.loads(json_match.group())
+            if '```json' in response:
+                json_str = response.split('```json')[1].split('```')[0].strip()
             else:
+                json_str = response.split('{', 1)[1].rsplit('}', 1)[0]
+                json_str = '{' + json_str + '}'
+
+            parsed_data = json.loads(json_str)
+
+            # Validate required fields
+            if not all(k in parsed_data for k in ["student_info", "course_history"]):
+                raise ValueError("Missing required fields in AI response")
+
+            return parsed_data
+
+        except Exception as e:
+            logging.error(f"JSON parsing failed: {str(e)}")
+            raise ValueError(f"AI returned invalid format. Please try again.")
 
-        if not all(key in parsed_data for key in required_keys):
-            raise ValueError("AI returned incomplete data structure")
-
-        if progress:
-            progress(
-        return parsed_data
-
-    except torch.cuda.OutOfMemoryError:
-        raise gr.Error("The model ran out of memory. Try with a smaller transcript.")
     except Exception as e:
-        logging.
+        logging.error(f"AI parsing error: {str(e)}")
+        raise gr.Error(f"Failed to parse transcript: {str(e)}")
+
+def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
+    """Enhanced AI parsing with fallback to structured parsing"""
+    try:
+        # First try structured parsing
+        if progress:
+            progress(0.1, desc="Attempting structured parsing...")
+
+        parser = TranscriptParser()
+        parsed_data = parser.parse_transcript(text)
+
+        # Validate the parsed data
+        if not validate_parsed_data(parsed_data):
+            raise ValueError("Structured parsing returned incomplete data")
+
+        if progress:
+            progress(0.8, desc="Formatting results...")
+
+        return parsed_data
+
+    except Exception as e:
+        logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
+
+        # Fall back to AI parsing if structured parsing fails
+        return parse_transcript_with_ai_fallback(text, progress)
 
 async def parse_transcript_async(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
     """Async wrapper for transcript parsing"""
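
The JSON-extraction branch is easiest to verify in isolation. Below is a hypothetical standalone helper, extract_json, that mirrors the committed logic so it can be unit-tested; the commit itself inlines this code:

    import json

    def extract_json(response: str) -> dict:
        """Pull the first JSON object out of a model response (mirrors the diff's logic)."""
        if '```json' in response:
            json_str = response.split('```json')[1].split('```')[0].strip()
        else:
            # Take everything between the first '{' and the last '}'
            json_str = response.split('{', 1)[1].rsplit('}', 1)[0]
            json_str = '{' + json_str + '}'
        return json.loads(json_str)

    print(extract_json('Sure! {"student_info": {"name": "John Doe"}, "course_history": []}'))
    # {'student_info': {'name': 'John Doe'}, 'course_history': []}

One caveat: when the response contains no '{' at all, the else branch raises IndexError rather than ValueError; the enclosing except Exception still catches it, but the log will report "JSON parsing failed" for what is really a missing object.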
@@ -665,27 +670,37 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
         if not text.strip():
             raise ValueError("No text could be extracted from the file. The file may be corrupted or in an unsupported format.")
 
-        #
+        # Try structured parsing first
         if progress:
-            progress(0.4, desc="
-
+            progress(0.4, desc="Attempting structured parsing...")
+
+        parser = TranscriptParser()
+        try:
+            parsed_data = parser.parse_transcript(text)
+            if validate_parsed_data(parsed_data):
+                if progress:
+                    progress(0.9, desc="Formatting results...")
+                return format_transcript_output(parsed_data), parsed_data
+        except Exception as e:
+            logging.warning(f"Structured parsing failed: {str(e)}")
 
-        #
+        # Fall back to AI if structured fails
         if progress:
-            progress(0.
-
+            progress(0.5, desc="Using AI analysis...")
+
+        parsed_data = parse_transcript_with_ai_fallback(text, progress)
+        return format_transcript_output(parsed_data), parsed_data
 
-        return output_text, parsed_data
-
     except Exception as e:
-        error_msg = f"
+        error_msg = f"Error processing transcript: {str(e)}"
+        # Add specific troubleshooting tips
         if "PDF" in str(e):
-            error_msg += "\n\nTIPS
+            error_msg += "\n\nTIPS:\n1. Try converting to image (screenshot)\n2. Ensure text is selectable in PDF\n3. Try a different PDF reader"
         elif "image" in str(e).lower():
-            error_msg += "\n\nTIPS
+            error_msg += "\n\nTIPS:\n1. Use high contrast images\n2. Crop to just the transcript\n3. Ensure good lighting"
+        elif "AI" in str(e):
+            error_msg += "\n\nTIPS:\n1. Try a smaller section of the transcript\n2. Check for sensitive info that may be redacted\n3. Try again later"
+
         logging.error(error_msg)
         return error_msg, None
 
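
The trailing context references parse_transcript_async, but its body is not shown in this diff. One plausible shape for such a wrapper, if it simply offloads the blocking parser to a worker thread (a sketch, not the committed code):

    import asyncio

    async def parse_transcript_async(file_obj, progress=gr.Progress()):
        """Run the blocking parser off the event loop so the Gradio UI stays responsive."""
        return await asyncio.to_thread(parse_transcript, file_obj, progress)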
@@ -1484,7 +1499,7 @@ def create_interface():
         except Exception as e:
             error_msg = f"Error processing transcript: {str(e)}"
             if "PDF" in str(e):
-                error_msg += "\n\nTIPS:\n- Try
+                error_msg += "\n\nTIPS:\n- Try converting to image (screenshot)\n- Ensure text is selectable in PDF\n- Try a different PDF reader"
             return (
                 error_msg,
                 None,