Dannyar608 committed on
Commit 55e2010 · verified · parent d3a1938

Update app.py

Files changed (1): app.py (+199, -184)
app.py CHANGED
@@ -36,9 +36,9 @@ SESSION_TIMEOUT = 3600 # 1 hour session timeout
 
 # Initialize logging
 logging.basicConfig(
-    filename='app.log',
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
+    filename='transcript_parser.log',
+    level=logging.DEBUG,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 )
 
 # Model configuration - Only DeepSeek
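
The logging change raises verbosity from INFO to DEBUG and renames the log file. A minimal standalone sketch of the same configuration (the log message itself is illustrative):

import logging

logging.basicConfig(
    filename='transcript_parser.log',
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logging.debug("recorded now; this would have been dropped at the old INFO level")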
@@ -72,67 +72,61 @@ class ModelLoader:
 
     def load_model(self, progress: gr.Progress = None) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
         """Lazy load the model with progress feedback"""
-        if self.loaded:
-            return self.model, self.tokenizer
-
-        self.loading = True
-        self.error = None
-
         try:
             if progress:
-                progress(0.1, desc="Initializing...")
+                progress(0.1, desc="Checking GPU availability...")
 
-            # Clear previous model if any
-            if self.model:
-                del self.model
-                del self.tokenizer
-                torch.cuda.empty_cache()
-                time.sleep(2)  # Allow CUDA cleanup
-
-            # Load with optimized settings
-            model_kwargs = {
-                "trust_remote_code": True,
-                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
-                "device_map": "auto" if self.device == "cuda" else None,
-                "low_cpu_mem_usage": True
-            }
+            # Clear CUDA cache first
+            torch.cuda.empty_cache()
 
             if progress:
-                progress(0.3, desc="Loading tokenizer...")
-            self.tokenizer = AutoTokenizer.from_pretrained(
+                progress(0.2, desc="Loading tokenizer...")
+
+            tokenizer = AutoTokenizer.from_pretrained(
                 MODEL_NAME,
                 trust_remote_code=True
             )
 
             if progress:
-                progress(0.6, desc="Loading model...")
-            self.model = AutoModelForCausalLM.from_pretrained(
-                MODEL_NAME,
-                **model_kwargs
-            ).to(self.device)
+                progress(0.5, desc="Loading model (this may take a few minutes)...")
+
+            # More robust model loading
+            model_kwargs = {
+                "trust_remote_code": True,
+                "torch_dtype": torch.float16 if self.device == "cuda" else torch.float32,
+                "device_map": "auto" if self.device == "cuda" else None,
+                "low_cpu_mem_usage": True,
+                "offload_folder": "offload"  # For handling large models
+            }
 
-            # Verify model responsiveness
-            if progress:
-                progress(0.8, desc="Verifying model...")
-            test_input = self.tokenizer("Test", return_tensors="pt").to(self.device)
-            _ = self.model.generate(**test_input, max_new_tokens=1)
+            try:
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    **model_kwargs
+                )
+            except torch.cuda.OutOfMemoryError:
+                # Fallback to CPU if GPU OOM
+                model_kwargs["device_map"] = None
+                model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    **model_kwargs
+                ).to('cpu')
+                self.device = 'cpu'
+
+            # Verify model is responsive
+            test_input = tokenizer("Test", return_tensors="pt").to(self.device)
+            _ = model.generate(**test_input, max_new_tokens=1)
 
-            self.model.eval()  # Disable dropout
-            if progress:
-                progress(0.9, desc="Finalizing...")
+            self.model = model.eval()
+            self.tokenizer = tokenizer
             self.loaded = True
-            return self.model, self.tokenizer
 
-        except torch.cuda.OutOfMemoryError:
-            self.error = "Out of GPU memory. Try using CPU instead."
-            logging.error(self.error)
-            return None, None
+            return model, tokenizer
+
         except Exception as e:
-            self.error = f"Model loading error: {str(e)}"
+            self.error = f"Model loading failed: {str(e)}"
             logging.error(self.error)
             return None, None
-        finally:
-            self.loading = False
 
 # Initialize model loader
 model_loader = ModelLoader()
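
A rough usage sketch of the reworked loader, assuming app.py's surrounding definitions (MODEL_NAME, imports); the prompt text is illustrative:

model, tokenizer = model_loader.load_model()
if model is None:
    print(f"Load failed: {model_loader.error}")  # error message set in the except branch
else:
    # model_loader.device may have been downgraded to 'cpu' by the OOM fallback
    inputs = tokenizer("Hello", return_tensors="pt").to(model_loader.device)
    outputs = model.generate(**inputs, max_new_tokens=8)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))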
@@ -285,6 +279,22 @@ def remove_sensitive_info(text: str) -> str:
     text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
     return text
 
+def validate_parsed_data(data: Dict) -> bool:
+    """Validate the structure of parsed transcript data"""
+    required_student_fields = ['name', 'current_grade']
+    required_course_fields = ['description', 'credits']
+
+    if 'student_info' not in data:
+        return False
+    if not all(field in data['student_info'] for field in required_student_fields):
+        return False
+    if 'course_history' not in data or not isinstance(data['course_history'], list):
+        return False
+    if len(data['course_history']) > 0:
+        if not all(field in data['course_history'][0] for field in required_course_fields):
+            return False
+    return True
+
 # ========== TRANSCRIPT PARSING ==========
 class TranscriptParser:
     def __init__(self):
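
The new validate_parsed_data helper only inspects the first course entry. A quick sketch of what passes and what fails (hypothetical records):

sample = {
    "student_info": {"name": "John Doe", "current_grade": "12"},
    "course_history": [{"description": "Algebra I", "credits": 1.0}],
}
assert validate_parsed_data(sample)                                       # all required fields present
assert not validate_parsed_data({"student_info": {"name": "John Doe"}})  # missing current_grade and course_history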
@@ -298,7 +308,7 @@ class TranscriptParser:
         """Parse Miami-Dade formatted transcripts with updated regex patterns."""
         try:
             # First try structured parsing for Miami-Dade format
-            if "Graduation Progress Summary" in text and "Miami-Dade" in text:
+            if "Graduation Progress Summary" in text or "Miami-Dade" in text:
                 return self._parse_miami_dade_format(text)
             else:
                 # Fall back to AI parsing if not Miami-Dade format
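
Switching the condition from and to or widens detection: either marker alone now routes a transcript to the structured parser. Sketch with a hypothetical snippet:

text = "... Graduation Progress Summary ..."  # no "Miami-Dade" present
is_mdc = "Graduation Progress Summary" in text or "Miami-Dade" in text
print(is_mdc)  # True; the old and-condition returned False here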
@@ -309,17 +319,26 @@ class TranscriptParser:
             raise ValueError(f"Couldn't parse transcript: {str(e)}")
 
     def _parse_miami_dade_format(self, text: str) -> Dict:
-        """Specialized parser for Miami-Dade County Public Schools transcripts."""
-        # Extract student info
+        """More flexible parser for Miami-Dade County Public Schools transcripts."""
+        # Normalize text first
+        text = re.sub(r'\s+', ' ', text)  # Collapse multiple spaces
+
+        # More flexible student info extraction
         student_match = re.search(
-            r"(\d{7})\s*-\s*([A-Z\s,]+).*?Current Grade:\s*(\d+).*?YOG\s*(\d{4}).*?Un-weighted GPA\s*([\d.]+).*?Weighted GPA\s*([\d.]+).*?Total Credits Earned\s*([\d.]+).*?Comm Serv Hours\s*(\d+)",
-            text, re.DOTALL
+            r'(?:Student\s*ID[:]?\s*(\d+).*?Name[:]?\s*([A-Za-z\s,]+).*?'
+            r'(?:Grade|Level)[:]?\s*(\d+).*?'
+            r'(?:Grad|YOG)[:]?\s*(\d{4}).*?'
+            r'(?:Unweighted\s*GPA)[:]?\s*([\d.]+).*?'
+            r'(?:Weighted\s*GPA)[:]?\s*([\d.]+).*?'
+            r'(?:Total\s*Credits)[:]?\s*([\d.]+).*?'
+            r'(?:Comm\s*Serv|Service\s*Hours)[:]?\s*(\d+)',
+            text, re.IGNORECASE | re.DOTALL
         )
 
         if student_match:
             self.student_data = {
                 "id": student_match.group(1).strip(),
-                "name": student_match.group(2).replace(",", ", ").strip(),
+                "name": student_match.group(2).replace(",", ", ").strip().title(),
                 "current_grade": student_match.group(3),
                 "graduation_year": student_match.group(4),
                 "unweighted_gpa": float(student_match.group(5)),
@@ -327,6 +346,23 @@ class TranscriptParser:
                 "total_credits": float(student_match.group(7)),
                 "community_service_hours": int(student_match.group(8))
             }
+        else:
+            # Fallback pattern if first one fails
+            student_match = re.search(
+                r'(\d{7})\s*(.*?)\s*(?:Grade|Grd)[:]?\s*(\d+)',
+                text, re.IGNORECASE
+            )
+            if student_match:
+                self.student_data = {
+                    "id": student_match.group(1).strip(),
+                    "name": student_match.group(2).strip().title(),
+                    "current_grade": student_match.group(3),
+                    "graduation_year": "",
+                    "unweighted_gpa": 0.0,
+                    "weighted_gpa": 0.0,
+                    "total_credits": 0.0,
+                    "community_service_hours": 0
+                }
 
         # Extract requirements
         self.requirements = {}
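
The new fallback pattern is easy to exercise in isolation (hypothetical header line):

import re

line = "1234567 DOE, JOHN Grade: 11"
m = re.search(r'(\d{7})\s*(.*?)\s*(?:Grade|Grd)[:]?\s*(\d+)', line, re.IGNORECASE)
print(m.group(1), m.group(2).title(), m.group(3))  # 1234567 Doe, John 11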
@@ -504,143 +540,112 @@ def format_transcript_output(data: Dict) -> str:
 
     return '\n'.join(output)
 
-def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
-    """Enhanced AI parsing with fallback to structured parsing"""
-    try:
-        # First try structured parsing
-        if progress:
-            progress(0.1, desc="Attempting structured parsing...")
-
-        parser = TranscriptParser()
-        parsed_data = parser.parse_transcript(text)
-
-        if progress:
-            progress(0.8, desc="Formatting results...")
-
-        return parsed_data
-
-    except Exception as e:
-        logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
-
-        # Fall back to AI parsing if structured parsing fails
-        return parse_transcript_with_ai_fallback(text, progress)
-
 def parse_transcript_with_ai_fallback(text: str, progress=gr.Progress()) -> Dict:
-    """Fallback AI parsing method with improved prompt engineering"""
-    # Pre-process the text
-    text = remove_sensitive_info(text[:15000])  # Limit input size
-
-    prompt = f"""
-    Analyze this academic transcript and extract structured information in JSON format. Follow this exact structure:
-
-    {{
-        "student_info": {{
-            "name": "Full Name",
-            "id": "Student ID",
-            "current_grade": "Grade Level",
-            "graduation_year": "Year of Graduation",
-            "unweighted_gpa": 0.0,
-            "weighted_gpa": 0.0,
-            "total_credits": 0.0,
-            "community_service_hours": 0
-        }},
-        "requirements": {{
-            "A-English": {{
-                "description": "English requirement description",
-                "required": 4.0,
-                "completed": 4.0,
-                "status": "100%"
-            }}
-        }},
-        "current_courses": [
-            {{
-                "course": "Course Name",
-                "code": "Course Code",
-                "category": "Requirement Category",
-                "term": "Term",
-                "credits": "inProgress or credit value",
-                "grade_level": "Grade Level"
-            }}
-        ],
-        "course_history": [
-            {{
-                "requirement_category": "Category Code",
-                "school_year": "Year Taken",
-                "grade_level": "Grade Level",
-                "course_code": "Course Code",
-                "description": "Course Description",
-                "term": "Term",
-                "grade": "Grade Received",
-                "credits": "Credits Earned"
-            }}
-        ],
-        "graduation_status": {{
-            "total_required_credits": 24.0,
-            "total_completed_credits": 24.0,
-            "percent_complete": 100.0,
-            "remaining_credits": 0.0,
-            "on_track": true
-        }},
-        "format": "miami_dade or standard"
-    }}
-
-    Transcript Text:
-    {text}
-    """
-
+    """More robust AI parsing with better error handling"""
     try:
+        text = remove_sensitive_info(text[:20000])  # Increased limit
+
+        # Improved prompt with examples
+        prompt = f"""Extract academic transcript data as JSON. Follow this structure:
+
+        Example Input:
+        Student ID: 1234567 Name: DOE, JOHN Current Grade: 12 YOG: 2024
+        Unweighted GPA: 3.5 Weighted GPA: 4.2 Total Credits: 24.5
+
+        Example Output:
+        {{
+            "student_info": {{
+                "name": "John Doe",
+                "id": "1234567",
+                "current_grade": "12",
+                "graduation_year": "2024",
+                "unweighted_gpa": 3.5,
+                "weighted_gpa": 4.2,
+                "total_credits": 24.5
+            }},
+            "course_history": [
+                {{
+                    "course_code": "MATH101",
+                    "description": "Algebra I",
+                    "grade": "A",
+                    "credits": 1.0,
+                    "school_year": "2022-2023"
+                }}
+            ]
+        }}
+
+        Actual Transcript:
+        {text}
+        """
+
         if progress:
-            progress(0.1, desc="Processing transcript with AI...")
+            progress(0.3, desc="Processing with AI...")
 
         model, tokenizer = get_model_and_tokenizer()
-        if model is None or tokenizer is None:
-            raise gr.Error(f"Model failed to load. {model_loader.error or 'Please try loading a model first.'}")
-
-        # Tokenize and generate response
-        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True).to(model_loader.device)
-        if progress:
-            progress(0.4)
+        if model is None:
+            raise ValueError("Model not loaded")
+
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(model_loader.device)
 
         outputs = model.generate(
             **inputs,
-            max_new_tokens=2000,
-            temperature=0.1,
+            max_new_tokens=2500,
+            temperature=0.3,  # Lower for more consistent results
             do_sample=True,
             top_p=0.9,
-            repetition_penalty=1.1
+            repetition_penalty=1.2
         )
-        if progress:
-            progress(0.8)
 
-        # Decode the response
         response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-        # Extract JSON from response
+        # More robust JSON extraction
         try:
-            json_str = response.split('```json')[1].split('```')[0].strip()
-            parsed_data = json.loads(json_str)
-        except (IndexError, json.JSONDecodeError):
-            # Fallback: Try to find JSON in the response
-            json_match = re.search(r'\{.*\}', response, re.DOTALL)
-            if json_match:
-                parsed_data = json.loads(json_match.group())
+            if '```json' in response:
+                json_str = response.split('```json')[1].split('```')[0].strip()
             else:
-                raise ValueError("Could not extract JSON from AI response")
+                json_str = response.split('{', 1)[1].rsplit('}', 1)[0]
+                json_str = '{' + json_str + '}'
+
+            parsed_data = json.loads(json_str)
+
+            # Validate required fields
+            if not all(k in parsed_data for k in ["student_info", "course_history"]):
+                raise ValueError("Missing required fields in AI response")
+
+            return parsed_data
+
+        except Exception as e:
+            logging.error(f"JSON parsing failed: {str(e)}")
+            raise ValueError(f"AI returned invalid format. Please try again.")
 
-        # Validate the parsed data structure
-        required_keys = ["student_info", "requirements", "course_history"]
-        if not all(key in parsed_data for key in required_keys):
-            raise ValueError("AI returned incomplete data structure")
+    except Exception as e:
+        logging.error(f"AI parsing error: {str(e)}")
+        raise gr.Error(f"Failed to parse transcript: {str(e)}")
+
+def parse_transcript_with_ai(text: str, progress=gr.Progress()) -> Dict:
+    """Enhanced AI parsing with fallback to structured parsing"""
+    try:
+        # First try structured parsing
+        if progress:
+            progress(0.1, desc="Attempting structured parsing...")
 
+        parser = TranscriptParser()
+        parsed_data = parser.parse_transcript(text)
+
+        # Validate the parsed data
+        if not validate_parsed_data(parsed_data):
+            raise ValueError("Structured parsing returned incomplete data")
+
         if progress:
-            progress(1.0)
+            progress(0.8, desc="Formatting results...")
+
         return parsed_data
-
-    except torch.cuda.OutOfMemoryError:
-        raise gr.Error("The model ran out of memory. Try with a smaller transcript.")
+
     except Exception as e:
-        logging.error(f"AI parsing error: {str(e)}")
-        raise gr.Error(f"Error processing transcript: {str(e)}\n\nPlease try again or contact support with this error message.")
+        logging.warning(f"Structured parsing failed, falling back to AI: {str(e)}")
+
+        # Fall back to AI parsing if structured parsing fails
+        return parse_transcript_with_ai_fallback(text, progress)
 
 async def parse_transcript_async(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
     """Async wrapper for transcript parsing"""
@@ -665,27 +670,37 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Dict]]:
         if not text.strip():
             raise ValueError("No text could be extracted from the file. The file may be corrupted or in an unsupported format.")
 
-        # Use AI for parsing with progress updates
+        # Try structured parsing first
         if progress:
-            progress(0.4, desc="Analyzing transcript content...")
-
-        parsed_data = parse_transcript_with_ai(text, progress)
+            progress(0.4, desc="Attempting structured parsing...")
+
+        parser = TranscriptParser()
+        try:
+            parsed_data = parser.parse_transcript(text)
+            if validate_parsed_data(parsed_data):
+                if progress:
+                    progress(0.9, desc="Formatting results...")
+                return format_transcript_output(parsed_data), parsed_data
+        except Exception as e:
+            logging.warning(f"Structured parsing failed: {str(e)}")
 
-        # Format output text
+        # Fall back to AI if structured fails
         if progress:
-            progress(0.9, desc="Generating report...")
-
-        output_text = format_transcript_output(parsed_data)
+            progress(0.5, desc="Using AI analysis...")
+
+        parsed_data = parse_transcript_with_ai_fallback(text, progress)
+        return format_transcript_output(parsed_data), parsed_data
 
-        return output_text, parsed_data
-
     except Exception as e:
         error_msg = f"Error processing transcript: {str(e)}"
+        # Add specific troubleshooting tips
         if "PDF" in str(e):
-            error_msg += "\n\nTIPS FOR PDF FILES:\n1. Try opening and re-saving the PDF in a different format\n2. Ensure the PDF isn't password protected\n3. Try taking a screenshot of the transcript and uploading as an image"
+            error_msg += "\n\nTIPS:\n1. Try converting to image (screenshot)\n2. Ensure text is selectable in PDF\n3. Try a different PDF reader"
         elif "image" in str(e).lower():
-            error_msg += "\n\nTIPS FOR IMAGE FILES:\n1. Ensure the image is clear and well-lit\n2. Try cropping to just the transcript area\n3. Avoid blurry or low-resolution images"
-
+            error_msg += "\n\nTIPS:\n1. Use high contrast images\n2. Crop to just the transcript\n3. Ensure good lighting"
+        elif "AI" in str(e):
+            error_msg += "\n\nTIPS:\n1. Try a smaller section of the transcript\n2. Check for sensitive info that may be redacted\n3. Try again later"
+
         logging.error(error_msg)
         return error_msg, None
 
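Net effect on parse_transcript: the structured path plus validation runs first and returns early; the AI path is reached only when it fails. A condensed sketch of the resulting control flow, assuming app.py's definitions (file handling elided):

def parse_flow(text: str):
    parser = TranscriptParser()
    try:
        data = parser.parse_transcript(text)        # fast, regex-based path
        if validate_parsed_data(data):
            return format_transcript_output(data), data
    except Exception as e:
        logging.warning(f"Structured parsing failed: {e}")
    data = parse_transcript_with_ai_fallback(text)  # slower AI path
    return format_transcript_output(data), data
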
@@ -1484,7 +1499,7 @@ def create_interface():
         except Exception as e:
             error_msg = f"Error processing transcript: {str(e)}"
             if "PDF" in str(e):
-                error_msg += "\n\nTIPS:\n- Try re-saving the PDF\n- Ensure it's not password protected\n- Try converting to an image"
+                error_msg += "\n\nTIPS:\n- Try converting to image (screenshot)\n- Ensure text is selectable in PDF\n- Try a different PDF reader"
             return (
                 error_msg,
                 None,
 