Dannyar608 committed on
Commit
a0e5ea9
·
verified ·
1 Parent(s): 5c437e2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -15
app.py CHANGED
@@ -175,14 +175,21 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
175
  try:
176
  if file_ext == '.pdf':
177
  try:
 
 
 
 
 
 
 
 
 
178
  doc = fitz.open(file_path)
179
  for page in doc:
180
  text += page.get_text("text") + '\n'
181
  if not text.strip():
182
- raise ValueError("PyMuPDF returned empty text")
183
- except Exception as e:
184
- logging.warning(f"PyMuPDF failed: {str(e)}. Trying OCR fallback...")
185
- text = extract_text_from_pdf_with_ocr(file_path)
186
 
187
  elif file_ext in ['.png', '.jpg', '.jpeg']:
188
  text = extract_text_with_ocr(file_path)
@@ -201,13 +208,22 @@ def extract_text_from_file(file_path: str, file_ext: str) -> str:
201
  def extract_text_from_pdf_with_ocr(file_path: str) -> str:
202
  text = ""
203
  try:
204
- doc = fitz.open(file_path)
205
- for page in doc:
206
- pix = page.get_pixmap()
207
- img = Image.open(io.BytesIO(pix.tobytes()))
208
- img = img.convert('L')
209
- img = img.point(lambda x: 0 if x < 128 else 255)
210
- text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
 
 
 
 
 
 
 
 
 
211
  except Exception as e:
212
  raise ValueError(f"PDF OCR failed: {str(e)}")
213
  return text
@@ -434,18 +450,23 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
434
  if progress:
435
  progress(0.2, desc="Extracting text from file...")
436
 
437
- text = extract_text_from_file(file_obj.name, file_ext)
 
 
 
438
 
439
  if not text.strip():
440
- raise ValueError("No text could be extracted from the file.")
441
 
442
  if progress:
443
  progress(0.5, desc="Parsing transcript...")
444
 
445
  parser = TranscriptParser()
446
- parsed_data = parser.parse_transcript(text)
 
 
 
447
 
448
- # Return simple confirmation message
449
  confirmation = "Transcript processed successfully."
450
  if 'gpa' in parsed_data.get('student_info', {}):
451
  confirmation += f"\nGPA detected: {parsed_data['student_info']['gpa']}"
@@ -455,6 +476,7 @@ def parse_transcript(file_obj, progress=gr.Progress()) -> Tuple[str, Optional[Di
455
  except Exception as e:
456
  error_msg = f"Error processing transcript: {str(e)}"
457
  logging.error(error_msg)
 
458
  return error_msg, None
459
 
460
  # ========== LEARNING STYLE QUIZ ==========
 
175
  try:
176
  if file_ext == '.pdf':
177
  try:
178
+ # First try pdfplumber for better table extraction
179
+ import pdfplumber
180
+ with pdfplumber.open(file_path) as pdf:
181
+ for page in pdf.pages:
182
+ text += page.extract_text() + '\n'
183
+ if not text.strip():
184
+ raise ValueError("PDFPlumber returned empty text")
185
+ except Exception as e:
186
+ logging.warning(f"PDFPlumber failed: {str(e)}. Trying PyMuPDF...")
187
  doc = fitz.open(file_path)
188
  for page in doc:
189
  text += page.get_text("text") + '\n'
190
  if not text.strip():
191
+ raise ValueError("PyMuPDF returned empty text, trying OCR fallback...")
192
+ text = extract_text_from_pdf_with_ocr(file_path)
 
 
193
 
194
  elif file_ext in ['.png', '.jpg', '.jpeg']:
195
  text = extract_text_with_ocr(file_path)
 
208
  def extract_text_from_pdf_with_ocr(file_path: str) -> str:
209
  text = ""
210
  try:
211
+ # Try pdf2image first if available
212
+ try:
213
+ from pdf2image import convert_from_path
214
+ images = convert_from_path(file_path)
215
+ for img in images:
216
+ text += pytesseract.image_to_string(img) + '\n'
217
+ return text
218
+ except ImportError:
219
+ # Fallback to PyMuPDF for image extraction
220
+ doc = fitz.open(file_path)
221
+ for page in doc:
222
+ pix = page.get_pixmap()
223
+ img = Image.open(io.BytesIO(pix.tobytes()))
224
+ img = img.convert('L')
225
+ img = img.point(lambda x: 0 if x < 128 else 255)
226
+ text += pytesseract.image_to_string(img, config='--psm 6 --oem 3') + '\n'
227
  except Exception as e:
228
  raise ValueError(f"PDF OCR failed: {str(e)}")
229
  return text
 
450
  if progress:
451
  progress(0.2, desc="Extracting text from file...")
452
 
453
+ try:
454
+ text = extract_text_from_file(file_obj.name, file_ext)
455
+ except Exception as e:
456
+ raise ValueError(f"Failed to extract text: {str(e)}. The file may be corrupted or in an unsupported format.")
457
 
458
  if not text.strip():
459
+ raise ValueError("The file appears to be empty or contains no readable text.")
460
 
461
  if progress:
462
  progress(0.5, desc="Parsing transcript...")
463
 
464
  parser = TranscriptParser()
465
+ try:
466
+ parsed_data = parser.parse_transcript(text)
467
+ except Exception as e:
468
+ raise ValueError(f"Couldn't parse transcript content. Error: {str(e)}")
469
 
 
470
  confirmation = "Transcript processed successfully."
471
  if 'gpa' in parsed_data.get('student_info', {}):
472
  confirmation += f"\nGPA detected: {parsed_data['student_info']['gpa']}"
 
476
  except Exception as e:
477
  error_msg = f"Error processing transcript: {str(e)}"
478
  logging.error(error_msg)
479
+ # Return more detailed error to user
480
  return error_msg, None
481
 
482
  # ========== LEARNING STYLE QUIZ ==========