import os
import re
from pathlib import Path

import pdf2image
import pytesseract

# If the Tesseract binary is not on PATH, point pytesseract at it explicitly,
# for example on Windows:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Everything except word characters, whitespace and . , & % is dropped.
_DISALLOWED_CHARS = re.compile(r"[^\w\s.,&%]")
_WHITESPACE_RUN = re.compile(r"\s+")


def _clean_ocr_text(text):
    """Strip disallowed symbols from *text* and normalize its whitespace."""
    # Remove symbols BEFORE collapsing whitespace: a symbol that sat between
    # two spaces ("a - b") would otherwise leave a double space behind.
    without_symbols = _DISALLOWED_CHARS.sub("", text)
    return _WHITESPACE_RUN.sub(" ", without_symbols).strip()


def extract_pdf_text(pdf_file):
    """Run OCR over every page of a PDF and return the cleaned text.

    Args:
        pdf_file: A filesystem path (``str`` or ``Path``), or any object
            with a ``read()`` method whose result is passed to
            ``pdf2image.convert_from_bytes``.

    Returns:
        The text of all pages joined into a single string, with characters
        other than word characters, whitespace and ``. , & %`` removed and
        whitespace runs collapsed to single spaces. Returns ``None`` when
        *pdf_file* is ``None``, is of an unsupported type, or when reading,
        rendering or OCR raises.
    """
    if pdf_file is None:
        return None

    # Deliberately best-effort: any failure (missing file, unreadable PDF,
    # OCR backend error) is reported on stdout and signalled with None.
    try:
        if isinstance(pdf_file, (str, Path)):
            pdf_bytes = Path(pdf_file).read_bytes()
        elif hasattr(pdf_file, "read"):
            pdf_bytes = pdf_file.read()
        else:
            return None

        # Render each page to an image, then OCR the pages one by one.
        page_images = pdf2image.convert_from_bytes(pdf_bytes)
        page_texts = [pytesseract.image_to_string(img) for img in page_images]
        return _clean_ocr_text("\n".join(page_texts))
    except Exception as e:
        print(f"❌ Error during OCR: {e}")
        return None