Spaces:
Sleeping
Sleeping
import re | |
import pdf2image | |
import pytesseract | |
from pathlib import Path | |
import os | |
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" | |
def _clean_ocr_text(raw_text):
    """Normalize raw OCR output into a single tidy line of text.

    Keeps word characters, whitespace and the symbols ``. , & %``; every
    other character is removed and whitespace runs collapse to one space.
    """
    # Drop disallowed symbols first so the whitespace pass below also closes
    # the gaps they leave behind ("a - b" -> "a b" rather than "a  b").
    stripped = re.sub(r'[^\w\s.,&%]', '', raw_text)
    return re.sub(r'\s+', ' ', stripped).strip()


def extract_pdf_text(pdf_file):
    """Run OCR over every page of a PDF and return the cleaned text.

    Args:
        pdf_file: One of
            * ``None`` -- nothing to process;
            * ``bytes`` / ``bytearray`` holding the PDF content;
            * ``str`` or ``pathlib.Path`` pointing at a PDF on disk;
            * any object exposing ``read()`` that yields the PDF bytes.

    Returns:
        The OCR text of all pages joined together and cleaned by
        ``_clean_ocr_text``, or ``None`` when the input is missing, empty,
        of an unsupported type, or when reading/rasterizing/OCR fails.
    """
    if pdf_file is None:
        return None
    try:
        if isinstance(pdf_file, (bytes, bytearray)):
            pdf_bytes = bytes(pdf_file)
        elif isinstance(pdf_file, (str, Path)):
            pdf_bytes = Path(pdf_file).read_bytes()
        elif hasattr(pdf_file, "read"):
            pdf_bytes = pdf_file.read()
        else:
            return None

        # An empty payload can never be rasterized; skip the OCR pipeline.
        if not pdf_bytes:
            return None

        images = pdf2image.convert_from_bytes(pdf_bytes)
        page_texts = [pytesseract.image_to_string(img) for img in images]
        return _clean_ocr_text("\n".join(page_texts))
    except Exception as e:
        # Best-effort boundary: file access, pdf2image and pytesseract can
        # each fail in their own ways; callers only expect text or None.
        print(f"β Error during OCR: {e}")
        return None