rfahlevih's picture
Initial Commit
5581268 verified
raw
history blame contribute delete
922 Bytes
import re
import pdf2image
import pytesseract
from pathlib import Path
import os
# On Windows, if the Tesseract executable is not on PATH, uncomment and adjust:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
def _clean_ocr_text(text):
    """Strip disallowed symbols from *text*, then collapse whitespace runs."""
    # Keep only word characters, whitespace and the punctuation . , & %
    without_symbols = re.sub(r'[^\w\s.,&%]', '', text)
    # Collapse whitespace AFTER removing symbols: a removed symbol that had a
    # space on each side ("a - b") would otherwise leave a double space.
    return re.sub(r'\s+', ' ', without_symbols).strip()


def extract_pdf_text(pdf_file):
    """Run OCR over every page of a PDF and return the cleaned text.

    Args:
        pdf_file: A filesystem path (``str`` or ``Path``) or an object with a
            ``read()`` method that yields the PDF content. ``None`` is
            accepted and short-circuits.

    Returns:
        The OCR text of all pages as a single line: page texts are joined,
        every character other than word characters, whitespace and
        ``. , & %`` is removed, and whitespace runs are collapsed to one
        space. Returns ``None`` when ``pdf_file`` is ``None``, is of an
        unsupported type, or when reading / rendering / OCR fails (the error
        is printed rather than raised).
    """
    if pdf_file is None:
        return None

    # Best-effort boundary: any failure while reading the file, rasterising
    # the pages or running Tesseract is reported and mapped to None so that
    # callers only need to handle a missing result.
    try:
        if isinstance(pdf_file, (str, Path)):
            pdf_bytes = Path(pdf_file).read_bytes()
        elif hasattr(pdf_file, "read"):
            pdf_bytes = pdf_file.read()
        else:
            # Neither a path nor a readable object: nothing we can OCR.
            return None

        pages = pdf2image.convert_from_bytes(pdf_bytes)
        page_texts = [pytesseract.image_to_string(page) for page in pages]
        return _clean_ocr_text("\n".join(page_texts))
    except Exception as e:
        print(f"❌ Error during OCR: {e}")
        return None