Spaces:
Sleeping
Sleeping
File size: 922 Bytes
5581268 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
import re
import pdf2image
import pytesseract
from pathlib import Path
import os
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
def extract_pdf_text(pdf_file):
    """Run OCR over every page of a PDF and return the text as one cleaned line.

    The PDF is rasterized with ``pdf2image`` and each page image is passed to
    ``pytesseract``. The page texts are joined, characters other than word
    characters, whitespace and ``. , & %`` are removed, and every run of
    whitespace is collapsed to a single space.

    Args:
        pdf_file: A filesystem path (``str`` or ``Path``) or an object exposing
            ``read()``; the value returned by ``read()`` is handed to
            ``pdf2image.convert_from_bytes``, so it is presumably raw PDF bytes.

    Returns:
        The cleaned text with leading/trailing whitespace stripped, or ``None``
        when ``pdf_file`` is ``None``, is neither a path nor readable, or any
        step raises (the error is printed to stdout).
    """
    if pdf_file is None:
        return None

    try:
        if isinstance(pdf_file, (str, Path)):
            pdf_bytes = Path(pdf_file).read_bytes()
        elif hasattr(pdf_file, "read"):
            pdf_bytes = pdf_file.read()
        else:
            # Unsupported input type: nothing to OCR.
            return None

        images = pdf2image.convert_from_bytes(pdf_bytes)
        combined_text = "\n".join(
            pytesseract.image_to_string(img) for img in images
        )

        # Filter characters BEFORE collapsing whitespace: removing a
        # standalone symbol (e.g. the dash in "Total - 50%") would otherwise
        # leave two adjacent spaces in the result.
        cleaned = re.sub(r'[^\w\s.,&%]', '', combined_text)
        cleaned = re.sub(r'\s+', ' ', cleaned)
        return cleaned.strip()
    except Exception as e:
        # Deliberate best-effort boundary: file, PDF-conversion and OCR
        # failures are all reported and turned into a None result.
        print(f"❌ Error during OCR: {e}")
        return None