Spaces:

genaibeauty
/

stock_analysis_rag_project

Runtime error

stock_analysis_rag_project / extract_pdf.py

Rename pdf_processing.py to extract_pdf.py

fe000d0 verified 5 months ago

920 Bytes

	import fitz # PyMuPDF
	import pdfplumber
	import pytesseract
	from PIL import Image
	import io

	def extract_text_from_pdf(pdf_path) -> str:
	    """Extract the embedded text layer of a PDF.

	    PyMuPDF is tried first. If it yields nothing but whitespace, the file
	    is re-read with pdfplumber as a fallback. No OCR is performed here, so
	    a purely scanned (image-only) PDF produces an empty string.

	    Args:
	        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``
	            and ``pdfplumber.open``.

	    Returns:
	        The text of every page, each page followed by a newline, with
	        leading and trailing whitespace stripped from the final result.
	    """
	    # The context manager closes the document even if a page fails to parse.
	    with fitz.open(pdf_path) as doc:
	        text = "".join(page.get_text("text") + "\n" for page in doc)

	    # If no text is extracted, use pdfplumber.
	    if not text.strip():
	        with pdfplumber.open(pdf_path) as pdf:
	            # extract_text() may return None for a page without a text
	            # layer (older pdfplumber releases do); treat that as empty
	            # instead of failing on None + str.
	            text = "".join(
	                (page.extract_text() or "") + "\n" for page in pdf.pages
	            )

	    return text.strip()

	def extract_text_from_scanned_pdf(pdf_path) -> str:
	    """Extract text from a scanned PDF by running OCR on every page.

	    Each page is rendered to a PNG image with PyMuPDF (default
	    ``get_pixmap`` settings), loaded through Pillow, and passed to
	    Tesseract via ``pytesseract.image_to_string``.

	    Args:
	        pdf_path: Location of the PDF, passed unchanged to ``fitz.open``.

	    Returns:
	        The recognized text of all pages separated by newlines, with
	        leading and trailing whitespace stripped.
	    """
	    page_texts = []

	    # Close the document deterministically once every page is processed.
	    with fitz.open(pdf_path) as doc:
	        for page in doc:
	            png_bytes = page.get_pixmap().tobytes("png")
	            # Release the decoded image as soon as OCR for this page is done.
	            with Image.open(io.BytesIO(png_bytes)) as page_image:
	                page_texts.append(pytesseract.image_to_string(page_image))

	    return "\n".join(page_texts).strip()