Spaces:

genaibeauty
/

stock_analysis_rag_project

Runtime error

File size: 920 Bytes

98d9183

import fitz  # PyMuPDF
import pdfplumber
import pytesseract
from PIL import Image
import io

def extract_text_from_pdf(pdf_path):
    """Extract the embedded text layer of a PDF.

    PyMuPDF is tried first. If it yields nothing but whitespace, the file is
    read again with pdfplumber. No OCR happens here, so an image-only
    (scanned) PDF produces an empty string; use
    ``extract_text_from_scanned_pdf`` for those.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, one page per newline-terminated chunk, with
        leading and trailing whitespace stripped. Empty if neither library
        finds any text.
    """
    # The context manager closes the document (and its file handle) even if
    # a page fails to parse.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") + "\n" for page in doc)

    # Fall back to pdfplumber only when PyMuPDF found no usable text.
    if not text.strip():
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may return None for a page without a text
            # layer (seen in older pdfplumber releases); treat it as empty
            # instead of failing on None + str.
            text = "".join(
                (page.extract_text() or "") + "\n" for page in pdf.pages
            )

    return text.strip()

def extract_text_from_scanned_pdf(pdf_path):
    """Extract text from a scanned PDF by running OCR on every page.

    Each page is rendered to a PNG image with PyMuPDF (default render
    settings) and passed to Tesseract via ``pytesseract``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The recognized text of all pages joined by newlines, with leading
        and trailing whitespace stripped.
    """
    page_texts = []

    # Close the document deterministically, even when rendering or OCR
    # raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pixmap = page.get_pixmap()
            png_bytes = pixmap.tobytes("png")
            # Release the decoded image as soon as OCR for the page is done.
            with Image.open(io.BytesIO(png_bytes)) as page_image:
                page_texts.append(pytesseract.image_to_string(page_image))

    return "\n".join(page_texts).strip()