Spaces:

genaibeauty
/

stock_analysis_rag_project

Runtime error

genaibeauty committed on Jan 30

Commit

98d9183

verified ·

1 Parent(s): 42f6bbe

Create pdf_processing.py

Files changed (1) hide show

pdf_processing.py ADDED Viewed

+import fitz  # PyMuPDF
+import pdfplumber
+import pytesseract
+from PIL import Image
+import io
+def extract_text_from_pdf(pdf_path):
+    """Extract text from normal and scanned PDFs."""
+    text = ""
+    # Try PyMuPDF first
+    doc = fitz.open(pdf_path)
+    for page in doc:
+        text += page.get_text("text") + "\n"
+    # If no text is extracted, use PDFPlumber
+    if not text.strip():
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                text += page.extract_text() + "\n"
+    return text.strip()
+def extract_text_from_scanned_pdf(pdf_path):
+    """Extract text from scanned PDFs using OCR."""
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        img = page.get_pixmap()
+        img_bytes = img.tobytes("png")
+        img_pil = Image.open(io.BytesIO(img_bytes))
+        text += pytesseract.image_to_string(img_pil) + "\n"
+    return text.strip()