Spaces:

mgbam
/

builder

Running

App Files Files Community

mgbam committed on Jul 18

Commit

1ae58ff

verified ·

1 Parent(s): 1687ea3

Rename ux_components.py to extractor.py

Browse files

Files changed (2) hide show

extractor.py +109 -0
ux_components.py +0 -24

extractor.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# /extractor.py
+"""
+Handles content extraction from various sources like files, images, and websites.
+This module encapsulates the logic for parsing different file formats (PDF, DOCX),
+performing Optical Character Recognition (OCR) on images, and scraping web content.
+"""
+import mimetypes
+import os
+import re
+from urllib.parse import urlparse, urljoin
+import logging
+import PyPDF2
+import docx
+import requests
+from bs4 import BeautifulSoup
+# --- Setup Logging ---
+# Configure logging at import time: INFO and above, with each record
+# formatted as "<timestamp> - <level> - <message>".
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+# --- Optional OCR Imports ---
+# cv2, numpy and pytesseract are optional dependencies. OCR_AVAILABLE records
+# whether all three imported, so extract_text_from_image can return an error
+# message instead of failing when any of them is missing.
+try:
+    import cv2
+    import numpy as np
+    import pytesseract
+    OCR_AVAILABLE = True
+except ImportError:
+    OCR_AVAILABLE = False
+    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
+def extract_text_from_image(image_path: str) -> str:
+    """Extracts text from an image file using Tesseract OCR."""
+    if not OCR_AVAILABLE:
+        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
+    try:
+        pytesseract.get_tesseract_version()
+    except Exception:
+        return "Error: Tesseract OCR is not installed or not in your PATH."
+    try:
+        image = cv2.imread(image_path)
+        if image is None:
+            return "Error: Could not read image file."
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        text = pytesseract.image_to_string(gray)
+        return text.strip() or "No text found in image."
+    except Exception as e:
+        logging.error(f"OCR extraction failed: {e}")
+        return f"Error during OCR: {e}"
+def extract_text_from_file(file_path: str) -> str:
+    """Extracts text from a variety of file types."""
+    if not file_path:
+        return ""
+    ext = os.path.splitext(file_path)[1].lower()
+    try:
+        if ext == ".pdf":
+            with open(file_path, "rb") as f:
+                reader = PyPDF2.PdfReader(f)
+                return "\n".join(page.extract_text() or "" for page in reader.pages)
+        elif ext == ".docx":
+            doc = docx.Document(file_path)
+            return "\n".join(p.text for p in doc.paragraphs)
+        elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                return f.read()
+        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
+            return extract_text_from_image(file_path)
+        else:
+            return f"Unsupported file type: {ext}"
+    except Exception as e:
+        logging.error(f"Error extracting text from {file_path}: {e}")
+        return f"Error extracting text: {e}"
+def extract_website_content(url: str) -> str:
+    """Scrapes and returns the primary HTML content of a given URL."""
+    try:
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
+        response.raise_for_status()
+        response.encoding = response.apparent_encoding
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # Make all resource links absolute
+        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
+            for item in soup.find_all(tag):
+                if item.has_attr(attr):
+                    item[attr] = urljoin(url, item[attr])
+        title = soup.title.string if soup.title else "N/A"
+        # Return a prettified version of the body content for context
+        body_content = soup.body.prettify() if soup.body else str(soup)
+        # Truncate for prompt
+        if len(body_content) > 15000:
+             body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
+        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"
+    except requests.RequestException as e:
+        logging.error(f"Website extraction failed for {url}: {e}")
+        return f"Error: Could not fetch content from the URL. Details: {e}"
+    except Exception as e:
+        logging.error(f"An unexpected error occurred during website extraction: {e}")
+        return f"Error: An unexpected error occurred. Details: {e}"

ux_components.py DELETED Viewed

@@ -1,24 +0,0 @@
-import gradio as gr
-from config import DEMO_LIST
-def create_top_demo_cards(input_textbox):
-    """Creates a Gradio Column with buttons for the top 3 demo examples."""
-    with gr.Column(visible=True) as quick_examples_col:
-        for i, demo_item in enumerate(DEMO_LIST[:3]):
-            demo_card = gr.Button(
-                value=demo_item['title'],
-                variant="secondary",
-                size="sm",
-                elem_id=f"demo_card_{i}"  # Add an ID for potential styling
-            )
-            demo_card.click(
-                fn=lambda idx=i: gr.update(value=DEMO_LIST[idx]['description']),
-                outputs=input_textbox,
-            )
-    return quick_examples_col
-# Standalone preview: running this file directly builds a minimal Blocks app
-# with a single textbox wired to the demo cards, then launches it.
-if __name__ == "__main__":
-    with gr.Blocks() as demo:
-        input_textbox = gr.Textbox(label="Input")
-        create_top_demo_cards(input_textbox)
-    demo.launch()