File size: 4,345 Bytes
1ae58ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# /extractor.py
"""
Handles content extraction from various sources like files, images, and websites.
This module encapsulates the logic for parsing different file formats (PDF, DOCX),
performing Optical Character Recognition (OCR) on images, and scraping web content.
"""
import mimetypes
import os
import re
from urllib.parse import urlparse, urljoin
import logging
import PyPDF2
import docx
import requests
from bs4 import BeautifulSoup
# --- Setup Logging ---
# NOTE(review): calling basicConfig at import time configures the ROOT logger
# and can clobber the host application's logging setup; a module-level
# `logger = logging.getLogger(__name__)` is usually preferable — confirm no
# caller relies on this global configuration before changing it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- Optional OCR Imports ---
# OCR support is optional: if any of the image/OCR libraries are missing we
# degrade gracefully (OCR_AVAILABLE = False) instead of failing at import.
# extract_text_from_image() checks this flag before attempting OCR.
try:
    import cv2
    import numpy as np
    import pytesseract
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
def extract_text_from_image(image_path: str) -> str:
    """Run Tesseract OCR on an image file and return any recognized text.

    Follows this module's convention of returning a human-readable error
    string (rather than raising) when the OCR stack is unavailable, the
    image cannot be read, or recognition itself fails.
    """
    # Guard 1: the optional OCR libraries were never imported.
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
    # Guard 2: the Python bindings exist but the tesseract binary is missing.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."
    try:
        img = cv2.imread(image_path)
        if img is None:
            return "Error: Could not read image file."
        # Convert to grayscale before recognition.
        grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        recognized = pytesseract.image_to_string(grayscale).strip()
        return recognized if recognized else "No text found in image."
    except Exception as e:
        logging.error(f"OCR extraction failed: {e}")
        return f"Error during OCR: {e}"
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file, dispatching on its extension.

    Supports PDF (PyPDF2), DOCX (python-docx), common plain-text formats,
    and raster images (delegated to OCR via extract_text_from_image).
    Returns "" for a falsy path and, per this module's convention, a
    human-readable error string (rather than raising) on failure.
    """
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    # Extension groups as sets for O(1) membership tests. The lists are
    # broadened vs. the originals: .tif/.webp are readable by OpenCV's
    # imread, and the extra text formats are all safe to read as UTF-8.
    text_exts = {".txt", ".md", ".csv", ".html", ".css", ".js", ".py",
                 ".json", ".xml", ".log", ".yaml", ".yml"}
    image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for pages without a text layer.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        elif ext in text_exts:
            # errors="ignore" tolerates files that are not strictly valid UTF-8.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in image_exts:
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        # Lazy %-style args: formatting is skipped if the level is disabled.
        logging.error("Error extracting text from %s: %s", file_path, e)
        return f"Error extracting text: {e}"
def extract_website_content(url: str, max_chars: int = 15000) -> str:
    """Fetch a URL and return its (truncated) body HTML with metadata comments.

    Resource links (img/src, link/href, script/src) are rewritten to absolute
    URLs so the snippet stays usable out of context. Per this module's
    convention, returns a human-readable error string rather than raising.

    Args:
        url: The page to fetch.
        max_chars: Maximum length of the body HTML before truncation
            (default preserves the original 15000-character limit).
    """
    try:
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # apparent_encoding sniffs the body bytes; more reliable than the
        # (often missing or wrong) charset in the Content-Type header.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # Make all resource links absolute so they resolve outside the page.
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])
        # Fix: guard against BOTH a missing <title> tag and an empty one —
        # soup.title.string is None for <title></title>, which previously
        # produced the literal text "None" instead of "N/A".
        title = "N/A"
        if soup.title and soup.title.string:
            title = soup.title.string.strip() or "N/A"
        # Return a prettified version of the body content for context.
        body_content = soup.body.prettify() if soup.body else str(soup)
        # Truncate so the result fits in a prompt.
        if len(body_content) > max_chars:
            body_content = body_content[:max_chars] + "\n<!-- ... HTML truncated ... -->"
        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"
    except requests.RequestException as e:
        logging.error("Website extraction failed for %s: %s", url, e)
        return f"Error: Could not fetch content from the URL. Details: {e}"
    except Exception as e:
        logging.error("An unexpected error occurred during website extraction: %s", e)
        return f"Error: An unexpected error occurred. Details: {e}"