# /extractor.py
"""
Handles content extraction from various sources like files, images, and websites.

This module encapsulates the logic for parsing different file formats (PDF, DOCX),
performing Optical Character Recognition (OCR) on images, and scraping web content.
"""
import mimetypes
import os
import re
from urllib.parse import urlparse, urljoin
import logging

import PyPDF2
import docx
import requests
from bs4 import BeautifulSoup

# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Optional OCR Imports ---
# OCR support is optional: the module still loads (and file/website extraction
# still works) when cv2/numpy/pytesseract are absent.
try:
    import cv2
    import numpy as np
    import pytesseract
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")


def extract_text_from_image(image_path: str) -> str:
    """Extracts text from an image file using Tesseract OCR.

    Args:
        image_path: Path to an image readable by OpenCV.

    Returns:
        The recognized text, or a human-readable ``"Error: ..."`` string when
        OCR is unavailable or fails. This function never raises.
    """
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
    # The Python wrapper can be installed while the Tesseract binary is missing;
    # probe the binary explicitly before attempting recognition.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."
    try:
        image = cv2.imread(image_path)
        if image is None:  # cv2.imread returns None (no exception) on unreadable files
            return "Error: Could not read image file."
        # Grayscale input generally improves Tesseract accuracy on color images.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        return text.strip() or "No text found in image."
    except Exception as e:
        logging.error("OCR extraction failed: %s", e)
        return f"Error during OCR: {e}"


def extract_text_from_file(file_path: str) -> str:
    """Extracts text from a variety of file types.

    Dispatches on the file extension: PDFs via PyPDF2, DOCX via python-docx,
    plain-text formats by direct read, and images via OCR.

    Args:
        file_path: Path to the file; an empty/falsy path yields "".

    Returns:
        The extracted text, or a human-readable ``"Error: ..."``/"Unsupported"
        message. This function never raises.
    """
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for image-only pages; coalesce to "".
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py", ".json", ".xml", ".log"]:
            # errors="ignore": tolerate files that are not valid UTF-8 rather than fail.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"]:
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        logging.error("Error extracting text from %s: %s", file_path, e)
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Scrapes and returns the primary HTML content of a given URL.

    Args:
        url: Fully-qualified URL to fetch.

    Returns:
        A string containing the page title followed by the (possibly truncated)
        prettified body HTML, or a human-readable ``"Error: ..."`` message on
        failure. This function never raises.
    """
    try:
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # apparent_encoding (content-sniffed) is more reliable than the
        # header-declared encoding for mislabeled pages.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Make all resource links absolute so extracted HTML remains resolvable.
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])

        # Guard both a missing <title> tag and an empty one (.string is None then).
        title = soup.title.string if soup.title and soup.title.string else "N/A"

        # Return a prettified version of the body content for context.
        body_content = soup.body.prettify() if soup.body else str(soup)

        # Truncate for prompt-size limits.
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n"

        # BUG FIX: `title` was previously computed but dropped from the result.
        return f"Title: {title}\n\n{body_content}"
    except requests.RequestException as e:
        logging.error("Website extraction failed for %s: %s", url, e)
        return f"Error: Could not fetch content from the URL. Details: {e}"
    except Exception as e:
        logging.error("An unexpected error occurred during website extraction: %s", e)
        return f"Error: An unexpected error occurred. Details: {e}"