"""
Handles content extraction from various sources like files, images, and websites.

This module encapsulates the logic for parsing different file formats (PDF, DOCX),
performing Optical Character Recognition (OCR) on images, and scraping web content.
"""

import logging
import os
from urllib.parse import urljoin

import PyPDF2
import docx
import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


try:
    import cv2
    import numpy as np
    import pytesseract
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")


def extract_text_from_image(image_path: str) -> str:
    """Extracts text from an image file using Tesseract OCR."""
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
    try:
        # The Tesseract binary may be missing even when the Python bindings import cleanly.
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."

    try:
        image = cv2.imread(image_path)
        if image is None:
            return f"Error: Could not read image file: {image_path}"
        # Tesseract generally performs better on single-channel input.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        return text.strip() or "No text found in image."
    except Exception as e:
        logging.error(f"OCR extraction failed: {e}")
        return f"Error during OCR: {e}"


def extract_text_from_file(file_path: str) -> str:
    """Extracts text from a variety of file types."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        logging.error(f"Error extracting text from {file_path}: {e}")
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Scrapes and returns the primary HTML content of a given URL."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Rewrite relative asset URLs to absolute ones so the extracted HTML
        # remains renderable outside its original origin.
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])

        title = soup.title.string.strip() if soup.title and soup.title.string else "N/A"

        body_content = soup.body.prettify() if soup.body else str(soup)

        # Truncate very large pages so downstream consumers are not flooded.
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"

        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"

    except requests.RequestException as e:
        logging.error(f"Website extraction failed for {url}: {e}")
        return f"Error: Could not fetch content from the URL. Details: {e}"
    except Exception as e:
        logging.error(f"An unexpected error occurred during website extraction: {e}")
        return f"Error: An unexpected error occurred. Details: {e}"