"""Handles content extraction from various sources like files, images, and websites."""

import logging
import os
from urllib.parse import urljoin

import docx
import PyPDF2
import requests
from bs4 import BeautifulSoup
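
# Third-party packages (PyPI names assumed to match the imports above):
#   pip install PyPDF2 python-docx requests beautifulsoup4
# Optional OCR support (also requires the Tesseract binary on PATH):
#   pip install opencv-python pytesseract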

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# OCR support is optional: degrade gracefully if cv2/pytesseract are missing.
try:
    import cv2
    import pytesseract

    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found. Text extraction from images will be disabled.")


def extract_text_from_image(image_path: str) -> str:
    """Run OCR on an image file and return any text it contains."""
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies not installed."
    try:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None (rather than raising) on unreadable files.
            return f"Error: Could not read image: {image_path}"
        # Grayscale conversion typically improves Tesseract accuracy.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return pytesseract.image_to_string(gray) or "No text found in image."
    except Exception as e:
        return f"Error during OCR: {e}"


def extract_text_from_file(file_path: str) -> str:
    """Extract text from a file, dispatching on its extension."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                # extract_text() may return None for pages with no text layer.
                return "\n".join(page.extract_text() or "" for page in PyPDF2.PdfReader(f).pages)
        elif ext == ".docx":
            return "\n".join(p.text for p in docx.Document(file_path).paragraphs)
        elif ext in (".txt", ".md", ".csv", ".html", ".css", ".js", ".py"):
            # Treat these as plain text; skip undecodable bytes.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff"):
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Fetch a web page and return its HTML with asset URLs made absolute."""
    try:
        # A desktop browser User-Agent avoids the naive bot blocking some sites apply.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "html.parser")

        # Rewrite relative asset references to absolute URLs so the captured
        # HTML still resolves images, stylesheets, and scripts.
        for tag, attr in [("img", "src"), ("link", "href"), ("script", "src")]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])

        body_content = str(soup)
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
        return f"<!-- Original URL: {url} -->\n{body_content}"
    except Exception as e:
        return f"Error: Could not fetch content from {url}. Details: {e}"
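

# Minimal smoke test, a sketch assuming the module is run directly. The sample
# paths and URL below are hypothetical placeholders, not files shipped here.
if __name__ == "__main__":
    print(extract_text_from_file("sample.pdf")[:500])
    print(extract_text_from_image("scan.png")[:500])
    print(extract_website_content("https://example.com")[:500])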