"""
Handles content extraction from various sources like files, images, and websites.

This module encapsulates the logic for parsing different file formats (PDF, DOCX),
performing Optical Character Recognition (OCR) on images, and scraping web content.
"""

import logging
import os
from urllib.parse import urljoin

import PyPDF2
import docx
import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


try:
    import cv2
    import numpy as np
    import pytesseract
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")


def extract_text_from_image(image_path: str) -> str:
    """Extracts text from an image file using Tesseract OCR."""
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
    try:
        # The Tesseract binary may be missing even when the Python bindings import cleanly.
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."

    try:
        image = cv2.imread(image_path)
        if image is None:
            return f"Error: Could not read image file: {image_path}"
        # Tesseract generally performs better on single-channel input.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        text = pytesseract.image_to_string(gray)
        return text.strip() or "No text found in image."
    except Exception as e:
        logging.error(f"OCR extraction failed: {e}")
        return f"Error during OCR: {e}"


def extract_text_from_file(file_path: str) -> str:
    """Extracts text from a variety of file types."""
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        logging.error(f"Error extracting text from {file_path}: {e}")
        return f"Error extracting text: {e}"


def extract_website_content(url: str) -> str:
    """Scrapes and returns the primary HTML content of a given URL."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Rewrite relative asset URLs to absolute ones so the extracted HTML
        # remains renderable outside its original origin.
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])

        title = soup.title.string.strip() if soup.title and soup.title.string else "N/A"

        body_content = soup.body.prettify() if soup.body else str(soup)

        # Truncate very large pages so downstream consumers are not flooded.
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"

        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"

    except requests.RequestException as e:
        logging.error(f"Website extraction failed for {url}: {e}")
        return f"Error: Could not fetch content from the URL. Details: {e}"
    except Exception as e:
        logging.error(f"An unexpected error occurred during website extraction: {e}")
        return f"Error: An unexpected error occurred. Details: {e}"