File size: 4,345 Bytes
1ae58ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# /extractor.py
"""
Handles content extraction from various sources like files, images, and websites.
This module encapsulates the logic for parsing different file formats (PDF, DOCX),
performing Optical Character Recognition (OCR) on images, and scraping web content.
"""
import mimetypes
import os
import re
from urllib.parse import urlparse, urljoin
import logging
import PyPDF2
import docx
import requests
from bs4 import BeautifulSoup
# --- Setup Logging ---
# NOTE(review): calling basicConfig at import time configures the ROOT logger
# and can clobber the host application's logging setup; a module-level
# `logger = logging.getLogger(__name__)` is usually preferable — confirm no
# caller relies on this global configuration before changing it.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- Optional OCR Imports ---
# OCR support is optional: if any of the image/OCR libraries are missing we
# degrade gracefully (OCR_AVAILABLE = False) instead of failing at import.
# extract_text_from_image() checks this flag before attempting OCR.
try:
    import cv2
    import numpy as np
    import pytesseract
    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
def extract_text_from_image(image_path: str) -> str:
    """Run Tesseract OCR on an image file and return any recognized text.

    Follows this module's convention of returning a human-readable error
    string (rather than raising) when the OCR stack is unavailable, the
    image cannot be read, or recognition itself fails.
    """
    # Guard 1: the optional OCR libraries were never imported.
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
    # Guard 2: the Python bindings exist but the tesseract binary is missing.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."
    try:
        img = cv2.imread(image_path)
        if img is None:
            return "Error: Could not read image file."
        # Convert to grayscale before recognition.
        grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        recognized = pytesseract.image_to_string(grayscale).strip()
        return recognized if recognized else "No text found in image."
    except Exception as e:
        logging.error(f"OCR extraction failed: {e}")
        return f"Error during OCR: {e}"
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a file, dispatching on its extension.

    Supports PDF (PyPDF2), DOCX (python-docx), common plain-text formats,
    and raster images (delegated to OCR via extract_text_from_image).
    Returns "" for a falsy path and, per this module's convention, a
    human-readable error string (rather than raising) on failure.
    """
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    # Extension groups as sets for O(1) membership tests. The lists are
    # broadened vs. the originals: .tif/.webp are readable by OpenCV's
    # imread, and the extra text formats are all safe to read as UTF-8.
    text_exts = {".txt", ".md", ".csv", ".html", ".css", ".js", ".py",
                 ".json", ".xml", ".log", ".yaml", ".yml"}
    image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may return None for pages without a text layer.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext == ".docx":
            doc = docx.Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs)
        elif ext in text_exts:
            # errors="ignore" tolerates files that are not strictly valid UTF-8.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in image_exts:
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        # Lazy %-style args: formatting is skipped if the level is disabled.
        logging.error("Error extracting text from %s: %s", file_path, e)
        return f"Error extracting text: {e}"
def extract_website_content(url: str, max_chars: int = 15000) -> str:
    """Fetch a URL and return its (truncated) body HTML with metadata comments.

    Resource links (img/src, link/href, script/src) are rewritten to absolute
    URLs so the snippet stays usable out of context. Per this module's
    convention, returns a human-readable error string rather than raising.

    Args:
        url: The page to fetch.
        max_chars: Maximum length of the body HTML before truncation
            (default preserves the original 15000-character limit).
    """
    try:
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # apparent_encoding sniffs the body bytes; more reliable than the
        # (often missing or wrong) charset in the Content-Type header.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # Make all resource links absolute so they resolve outside the page.
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])
        # Fix: guard against BOTH a missing <title> tag and an empty one —
        # soup.title.string is None for <title></title>, which previously
        # produced the literal text "None" instead of "N/A".
        title = "N/A"
        if soup.title and soup.title.string:
            title = soup.title.string.strip() or "N/A"
        # Return a prettified version of the body content for context.
        body_content = soup.body.prettify() if soup.body else str(soup)
        # Truncate so the result fits in a prompt.
        if len(body_content) > max_chars:
            body_content = body_content[:max_chars] + "\n<!-- ... HTML truncated ... -->"
        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"
    except requests.RequestException as e:
        logging.error("Website extraction failed for %s: %s", url, e)
        return f"Error: Could not fetch content from the URL. Details: {e}"
    except Exception as e:
        logging.error("An unexpected error occurred during website extraction: %s", e)
        return f"Error: An unexpected error occurred. Details: {e}"