# /extractor.py
""" Handles content extraction from various sources like files, images, and websites. """
import mimetypes, os, re, logging
from urllib.parse import urljoin
import PyPDF2, docx, requests
from bs4 import BeautifulSoup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Optional OCR dependencies. OpenCV (cv2) and pytesseract are only needed for
# image-to-text extraction; the rest of the module works without them, so a
# failed import degrades gracefully instead of crashing at import time.
try:
    import cv2, pytesseract
    OCR_AVAILABLE = True
except ImportError:
    # Flag checked by extract_text_from_image() before attempting OCR.
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found. Text extraction from images will be disabled.")
def extract_text_from_image(image_path: str) -> str:
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path to an image file readable by OpenCV.

    Returns:
        The OCR'd text, or a human-readable error/status message. This
        function never raises; all failures are reported via the return
        string, consistent with the other extractors in this module.
    """
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies not installed."
    try:
        image = cv2.imread(image_path)
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising; check explicitly so the caller gets a clear
        # message rather than a cryptic cvtColor assertion error.
        if image is None:
            return f"Error during OCR: could not read image at {image_path}"
        # Grayscale input generally improves Tesseract accuracy.
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return pytesseract.image_to_string(gray) or "No text found in image."
    except Exception as e:
        return f"Error during OCR: {e}"
def extract_text_from_file(file_path: str) -> str:
    """Extract plain text from a local file, dispatching on its extension.

    Supported: PDF (PyPDF2), DOCX (python-docx), common plain-text formats
    (read as UTF-8, ignoring decode errors), and common image formats
    (delegated to OCR).

    Args:
        file_path: Path to the file; an empty/falsy path yields "".

    Returns:
        The extracted text, or an error/status message. Never raises.
    """
    if not file_path:
        return ""
    extension = os.path.splitext(file_path)[1].lower()
    try:
        if extension == ".pdf":
            with open(file_path, "rb") as handle:
                reader = PyPDF2.PdfReader(handle)
                # extract_text() may return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        if extension == ".docx":
            document = docx.Document(file_path)
            return "\n".join(paragraph.text for paragraph in document.paragraphs)
        if extension in (".txt", ".md", ".csv", ".html", ".css", ".js", ".py"):
            with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
                return handle.read()
        if extension in (".jpg", ".jpeg", ".png", ".bmp", ".tiff"):
            return extract_text_from_image(file_path)
        return f"Unsupported file type: {extension}"
    except Exception as e:
        return f"Error extracting text: {e}"
def extract_website_content(url: str) -> str:
    """Fetch a web page and return its HTML with resource URLs made absolute.

    Relative `img src`, `link href`, and `script src` attributes are rewritten
    against the original URL so the markup remains usable outside its origin.
    Output longer than 15,000 characters is truncated with a marker comment.

    Args:
        url: The page to fetch (redirects are followed; 15 s timeout).

    Returns:
        The (possibly truncated) HTML prefixed with an original-URL comment,
        or an error message string. Never raises.
    """
    try:
        # A browser-like User-Agent avoids trivial bot blocking on many sites.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # Trust content sniffing over the (often missing/wrong) declared charset.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        rewrite_targets = (('img', 'src'), ('link', 'href'), ('script', 'src'))
        for tag_name, attr_name in rewrite_targets:
            for element in soup.find_all(tag_name):
                if element.has_attr(attr_name):
                    element[attr_name] = urljoin(url, element[attr_name])
        body_content = str(soup)
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
        return f"<!-- Original URL: {url} -->\n{body_content}"
    except Exception as e:
        return f"Error: Could not fetch content from {url}. Details: {e}"