Spaces:
Runtime error
Runtime error
from io import BytesIO | |
import requests | |
import os | |
from .pdf_extractor import parse_pdf_from_url_multithreaded, parse_pdf_from_file_multithreaded | |
from .image_extractor import is_image, extract_text_from_image_bytes | |
from .web_extractor import extract_text_from_html | |
from .zip_extractor import extract_from_zip_bytes | |
def parse_document_url(url, timeout=30):
    """Download the document at *url* and extract its text.

    The extractor is chosen from the response ``Content-Type`` header,
    falling back to the file extension of the URL path (or, for images,
    to sniffing the downloaded bytes with ``is_image``).

    Args:
        url: Address of the document to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one a stalled server would block this call forever.

    Returns:
        A list of strings. Depending on the detected type this is:
        whatever ``extract_text_from_html`` returns for HTML; one
        ``"<name>: <text>"`` entry per text found in each ZIP member; a
        one-element list with the OCR text (or a placeholder message) for
        images; whatever ``parse_pdf_from_url_multithreaded`` returns for
        PDFs; ``["Download error: ..."]`` when the download fails; or
        ``["Unsupported file type"]`` otherwise.
    """
    # Function-scope import so this block brings its own dependency.
    from urllib.parse import urlparse

    try:
        res = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as failed downloads instead of parsing
        # the server's error page as if it were the requested document.
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        # Deliberately broad: callers get an error entry, never a raise.
        return [f"Download error: {e}"]

    # Compare extensions against the URL *path* only, case-insensitively,
    # so "report.PDF?token=abc" is still recognised as a PDF.
    path = urlparse(url).path.lower()

    if "text/html" in content_type or path.endswith((".html", ".htm")):
        return extract_text_from_html(content)

    if "zip" in content_type or path.endswith(".zip"):
        zip_results = extract_from_zip_bytes(content)
        return [
            f"{name}: {text}"
            for name, texts in zip_results.items()
            for text in texts
        ]

    if "image" in content_type or is_image(content):
        text = extract_text_from_image_bytes(content)
        return [text] if text else ["No data found (image empty)"]

    if "pdf" in content_type or path.endswith(".pdf"):
        # NOTE(review): despite its name the helper is given the downloaded
        # bytes wrapped in BytesIO, not the URL - confirm it expects that.
        return parse_pdf_from_url_multithreaded(BytesIO(content))

    return ["Unsupported file type"]
def parse_document_file(file_path):
    """Extract text from a local document, dispatching on its extension.

    Args:
        file_path: Path to the document, as ``str``, ``bytes`` or any
            ``os.PathLike`` (e.g. ``pathlib.Path``). The extension check
            is case-insensitive.

    Returns:
        A list of strings. Depending on the extension this is: one
        ``"<name>: <text>"`` entry per text found in each ZIP member; a
        one-element list with the OCR text (or a placeholder message) for
        images; whatever ``parse_pdf_from_file_multithreaded`` returns for
        PDFs; whatever ``extract_text_from_html`` returns for HTML; or
        ``["Unsupported file type"]`` for anything else.

    Raises:
        OSError: If a recognised file cannot be opened or read.
        UnicodeDecodeError: If an HTML file is not valid UTF-8.
    """
    # Normalise once: fsdecode accepts str, bytes and PathLike, so Path
    # objects no longer fail on the missing ``.lower()`` method.
    lowered = os.fsdecode(file_path).lower()

    if lowered.endswith(".zip"):
        with open(file_path, "rb") as f:
            zip_results = extract_from_zip_bytes(f.read())
        return [
            f"{name}: {text}"
            for name, texts in zip_results.items()
            for text in texts
        ]

    if lowered.endswith(
        (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff", ".webp")
    ):
        with open(file_path, "rb") as f:
            text = extract_text_from_image_bytes(f.read())
        return [text] if text else ["No data found (image empty)"]

    if lowered.endswith(".pdf"):
        return parse_pdf_from_file_multithreaded(file_path)

    if lowered.endswith((".html", ".htm")):
        # NOTE(review): the URL-based parser hands raw bytes to
        # extract_text_from_html, while this path hands it decoded text -
        # confirm the helper is meant to accept both.
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        return extract_text_from_html(content)

    return ["Unsupported file type"]