Spaces:
Runtime error
Runtime error
File size: 2,056 Bytes
836bc0e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
from io import BytesIO
import requests
import os
from .pdf_extractor import parse_pdf_from_url_multithreaded, parse_pdf_from_file_multithreaded
from .image_extractor import is_image, extract_text_from_image_bytes
from .web_extractor import extract_text_from_html
from .zip_extractor import extract_from_zip_bytes
def parse_document_url(url, timeout=30):
    """Download *url* and extract its text with the matching extractor.

    The extractor is chosen from the response ``Content-Type`` header, with
    the URL's file extension as a fallback, checked in this order: HTML,
    ZIP, image, PDF.

    Args:
        url: Address of the document to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so an
            unresponsive server cannot block the caller indefinitely.

    Returns:
        The selected extractor's result. For ZIP archives this is a list of
        ``"<name>: <text>"`` strings and for images a one-element list.
        ``["Download error: ..."]`` is returned when the download fails or
        the server answers with an HTTP error status, and
        ``["Unsupported file type"]`` when no extractor matches.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failed downloads instead of feeding the
        # server's error page to an extractor.
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        # Deliberate catch-all: callers receive download failures as a
        # message list rather than as an exception.
        return [f"Download error: {e}"]

    # Compare extensions against the URL without its fragment and query
    # string, case-insensitively, so "Report.PDF" and "doc.pdf?token=abc"
    # are still recognized.
    path = url.partition("#")[0].partition("?")[0].lower()

    if "text/html" in content_type or path.endswith(".html"):
        return extract_text_from_html(content)

    # NOTE(review): the substring test also accepts types such as
    # "application/gzip" - confirm extract_from_zip_bytes tolerates payloads
    # that are not real ZIP archives.
    if "zip" in content_type or path.endswith(".zip"):
        zip_results = extract_from_zip_bytes(content)
        return [
            f"{name}: {text}"
            for name, texts in zip_results.items()
            for text in texts
        ]

    if "image" in content_type or is_image(content):
        text = extract_text_from_image_bytes(content)
        return [text] if text else ["No data found (image empty)"]

    if "pdf" in content_type or path.endswith(".pdf"):
        # The already-downloaded bytes are handed over as an in-memory stream.
        return parse_pdf_from_url_multithreaded(BytesIO(content))

    return ["Unsupported file type"]
def parse_document_file(file_path):
    """Extract text from a local file, picking the extractor by extension.

    The extension is compared case-insensitively. Supported kinds are ZIP
    archives, common raster images, PDF and ``.html`` files.

    Args:
        file_path: Path of the file to read.

    Returns:
        For ZIP archives a list of ``"<name>: <text>"`` strings; for images
        a one-element list holding the extracted text or a "no data"
        message; for PDF and HTML whatever the respective extractor
        returns; ``["Unsupported file type"]`` for any other extension.
    """
    lowered = file_path.lower()
    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff", ".webp")

    if lowered.endswith(".zip"):
        with open(file_path, "rb") as archive:
            per_member = extract_from_zip_bytes(archive.read())
        lines = []
        for member_name, member_texts in per_member.items():
            for member_text in member_texts:
                lines.append(f"{member_name}: {member_text}")
        return lines

    if lowered.endswith(image_suffixes):
        with open(file_path, "rb") as image_file:
            recognized = extract_text_from_image_bytes(image_file.read())
        if recognized:
            return [recognized]
        return ["No data found (image empty)"]

    if lowered.endswith(".pdf"):
        return parse_pdf_from_file_multithreaded(file_path)

    if lowered.endswith(".html"):
        # HTML is read as UTF-8 text before being handed to the extractor.
        with open(file_path, "r", encoding="utf-8") as html_file:
            markup = html_file.read()
        return extract_text_from_html(markup)

    return ["Unsupported file type"]
|