Spaces:
Runtime error
Runtime error
import zipfile | |
from io import BytesIO | |
from .pdf_extractor import parse_pdf_from_url_multithreaded | |
from .image_extractor import is_image, extract_text_from_image_bytes | |
def extract_from_zip_bytes(zip_bytes: bytes) -> dict:
    """
    Extract and process files inside a ZIP archive.

    Each regular member of the archive is read into memory and dispatched:

    * names ending in ``.pdf`` (case-insensitive) are wrapped in a
      ``BytesIO`` and handed to ``parse_pdf_from_url_multithreaded``;
    * members for which ``is_image(file_data)`` is truthy are passed to
      ``extract_text_from_image_bytes``;
    * everything else is reported as unsupported.

    Directory entries are skipped. A failure while reading or processing a
    single member is recorded under that member's name and does not discard
    the results already collected for the other members.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.

    Returns:
        A dictionary ``{filename: extracted_text_list}``. When the archive
        yields no entries, or cannot be opened at all, a single ``"ZIP"``
        key carries a one-element list with the explanatory message.
    """
    results = {}
    try:
        with zipfile.ZipFile(BytesIO(zip_bytes)) as z:
            for info in z.infolist():
                # Directory entries carry no payload; reporting them as
                # "unsupported files" would only add noise to the result.
                if info.is_dir():
                    continue

                file_name = info.filename
                try:
                    file_data = z.read(info)
                except Exception as e:
                    results[file_name] = [f"❌ Failed to read file: {e}"]
                    continue

                # Isolate per-file failures so that one broken PDF or image
                # does not wipe out the results of the whole archive.
                try:
                    # PDF files (detected by extension)
                    if file_name.lower().endswith(".pdf"):
                        results[file_name] = parse_pdf_from_url_multithreaded(
                            BytesIO(file_data)
                        )
                    # Image files (detected from the content by is_image)
                    elif is_image(file_data):
                        text = extract_text_from_image_bytes(file_data)
                        results[file_name] = (
                            [text] if text else ["No data found (image empty)"]
                        )
                    # Unsupported files
                    else:
                        results[file_name] = ["⚠ Unsupported file type inside ZIP"]
                except Exception as e:
                    results[file_name] = [f"❌ Failed to process file: {e}"]

        return results if results else {"ZIP": ["No supported files found in archive"]}
    except zipfile.BadZipFile:
        return {"ZIP": ["Invalid or corrupted ZIP file"]}
    except Exception as e:
        # Last-resort boundary: callers always receive a dictionary.
        return {"ZIP": [f"Error processing ZIP: {e}"]}