Spaces:
Runtime error
Runtime error
File size: 1,485 Bytes
836bc0e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import zipfile
from io import BytesIO
from .pdf_extractor import parse_pdf_from_url_multithreaded
from .image_extractor import is_image, extract_text_from_image_bytes
def extract_from_zip_bytes(zip_bytes: bytes) -> dict:
    """
    Extract and process the files inside an in-memory ZIP archive.

    Each archive member is dispatched by type:

    * names ending in ``.pdf`` (case-insensitive) are wrapped in a
      ``BytesIO`` and passed to ``parse_pdf_from_url_multithreaded``;
    * payloads accepted by ``is_image`` are passed to
      ``extract_text_from_image_bytes``;
    * anything else is reported as unsupported.

    Directory entries carry no payload and are skipped.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.

    Returns:
        A dictionary ``{filename: extracted_text_list}``. A member that
        cannot be read or processed maps to a one-item list holding an
        error message, so one bad file does not discard the results of the
        others. If the archive contains no files, or cannot be opened at
        all, a single-key dictionary ``{"ZIP": [message]}`` is returned.
        Errors are reported through the returned dictionary rather than
        raised.
    """
    results = {}
    try:
        with zipfile.ZipFile(BytesIO(zip_bytes)) as archive:
            for info in archive.infolist():
                # Folders are structural entries, not files to extract.
                if info.is_dir():
                    continue

                file_name = info.filename
                try:
                    file_data = archive.read(info)
                except Exception as e:
                    results[file_name] = [f"❌ Failed to read file: {e}"]
                    continue

                # Isolate per-file failures: an exception from one parser
                # must not wipe out what was already extracted.
                try:
                    # PDF files
                    if file_name.lower().endswith(".pdf"):
                        # NOTE(review): the helper's name suggests it takes
                        # a URL, but it is handed a BytesIO here — confirm
                        # against pdf_extractor.
                        results[file_name] = parse_pdf_from_url_multithreaded(
                            BytesIO(file_data)
                        )
                    # Image files
                    elif is_image(file_data):
                        text = extract_text_from_image_bytes(file_data)
                        results[file_name] = (
                            [text] if text else ["No data found (image empty)"]
                        )
                    # Unsupported files
                    else:
                        results[file_name] = ["⚠ Unsupported file type inside ZIP"]
                except Exception as e:
                    results[file_name] = [f"❌ Failed to process file: {e}"]

        if not results:
            return {"ZIP": ["No supported files found in archive"]}
        return results
    except zipfile.BadZipFile:
        return {"ZIP": ["Invalid or corrupted ZIP file"]}
    except Exception as e:
        # Top-level boundary: report any other archive-level failure to the
        # caller in the same dictionary shape instead of raising.
        return {"ZIP": [f"Error processing ZIP: {e}"]}
|