# Rivalcoder's picture
# New Version Updated
# 836bc0e
import zipfile
from io import BytesIO
from .pdf_extractor import parse_pdf_from_url_multithreaded
from .image_extractor import is_image, extract_text_from_image_bytes
def extract_from_zip_bytes(zip_bytes):
    """Extract text from the files inside an in-memory ZIP archive.

    Each regular member is read and dispatched by type:

    * names ending in ``.pdf`` (case-insensitive) are wrapped in a
      ``BytesIO`` and handed to ``parse_pdf_from_url_multithreaded``;
    * members for which ``is_image`` returns true are handed to
      ``extract_text_from_image_bytes``;
    * anything else is recorded as unsupported.

    Directory entries are skipped so they are not reported as unsupported
    files. A failure while reading or processing one member is recorded
    against that member only; the remaining members are still processed.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.

    Returns:
        A dict mapping each member name to a list: the extractor's result on
        success, or a single-element list holding a status message. If the
        archive cannot be opened, or yields no entries, the dict has the
        single key ``"ZIP"`` with a one-message list.
    """
    results = {}
    try:
        with zipfile.ZipFile(BytesIO(zip_bytes)) as archive:
            # infolist() instead of namelist(): gives access to is_dir() and
            # lets read() target the exact entry rather than a name lookup.
            for member in archive.infolist():
                if member.is_dir():
                    continue
                file_name = member.filename

                # NOTE(review): read() decompresses the whole member into
                # memory; there is no size cap here for untrusted archives.
                try:
                    file_data = archive.read(member)
                except Exception as e:
                    results[file_name] = [f"❌ Failed to read file: {e}"]
                    continue

                # Keep extractor failures local to this member so one bad
                # file does not discard the results already collected.
                try:
                    if file_name.lower().endswith(".pdf"):
                        results[file_name] = parse_pdf_from_url_multithreaded(
                            BytesIO(file_data)
                        )
                    elif is_image(file_data):
                        text = extract_text_from_image_bytes(file_data)
                        results[file_name] = (
                            [text] if text else ["No data found (image empty)"]
                        )
                    else:
                        results[file_name] = ["⚠ Unsupported file type inside ZIP"]
                except Exception as e:
                    results[file_name] = [f"❌ Failed to process file: {e}"]

        return results if results else {"ZIP": ["No supported files found in archive"]}
    except zipfile.BadZipFile:
        return {"ZIP": ["Invalid or corrupted ZIP file"]}
    except Exception as e:
        return {"ZIP": [f"Error processing ZIP: {e}"]}