File size: 1,485 Bytes
836bc0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import zipfile
from io import BytesIO
from .pdf_extractor import parse_pdf_from_url_multithreaded
from .image_extractor import is_image, extract_text_from_image_bytes

def _extract_entry_text(file_name, file_data):
    """
    Extract text from a single archive member.

    PDFs are recognised by their ``.pdf`` extension (case-insensitive);
    anything else is handed to ``is_image`` to decide whether it is an image.
    Returns the value to store for this member in the results dictionary.
    """
    if file_name.lower().endswith(".pdf"):
        # The PDF parser is given an in-memory stream of the member's bytes.
        # NOTE(review): presumably returns a list of text items -- confirm
        # against pdf_extractor.
        return parse_pdf_from_url_multithreaded(BytesIO(file_data))

    if is_image(file_data):
        text = extract_text_from_image_bytes(file_data)
        return [text] if text else ["No data found (image empty)"]

    return ["⚠ Unsupported file type inside ZIP"]


def extract_from_zip_bytes(zip_bytes):
    """
    Extract and process files inside a ZIP archive.

    Args:
        zip_bytes: Raw bytes of the ZIP archive.

    Returns:
        A dictionary ``{filename: extracted_text_list}`` with one entry per
        file in the archive. Directory entries are skipped. A file that
        cannot be read or processed gets a one-item list holding an error
        message instead of aborting the whole archive. If the archive is
        invalid, or contains no files, a single ``"ZIP"`` key with a
        one-item message list is returned.

    This function does not raise; every failure is reported in the result.
    """
    results = {}
    try:
        with zipfile.ZipFile(BytesIO(zip_bytes)) as z:
            for info in z.infolist():
                # Folder entries carry no content; reporting them as
                # "unsupported files" would only add noise to the result.
                if info.is_dir():
                    continue

                file_name = info.filename
                try:
                    # NOTE(review): each member is decompressed fully into
                    # memory; consider a size limit if archives are untrusted.
                    file_data = z.read(info)
                except Exception as e:
                    results[file_name] = [f"❌ Failed to read file: {e}"]
                    continue

                try:
                    results[file_name] = _extract_entry_text(file_name, file_data)
                except Exception as e:
                    # One broken member must not discard the results that
                    # were already collected for the other members.
                    results[file_name] = [f"❌ Failed to process file: {e}"]

    except zipfile.BadZipFile:
        return {"ZIP": ["Invalid or corrupted ZIP file"]}
    except Exception as e:
        return {"ZIP": [f"Error processing ZIP: {e}"]}

    return results if results else {"ZIP": ["No supported files found in archive"]}