File size: 2,056 Bytes
836bc0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
from io import BytesIO
from urllib.parse import urlsplit

import requests

from .pdf_extractor import parse_pdf_from_url_multithreaded, parse_pdf_from_file_multithreaded
from .image_extractor import is_image, extract_text_from_image_bytes
from .web_extractor import extract_text_from_html
from .zip_extractor import extract_from_zip_bytes

def parse_document_url(url, timeout=30):
    """Download the document at *url* and extract its text.

    The handler is picked from the response ``Content-Type`` header, with the
    extension of the URL path as a fallback (for images the fallback is
    ``is_image`` applied to the downloaded bytes). Handlers are tried in the
    order HTML, zip, image, PDF.

    Args:
        url: Address of the document to download.
        timeout: Seconds to wait for the server; passed to ``requests.get``.
            Without it a stalled connection would block indefinitely.

    Returns:
        The result of the matching extractor. The zip branch returns a list
        of ``"<name>: <text>"`` strings and the image branch a one-element
        list. A failed download (including an HTTP 4xx/5xx status) yields
        ``["Download error: ..."]`` and an unrecognised type yields
        ``["Unsupported file type"]``.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # Report 4xx/5xx as a download failure instead of parsing the
        # server's error page as if it were the requested document.
        res.raise_for_status()
        content = res.content
        content_type = res.headers.get("content-type", "").lower()
    except Exception as e:
        # Deliberately broad: this function reports download problems in its
        # return value rather than raising to the caller.
        return [f"Download error: {str(e)}"]

    # Compare extensions against the path component only, case-insensitively,
    # so "report.PDF" and "report.pdf?token=abc" are still recognised and a
    # query string that merely ends in ".html" is not.
    path = urlsplit(url).path.lower()

    if "text/html" in content_type or path.endswith(".html"):
        return extract_text_from_html(content)

    if "zip" in content_type or path.endswith(".zip"):
        zip_results = extract_from_zip_bytes(content)
        return [f"{name}: {text}" for name, texts in zip_results.items() for text in texts]

    if "image" in content_type or is_image(content):
        text = extract_text_from_image_bytes(content)
        return [text] if text else ["No data found (image empty)"]

    if "pdf" in content_type or path.endswith(".pdf"):
        return parse_pdf_from_url_multithreaded(BytesIO(content))

    return ["Unsupported file type"]

def parse_document_file(file_path):
    """Extract text from a local file, choosing the handler by extension.

    Args:
        file_path: Location of the file, as a ``str`` or any ``os.PathLike``
            object (e.g. ``pathlib.Path``). The extension is matched
            case-insensitively.

    Returns:
        The result of the matching extractor. The zip branch returns a list
        of ``"<name>: <text>"`` strings and the image branch a one-element
        list. A file whose extension matches no handler yields
        ``["Unsupported file type"]``.
    """
    # os.fspath returns a str unchanged and converts PathLike objects, so
    # pathlib.Path arguments no longer fail on the .lower() call.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith(".zip"):
        with open(path, "rb") as f:
            zip_results = extract_from_zip_bytes(f.read())
        return [f"{name}: {text}" for name, texts in zip_results.items() for text in texts]

    if lowered.endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff", ".webp")):
        with open(path, "rb") as f:
            text = extract_text_from_image_bytes(f.read())
        return [text] if text else ["No data found (image empty)"]

    if lowered.endswith(".pdf"):
        return parse_pdf_from_file_multithreaded(path)

    if lowered.endswith(".html"):
        # errors="replace": a stray non-UTF-8 byte becomes U+FFFD instead of
        # aborting the whole extraction with UnicodeDecodeError. Valid UTF-8
        # files are read exactly as before.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()
        return extract_text_from_html(content)

    return ["Unsupported file type"]