# Viewer header recovered from extraction: file size 2,038 bytes, revision 836bc0e.
import fitz  # PyMuPDF
from concurrent.futures import ThreadPoolExecutor

def _extract_text(page):
    """Return the text of ``page`` without surrounding whitespace.

    Args:
        page: Any object exposing a ``get_text()`` method (here, the items
            produced by iterating a ``fitz`` document).

    Returns:
        The stripped text, or ``None`` when ``get_text()`` yields nothing or
        only whitespace.
    """
    raw = page.get_text()
    if not raw:
        return None
    cleaned = raw.strip()
    # An all-whitespace page collapses to "" here; report it as None.
    return cleaned or None

def parse_pdf_from_url_multithreaded(content, max_workers=2, chunk_size=1):
    """Extract text chunks from PDF data held in memory.

    Args:
        content: Raw PDF data, passed to ``fitz.open`` as ``stream`` with
            ``filetype="pdf"``.
        max_workers: Retained for backward compatibility and ignored. Pages
            are read one after another because PyMuPDF documents that it
            does not support being used from multiple threads.
        chunk_size: Number of consecutive pages whose text is joined (with a
            single space) into one chunk. Values of 1 or less produce one
            chunk per page.

    Returns:
        A list of non-empty text chunks in page order. When no page yields
        text, or when any ``Exception`` occurs while opening or reading the
        data, a one-element list holding a "No data found ..." message is
        returned instead.
    """
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            # Sequential on purpose: sharing one document's pages across
            # worker threads is unsupported by PyMuPDF and can crash.
            texts = [_extract_text(page) for page in doc]

        # A window of one page is the same as "no chunking", so both modes
        # share a single loop.
        pages_per_chunk = chunk_size if chunk_size > 1 else 1
        chunks = []
        for start in range(0, len(texts), pages_per_chunk):
            window = texts[start:start + pages_per_chunk]
            chunk = " ".join(text for text in window if text)
            if chunk:  # skip windows made only of blank pages
                chunks.append(chunk)
        return chunks or ["No data found in this document (empty PDF)"]
    except Exception as e:
        # Best-effort boundary: callers receive a message list, never a raise.
        print(f"❌ Failed to parse as PDF: {e}")
        return ["No data found in this document (not PDF or corrupted)"]

def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """Extract text chunks from a PDF stored on disk.

    Args:
        file_path: Path of the document, passed directly to ``fitz.open``.
        max_workers: Retained for backward compatibility and ignored. Pages
            are read one after another because PyMuPDF documents that it
            does not support being used from multiple threads.
        chunk_size: Number of consecutive pages whose text is joined (with a
            single space) into one chunk. Values of 1 or less produce one
            chunk per page.

    Returns:
        A list of non-empty text chunks in page order. When no page yields
        text, or when any ``Exception`` occurs while opening or reading the
        file, a one-element list holding a "No data found ..." message is
        returned instead.
    """
    try:
        with fitz.open(file_path) as doc:
            # Sequential on purpose: sharing one document's pages across
            # worker threads is unsupported by PyMuPDF and can crash.
            texts = [_extract_text(page) for page in doc]

        # A window of one page is the same as "no chunking", so both modes
        # share a single loop.
        pages_per_chunk = chunk_size if chunk_size > 1 else 1
        chunks = []
        for start in range(0, len(texts), pages_per_chunk):
            window = texts[start:start + pages_per_chunk]
            chunk = " ".join(text for text in window if text)
            if chunk:  # skip windows made only of blank pages
                chunks.append(chunk)
        return chunks or ["No data found in this document (local PDF empty)"]
    except Exception as e:
        # Best-effort boundary: callers receive a message list, never a raise.
        print(f"❌ Failed to open local file: {e}")
        return ["No data found in this document (local file error)"]