Spaces:
Runtime error
Runtime error
File size: 2,038 Bytes
836bc0e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import fitz # PyMuPDF
from concurrent.futures import ThreadPoolExecutor
def _extract_text(page):
    """Return the stripped text of *page*, or None when there is none.

    Args:
        page: Object exposing ``get_text()`` (a PyMuPDF page in this file).

    Returns:
        The page text with leading/trailing whitespace removed, or ``None``
        when ``get_text()`` yields nothing or only whitespace.
    """
    raw = page.get_text()
    if not raw:
        return None
    stripped = raw.strip()
    # An all-whitespace page collapses to "" and is reported as None.
    return stripped or None
def parse_pdf_from_url_multithreaded(content, max_workers=2, chunk_size=1):
    """Extract the text of an in-memory PDF as a list of strings.

    Despite the name, this function receives the PDF bytes themselves (for
    example, the body of an already completed download), not a URL.

    Args:
        content: Raw PDF data, passed to ``fitz.open`` as ``stream``.
        max_workers: Accepted for backward compatibility and ignored.
            Pages are extracted sequentially because PyMuPDF's documentation
            states that it does not support multithreaded use; sharing one
            document across worker threads can yield wrong results or crash
            the interpreter.
        chunk_size: Number of consecutive pages whose text is joined (with a
            single space) into each returned string. Values of 1 or less
            return one string per non-empty page.

    Returns:
        A list of text strings in page order. Pages without text are
        skipped. If nothing is extracted, a one-element list with an
        "empty PDF" message is returned; if any exception occurs, the error
        is printed and a one-element list with a "not PDF or corrupted"
        message is returned. This function does not raise ``Exception``
        subclasses to the caller.
    """
    empty_message = "No data found in this document (empty PDF)"
    try:
        with fitz.open(stream=content, filetype="pdf") as doc:
            # Sequential on purpose: see the max_workers note above.
            texts = [_extract_text(page) for page in doc]

        if chunk_size > 1:
            chunks = []
            for start in range(0, len(texts), chunk_size):
                # Windows are counted in pages, so empty pages still occupy
                # a slot in the window even though they add no text.
                window = texts[start:start + chunk_size]
                chunk = " ".join(t for t in window if t)
                if chunk:
                    chunks.append(chunk)
            return chunks or [empty_message]

        return [t for t in texts if t] or [empty_message]
    except Exception as e:
        # Best-effort boundary: callers expect a message list, never a raise.
        print(f"❌ Failed to parse as PDF: {e}")
        return ["No data found in this document (not PDF or corrupted)"]
def parse_pdf_from_file_multithreaded(file_path, max_workers=2, chunk_size=1):
    """Extract the text of a PDF file on disk as a list of strings.

    Args:
        file_path: Path of the document, passed directly to ``fitz.open``.
        max_workers: Accepted for backward compatibility and ignored.
            Pages are extracted sequentially because PyMuPDF's documentation
            states that it does not support multithreaded use; sharing one
            document across worker threads can yield wrong results or crash
            the interpreter.
        chunk_size: Number of consecutive pages whose text is joined (with a
            single space) into each returned string. Values of 1 or less
            return one string per non-empty page.

    Returns:
        A list of text strings in page order. Pages without text are
        skipped. If nothing is extracted, a one-element list with a
        "local PDF empty" message is returned; if any exception occurs, the
        error is printed and a one-element list with a "local file error"
        message is returned. This function does not raise ``Exception``
        subclasses to the caller.
    """
    empty_message = "No data found in this document (local PDF empty)"
    try:
        with fitz.open(file_path) as doc:
            # Sequential on purpose: see the max_workers note above.
            texts = [_extract_text(page) for page in doc]

        if chunk_size > 1:
            chunks = []
            for start in range(0, len(texts), chunk_size):
                # Windows are counted in pages, so empty pages still occupy
                # a slot in the window even though they add no text.
                window = texts[start:start + chunk_size]
                chunk = " ".join(t for t in window if t)
                if chunk:
                    chunks.append(chunk)
            return chunks or [empty_message]

        return [t for t in texts if t] or [empty_message]
    except Exception as e:
        # Best-effort boundary: callers expect a message list, never a raise.
        print(f"❌ Failed to open local file: {e}")
        return ["No data found in this document (local file error)"]
|