Spaces:

Rivalcoder
/

Issurance_Agent_Rag

Runtime error

App Files Files Community

Issurance_Agent_Rag / content_readers /__init__.py

Rivalcoder

New Version Updated

836bc0e 2 days ago

raw

history blame contribute delete

2.06 kB

	from io import BytesIO
	import requests
	import os
	from .pdf_extractor import parse_pdf_from_url_multithreaded, parse_pdf_from_file_multithreaded
	from .image_extractor import is_image, extract_text_from_image_bytes
	from .web_extractor import extract_text_from_html
	from .zip_extractor import extract_from_zip_bytes

	def parse_document_url(url, timeout=30):
	    """Download *url* and extract its text with the reader matching its type.

	    The type is chosen from the response ``Content-Type`` header, falling
	    back to the extension of the URL path, and is checked in this order:
	    HTML, ZIP, image, PDF.

	    Args:
	        url: Address of the document to download.
	        timeout: Seconds handed to ``requests.get`` so that a stalled server
	            cannot block the caller indefinitely.

	    Returns:
	        The result of the selected extractor (the ZIP and image branches
	        build a list of strings here). A one-element list holding an error
	        message is returned when the download fails or when the type is not
	        supported.
	    """
	    try:
	        res = requests.get(url, timeout=timeout)
	        # A 4xx/5xx reply carries an error page, not the document; report it
	        # as a failed download instead of extracting text from that page.
	        res.raise_for_status()
	        content = res.content
	        content_type = res.headers.get("content-type", "").lower()
	    except Exception as e:
	        return [f"Download error: {e}"]

	    # Compare extensions against the path only: drop the fragment and query
	    # string (e.g. signed links ending in "?sig=...") and ignore letter case.
	    path = url.partition("#")[0].partition("?")[0].lower()

	    if "text/html" in content_type or path.endswith((".html", ".htm")):
	        return extract_text_from_html(content)

	    if "zip" in content_type or path.endswith(".zip"):
	        zip_results = extract_from_zip_bytes(content)
	        # Flatten {name: [text, ...]} into one "name: text" entry per text.
	        return [f"{name}: {text}" for name, texts in zip_results.items() for text in texts]

	    if "image" in content_type or is_image(content):
	        text = extract_text_from_image_bytes(content)
	        return [text] if text else ["No data found (image empty)"]

	    if "pdf" in content_type or path.endswith(".pdf"):
	        return parse_pdf_from_url_multithreaded(BytesIO(content))

	    return ["Unsupported file type"]

	def parse_document_file(file_path):
	    """Extract text from a local file, picking the reader by file extension.

	    Extensions are matched case-insensitively. ZIP archives, common image
	    formats, PDF and HTML (``.html`` / ``.htm``) files are handled.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The result of the selected extractor (the ZIP and image branches
	        build a list of strings here), or ``["Unsupported file type"]`` when
	        the extension is not recognised.
	    """
	    # Lowercase once instead of on every extension check.
	    lowered = file_path.lower()

	    if lowered.endswith(".zip"):
	        with open(file_path, "rb") as f:
	            zip_results = extract_from_zip_bytes(f.read())
	        # Flatten {name: [text, ...]} into one "name: text" entry per text.
	        return [f"{name}: {text}" for name, texts in zip_results.items() for text in texts]

	    if lowered.endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif", ".tiff", ".webp")):
	        with open(file_path, "rb") as f:
	            text = extract_text_from_image_bytes(f.read())
	        return [text] if text else ["No data found (image empty)"]

	    if lowered.endswith(".pdf"):
	        return parse_pdf_from_file_multithreaded(file_path)

	    if lowered.endswith((".html", ".htm")):
	        # errors="replace" keeps a page with a few non-UTF-8 bytes readable
	        # instead of aborting with UnicodeDecodeError.
	        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
	            content = f.read()
	        return extract_text_from_html(content)

	    return ["Unsupported file type"]