|
import asyncio |
|
import re |
|
from pathlib import Path |
|
|
|
from pyzerox import zerox |
|
|
|
|
|
def remove_images_from_markdown(markdown_text):
    """Return *markdown_text* with image references deleted.

    Two forms are removed, in this order: raw HTML ``<img ...>`` tags and
    Markdown image syntax ``![alt](target)``. Matching is case-sensitive,
    and the text around a removed image (including whitespace) is kept
    exactly as it was.
    """
    image_patterns = (
        r"<img[^>]*>",  # inline HTML image tag
        r"!\[[^\]]*\]\([^)]*\)",  # Markdown image: ![alt](target)
    )
    stripped = markdown_text
    for pattern in image_patterns:
        stripped = re.sub(pattern, "", stripped)
    return stripped
|
|
|
|
|
# Root directory for zerox output; convert_zerox creates one sub-directory
# per converted file underneath it.
ZEROX_DEBUG_PATH = Path("/tmp/zerox_debug")
# Created at import time. parents=True so that a missing parent directory
# does not make the import fail with FileNotFoundError.
ZEROX_DEBUG_PATH.mkdir(parents=True, exist_ok=True)
# Model identifier passed to zerox for every conversion.
MODEL_NAME = "gemini/gemini-2.0-flash"
|
|
|
|
|
def clean_up_html_code_block(text: str):
    """Delete code-fence markers from *text* and return the result.

    The opening marker "```html" and every remaining bare "```" are removed;
    the content between the fences is kept unchanged.
    """
    # Order matters: the language-tagged fence must be removed before the
    # bare fence, otherwise a stray "html" would be left behind.
    fence_markers = ("```html", "```")
    cleaned = text
    for marker in fence_markers:
        cleaned = cleaned.replace(marker, "")
    return cleaned
|
|
|
|
|
def convert_zerox(path: str, file_name: str):
    """Convert the document at *path* to cleaned-up text using zerox.

    Args:
        path: Location of the input file; passed to zerox as ``file_path``.
        file_name: Name of the sub-directory of ``ZEROX_DEBUG_PATH`` that is
            created and handed to zerox as its ``output_dir``.

    Returns:
        A ``(text, [])`` tuple. ``text`` is the content of every returned
        page joined with blank lines, with code-fence markers and image
        references stripped. The second element is always an empty list.

    Note:
        The conversion is driven with ``asyncio.run``, which cannot be
        called while another event loop is running in the same thread.
    """
    output_dir = ZEROX_DEBUG_PATH / file_name
    # parents=True: the debug root is only created at import time and may
    # have been removed since (e.g. by a temp-directory cleaner), and
    # file_name may itself contain a path separator. Without it, mkdir
    # raises FileNotFoundError in both cases.
    output_dir.mkdir(parents=True, exist_ok=True)

    async def async_convert():
        return await zerox(
            concurrency=4,
            file_path=path,
            model=MODEL_NAME,
            output_dir=output_dir,
        )

    output = asyncio.run(async_convert())

    # Stitch the per-page results into one document, then post-process.
    output_text = "\n\n".join(page.content for page in output.pages)
    output_text = clean_up_html_code_block(output_text)
    output_text = remove_images_from_markdown(output_text)
    return output_text, []
|
|