# File size: 1,207 Bytes
# acbe414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import asyncio
import re
from pathlib import Path

from pyzerox import zerox


def remove_images_from_markdown(markdown_text):
    """Return *markdown_text* with embedded images stripped out.

    Two forms are removed, in this order:

    * HTML image tags, i.e. anything from a literal ``<img`` up to the next
      ``>`` (the match is case-sensitive, so ``<IMG ...>`` is left alone).
    * Markdown image syntax ``![alt](target)``.

    Ordinary Markdown links (``[text](target)``) are not touched.
    """
    image_patterns = (
        r"<img[^>]*>",
        r"!\[[^\]]*\]\([^)]*\)",
    )
    stripped = markdown_text
    for pattern in image_patterns:
        stripped = re.sub(pattern, "", stripped)
    return stripped


# Root directory for zerox output; convert_zerox() creates one subdirectory
# per `file_name` beneath it.
ZEROX_DEBUG_PATH = Path("/tmp/zerox_debug")
# Runs at import time. exist_ok=True makes this a no-op when the directory
# is already present.
ZEROX_DEBUG_PATH.mkdir(exist_ok=True)
# Model identifier passed as `model=` to zerox() in convert_zerox().
MODEL_NAME = "gemini/gemini-2.0-flash"


def clean_up_html_code_block(text: str):
    """Return *text* with Markdown code-fence markers deleted.

    Every ```` ```html ```` opener is removed first, then every remaining
    triple-backtick sequence. Only the fence markers are dropped; whatever
    was between them stays in place. Because all triple backticks are
    removed, a fence tagged with another language keeps its tag text
    (e.g. ```` ```python ```` becomes ``python``).
    """
    # Order matters: the longer marker must be handled before the bare fence,
    # otherwise the "html" tag would be left behind.
    fence_markers = ("```html", "```")
    for marker in fence_markers:
        text = text.replace(marker, "")
    return text


def convert_zerox(path: str, file_name: str) -> tuple[str, list]:
    """Convert a document to text with zerox and post-process the result.

    Args:
        path: Location of the input document; forwarded to ``zerox`` as
            ``file_path``.
        file_name: Name of the subdirectory under ``ZEROX_DEBUG_PATH`` that
            is handed to ``zerox`` as its ``output_dir``.

    Returns:
        A 2-tuple ``(text, extras)``. ``text`` is the content of every page
        joined by a blank line, with code-fence markers and image references
        removed. ``extras`` is always an empty list here.
        NOTE(review): the second element presumably keeps the return shape
        aligned with other converters -- confirm against callers.
    """
    output_dir = ZEROX_DEBUG_PATH / file_name
    # parents=True: the base directory is created once at import time and may
    # have been removed since (it lives under /tmp), and file_name may contain
    # nested components. Without it mkdir raises FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    async def async_convert():
        return await zerox(
            concurrency=4,
            file_path=path,
            model=MODEL_NAME,
            output_dir=output_dir,
        )

    # zerox is awaited inside a coroutine, so it is driven to completion on a
    # fresh event loop to keep this function synchronous for callers.
    output = asyncio.run(async_convert())
    output_text = "\n\n".join(page.content for page in output.pages)
    output_text = clean_up_html_code_block(output_text)
    output_text = remove_images_from_markdown(output_text)
    return output_text, []