Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -33,14 +33,13 @@ def read_fn(path):
|
|
| 33 |
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
| 34 |
|
| 35 |
|
| 36 |
-
|
| 37 |
-
def parse_pdf(doc_path, output_dir, end_page_id, ocr):
|
| 38 |
os.makedirs(output_dir, exist_ok=True)
|
| 39 |
|
| 40 |
try:
|
| 41 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
| 42 |
pdf_data = read_fn(doc_path)
|
| 43 |
-
if
|
| 44 |
parse_method = "ocr"
|
| 45 |
else:
|
| 46 |
parse_method = "auto"
|
|
@@ -53,6 +52,10 @@ def parse_pdf(doc_path, output_dir, end_page_id, ocr):
|
|
| 53 |
parse_method,
|
| 54 |
False,
|
| 55 |
end_page_id=end_page_id,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
)
|
| 57 |
return local_md_dir, file_name
|
| 58 |
except Exception as e:
|
|
@@ -104,9 +107,10 @@ def replace_image_with_base64(markdown_text, image_dir_path):
|
|
| 104 |
return re.sub(pattern, replace, markdown_text)
|
| 105 |
|
| 106 |
|
| 107 |
-
def to_markdown(file_path, end_pages,
|
| 108 |
# 获取识别的md文件以及压缩包文件路径
|
| 109 |
-
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1,
|
|
|
|
| 110 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
| 111 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
| 112 |
if zip_archive_success == 0:
|
|
@@ -149,6 +153,27 @@ with open("header.html", "r") as file:
|
|
| 149 |
header = file.read()
|
| 150 |
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
if __name__ == "__main__":
|
| 153 |
with gr.Blocks() as demo:
|
| 154 |
gr.HTML(header)
|
|
@@ -156,8 +181,14 @@ if __name__ == "__main__":
|
|
| 156 |
with gr.Column(variant='panel', scale=5):
|
| 157 |
pdf_show = gr.Markdown()
|
| 158 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
| 159 |
-
with gr.Row()
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
change_bu = gr.Button("Convert")
|
| 162 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
| 163 |
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
|
|
@@ -177,7 +208,8 @@ if __name__ == "__main__":
|
|
| 177 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
| 178 |
with gr.Tab("Markdown text"):
|
| 179 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 180 |
-
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr
|
|
|
|
| 181 |
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
| 182 |
|
| 183 |
demo.launch()
|
|
|
|
| 33 |
return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
|
| 34 |
|
| 35 |
|
| 36 |
+
def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, layout_mode, formula_enable, table_enable, language):
|
|
|
|
| 37 |
os.makedirs(output_dir, exist_ok=True)
|
| 38 |
|
| 39 |
try:
|
| 40 |
file_name = f"{str(Path(doc_path).stem)}_{time.time()}"
|
| 41 |
pdf_data = read_fn(doc_path)
|
| 42 |
+
if is_ocr:
|
| 43 |
parse_method = "ocr"
|
| 44 |
else:
|
| 45 |
parse_method = "auto"
|
|
|
|
| 52 |
parse_method,
|
| 53 |
False,
|
| 54 |
end_page_id=end_page_id,
|
| 55 |
+
layout_model=layout_mode,
|
| 56 |
+
formula_enable=formula_enable,
|
| 57 |
+
table_enable=table_enable,
|
| 58 |
+
lang=language,
|
| 59 |
)
|
| 60 |
return local_md_dir, file_name
|
| 61 |
except Exception as e:
|
|
|
|
| 107 |
return re.sub(pattern, replace, markdown_text)
|
| 108 |
|
| 109 |
|
| 110 |
+
def to_markdown(file_path, end_pages, is_ocr, layout_mode, formula_enable, table_enable, language):
|
| 111 |
# 获取识别的md文件以及压缩包文件路径
|
| 112 |
+
local_md_dir, file_name = parse_pdf(file_path, './output', end_pages - 1, is_ocr,
|
| 113 |
+
layout_mode, formula_enable, table_enable, language)
|
| 114 |
archive_zip_path = os.path.join("./output", compute_sha256(local_md_dir) + ".zip")
|
| 115 |
zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path)
|
| 116 |
if zip_archive_success == 0:
|
|
|
|
| 153 |
header = file.read()
|
| 154 |
|
| 155 |
|
| 156 |
+
latin_lang = [
|
| 157 |
+
'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
| 158 |
+
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
|
| 159 |
+
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
|
| 160 |
+
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german'
|
| 161 |
+
]
|
| 162 |
+
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
| 163 |
+
cyrillic_lang = [
|
| 164 |
+
'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
| 165 |
+
'dar', 'inh', 'che', 'lbe', 'lez', 'tab'
|
| 166 |
+
]
|
| 167 |
+
devanagari_lang = [
|
| 168 |
+
'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
|
| 169 |
+
'sa', 'bgc'
|
| 170 |
+
]
|
| 171 |
+
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
| 172 |
+
|
| 173 |
+
all_lang = [""]
|
| 174 |
+
all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang])
|
| 175 |
+
|
| 176 |
+
|
| 177 |
if __name__ == "__main__":
|
| 178 |
with gr.Blocks() as demo:
|
| 179 |
gr.HTML(header)
|
|
|
|
| 181 |
with gr.Column(variant='panel', scale=5):
|
| 182 |
pdf_show = gr.Markdown()
|
| 183 |
max_pages = gr.Slider(1, 10, 5, step=1, label="Max convert pages")
|
| 184 |
+
with gr.Row():
|
| 185 |
+
layout_mode = gr.Dropdown(["layoutlmv3", "doclayout_yolo"], label="Layout model", value="layoutlmv3")
|
| 186 |
+
language = gr.Dropdown(all_lang, label="Language", value="")
|
| 187 |
+
with gr.Row():
|
| 188 |
+
formula_enable = gr.Checkbox(label="Enable formula recognition", value=True)
|
| 189 |
+
is_ocr = gr.Checkbox(label="Force enable OCR", value=False)
|
| 190 |
+
table_enable = gr.Checkbox(label="Enable table recognition(test)", value=False)
|
| 191 |
+
with gr.Row():
|
| 192 |
change_bu = gr.Button("Convert")
|
| 193 |
clear_bu = gr.ClearButton([pdf_show], value="Clear")
|
| 194 |
pdf_show = PDF(label="Please upload pdf", interactive=True, height=800)
|
|
|
|
| 208 |
latex_delimiters=latex_delimiters, line_breaks=True)
|
| 209 |
with gr.Tab("Markdown text"):
|
| 210 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
| 211 |
+
change_bu.click(fn=to_markdown, inputs=[pdf_show, max_pages, is_ocr, layout_mode, formula_enable, table_enable, language],
|
| 212 |
+
outputs=[md, md_text, output_file, pdf_show])
|
| 213 |
clear_bu.add([md, pdf_show, md_text, output_file, is_ocr])
|
| 214 |
|
| 215 |
demo.launch()
|