Spaces:

cxeep
/

UIE-X

Runtime error

App Files Files Community

cxeep

linjieccc commited on Apr 6, 2023

Commit

bb7a774

0 Parent(s):

Duplicate from PaddlePaddle/UIE-X

Browse files

Co-authored-by: Linjie Chen <[email protected]>

Files changed (12) hide show

.gitattributes +37 -0
README.md +14 -0
app.py +378 -0
business_card.png +3 -0
custom.jpeg +3 -0
footer.html +4 -0
header.html +18 -0
invoice.jpeg +3 -0
license.jpeg +3 -0
requirements.txt +6 -0
resume.png +3 -0
statements.png +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,37 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.psd filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: UIE-X
+emoji: 📄
+colorFrom: gray
+colorTo: pink
+sdk: gradio
+sdk_version: 3.4.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+duplicated_from: PaddlePaddle/UIE-X
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,378 @@

+#-*- coding: UTF-8 -*-
+# Copyright 2022 the HuggingFace Team.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import traceback
+import base64
+import gradio as gr
+import cv2
+from paddlenlp import Taskflow
+from paddlenlp.utils.doc_parser import DocParser
+doc_parser = DocParser()
+task_instance = Taskflow(
+    "information_extraction",
+    model="uie-x-base",
+    task_path="PaddlePaddle/uie-x-base",
+    from_hf_hub=True)
+examples = [
+    [
+        "business_card.png",
+        "Name;Title;Web Link;Email;Address",
+    ],
+    [
+        "license.jpeg",
+        "Name;DOB;ISS;EXP",
+    ],
+    [
+        "statements.png",
+        "Date|Gross profit",
+    ],
+    [
+        "invoice.jpeg",
+        "名称;纳税人识别号;开票日期",
+    ],
+    [
+        "custom.jpeg",
+        "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号"
+    ],
+    [
+        "resume.png",
+        "职位;年龄;学校|时间;学校|专业",
+    ],
+]
+example_files = {
+    "Name;Title;Web Link;Email;Address": "business_card.png",
+    "Name;DOB;ISS;EXP": "license.jpeg",
+    "Date|Gross profit": "statements.png",
+    "职位;年龄;学校|时间;学校|专业": "resume.png",
+    "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号": "custom.jpeg",
+    "名称;纳税人识别号;开票日期": "invoice.jpeg",
+}
+lang_map = {
+    "resume.png": "ch",
+    "custom.jpeg": "ch",
+    "business_card.png": "en",
+    "invoice.jpeg": "ch",
+    "license.jpeg": "en",
+    "statements.png": "en",
+}
+def dbc2sbc(s):
+    rs = ""
+    for char in s:
+        code = ord(char)
+        if code == 0x3000:
+            code = 0x0020
+        else:
+            code -= 0xfee0
+        if not (0x0021 <= code and code <= 0x7e):
+            rs += char
+            continue
+        rs += chr(code)
+    return rs
+def np2base64(image_np):
+    image = cv2.imencode('.jpg', image_np)[1]
+    base64_str = str(base64.b64encode(image))[2:-1]
+    return base64_str
+def process_path(path):
+    error = None
+    if path:
+        try:
+            if path.endswith(".pdf"):
+                images_list = [doc_parser.read_pdf(path)]
+            else:
+                images_list = [doc_parser.read_image(path)]
+            return (
+                path,
+                gr.update(visible=True, value=images_list),
+                gr.update(visible=True),
+                gr.update(visible=False, value=None),
+                gr.update(visible=False, value=None),
+                None,
+            )
+        except Exception as e:
+            traceback.print_exc()
+            error = str(e)
+    return (
+        None,
+        gr.update(visible=False, value=None),
+        gr.update(visible=False),
+        gr.update(visible=False, value=None),
+        gr.update(visible=False, value=None),
+        gr.update(visible=True, value=error) if error is not None else None,
+        None,
+    )
+def process_upload(file):
+    if file:
+        return process_path(file.name)
+    else:
+        return (
+            None,
+            gr.update(visible=False, value=None),
+            gr.update(visible=False),
+            gr.update(visible=False, value=None),
+            gr.update(visible=False, value=None),
+            None,
+        )
+def get_schema(schema_str):
+    def _is_ch(s):
+        for ch in s:
+            if "\u4e00" <= ch <= "\u9fff":
+                return True
+        return False
+    schema_lang = "ch" if _is_ch(schema_str) else "en"
+    schema = schema_str.split(";")
+    schema_list = []
+    for s in schema:
+        cand = s.split("|")
+        if len(cand) == 1:
+            schema_list.append(cand[0])
+        else:
+            subject = cand[0]
+            relations = cand[1:]
+            added = False
+            for a in schema_list:
+                if isinstance(a, dict):
+                    if subject in a.keys():
+                        a[subject].extend(relations)
+                        added = True
+                        break
+            if not added:
+                a = {subject: relations}
+                schema_list.append(a)
+    return schema_list, schema_lang
+def run_taskflow(document, schema, argument):
+    task_instance.set_schema(schema)
+    task_instance.set_argument(argument)
+    return task_instance({'doc': document})
+def process_doc(document, schema, ocr_lang, layout_analysis):
+    if [document, schema] in examples:
+        ocr_lang = lang_map[document]
+    if not schema:
+        schema = '时间;组织机构;人物'
+    if document is None:
+        return None, None
+    layout_analysis = True if layout_analysis == "yes" else False
+    schema, schema_lang = get_schema(dbc2sbc(schema))
+    argument = {
+        "ocr_lang": ocr_lang,
+        "schema_lang": schema_lang,
+        "layout_analysis": layout_analysis
+    }
+    prediction = run_taskflow(document, schema, argument)[0]
+    if document.endswith(".pdf"):
+        _image = doc_parser.read_pdf(document)
+    else:
+        _image = doc_parser.read_image(document)
+    img_show = doc_parser.write_image_with_results(
+        np2base64(_image),
+        result=prediction,
+        return_image=True)
+    img_list = [img_show]
+    return (
+        gr.update(visible=True, value=img_list),
+        gr.update(visible=True, value=prediction),
+    )
+def load_example_document(img, schema, ocr_lang, layout_analysis):
+    if img is not None:
+        document = example_files[schema]
+        choice = lang_map[document].split("-")
+        ocr_lang = choice[0]
+        preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
+        return document, schema, preview, gr.update(visible=True), answer
+    else:
+        return None, None, None, gr.update(visible=False), None
+def read_content(file_path: str) -> str:
+    """read the content of target file
+    """
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+    return content
+with gr.Blocks() as demo:
+    gr.HTML(read_content("header.html"))
+    gr.Markdown(
+        "Open-sourced by [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP), **UIE-X** is a universal information extraction engine for both scanned document and text inputs. It supports Entity Extraction, Relation Extraction and Event Extraction tasks. "
+        "UIE-X performs well on a zero-shot settings, which is enabled by a flexible schema that allows you to specify extraction targets with simple natural language. "
+        "Moreover, on [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP), we provide a comprehensive and easy-to-use fine-tuning and few-shot customization workflow. <br>"
+        "Want to dive deeper? Check out our [AIStudio Notebook](https://aistudio.baidu.com/aistudio/projectdetail/5261592) and [Colab Notebook](https://colab.research.google.com/drive/1ZY_ELZgoemJNoa6baWpgtzebLgoCT8MK?usp=sharing). "
+        "For more details, please visit our [GitHub](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/README_en.md)"
+    )
+    document = gr.Variable()
+    is_text = gr.Variable()
+    example_schema = gr.Textbox(visible=False)
+    example_image = gr.Image(visible=False)
+    with gr.Row(equal_height=True):
+        with gr.Column():
+            with gr.Row():
+                gr.Markdown("## 1. Select a file 选择文件", elem_id="select-a-file")
+                img_clear_button = gr.Button(
+                    "Clear", variant="secondary", elem_id="file-clear", visible=False
+                )
+            image = gr.Gallery(visible=False)
+            with gr.Row(equal_height=True):
+                with gr.Column():
+                    with gr.Row():
+                        url = gr.Textbox(
+                            show_label=False,
+                            placeholder="URL",
+                            lines=1,
+                            max_lines=1,
+                            elem_id="url-textbox",
+                        )
+                        submit = gr.Button("Get")
+                    url_error = gr.Textbox(
+                        visible=False,
+                        elem_id="url-error",
+                        max_lines=1,
+                        interactive=False,
+                        label="Error",
+                    )
+            gr.Markdown("## <center> — or — </center>")
+            upload = gr.File(label=None, interactive=True, elem_id="short-upload-box")
+            gr.Examples(
+                examples=examples,
+                inputs=[example_image, example_schema],
+            )
+        with gr.Column():
+            gr.Markdown("## 2. Information Extraction 信息抽取 ")
+            gr.Markdown("### 👉 Set a schema 设置schema")
+            gr.Markdown("Entity extraction: entity type should be separated by ';', e.g. **Person;Organization**")
+            gr.Markdown("实体抽取：实体类别之间以';'分割，例如 **人物；组织机构**")
+            gr.Markdown("Relation extraction: set the subject and relation type, separated by '|', e.g. **Person|Date;Person|Email**")
+            gr.Markdown("关系抽取：需配置主体和关系类别，中间以'|'分割，例如 **人物|出生时间；人物|邮箱**")
+            gr.Markdown("### 👉 Model customization 模型定制")
+            gr.Markdown("We recommend to further improve the extraction performance in specific domain through the process of [data annotation & fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)")
+            gr.Markdown("我们建议通过[数据标注+微调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)的流程进一步增强模型在特定场景的效果")
+            schema = gr.Textbox(
+                label="Schema",
+                placeholder="e.g. Name|Company;Name|Position;Email;Phone Number",
+                lines=1,
+                max_lines=1,
+            )
+            ocr_lang = gr.Radio(
+                choices=["ch", "en"],
+                value="en",
+                label="OCR语言 / OCR Language (Please choose ch for Chinese images.)",
+            )
+            layout_analysis = gr.Radio(
+                choices=["yes", "no"],
+                value="no",
+                label="版面分析 / Layout analysis (Better extraction for multi-line text)",
+            )
+            with gr.Row():
+                clear_button = gr.Button("Clear", variant="secondary")
+                submit_button = gr.Button(
+                    "Submit", variant="primary", elem_id="submit-button"
+                )
+            with gr.Column():
+                output = gr.JSON(label="Output", visible=False)
+    for cb in [img_clear_button, clear_button]:
+        cb.click(
+            lambda _: (
+                gr.update(visible=False, value=None),
+                None,
+                gr.update(visible=False, value=None),
+                gr.update(visible=False),
+                None,
+                None,
+                None,
+                gr.update(visible=False, value=None),
+                None,
+            ),
+            inputs=clear_button,
+            outputs=[
+                image,
+                document,
+                output,
+                img_clear_button,
+                example_image,
+                upload,
+                url,
+                url_error,
+                schema,
+            ],
+        )
+    upload.change(
+        fn=process_upload,
+        inputs=[upload],
+        outputs=[document, image, img_clear_button, output, url_error],
+    )
+    submit.click(
+        fn=process_path,
+        inputs=[url],
+        outputs=[document, image, img_clear_button, output, url_error],
+    )
+    schema.submit(
+        fn=process_doc,
+        inputs=[document, schema, ocr_lang, layout_analysis],
+        outputs=[image, output],
+    )
+    submit_button.click(
+        fn=process_doc,
+        inputs=[document, schema, ocr_lang, layout_analysis],
+        outputs=[image, output],
+    )
+    example_image.change(
+        fn=load_example_document,
+        inputs=[example_image, example_schema, ocr_lang, layout_analysis],
+        outputs=[document, schema, image, img_clear_button, output],
+    )
+    gr.HTML(read_content("footer.html"))
+if __name__ == "__main__":
+    demo.queue().launch()

business_card.png ADDED Viewed

Git LFS Details

SHA256: 68aa93a2b4122a517fac752507a4c65218fbaccbf16385afec02dbac0ecdbbdc
Pointer size: 131 Bytes
Size of remote file: 313 kB

custom.jpeg ADDED Viewed

Git LFS Details

SHA256: b0d83ab6cac4747e00192474a2e8636285bddfcab884c4083ad30c6284f13b10
Pointer size: 131 Bytes
Size of remote file: 520 kB

footer.html ADDED Viewed

	@@ -0,0 +1,4 @@

+<div class="footer">
+    <p>Model by <a href="https://github.com/PaddlePaddle/PaddleNLP" style="text-decoration: underline;" target="_blank">PaddleNLP</a> and Gradio Demo by 🤗 Hugging Face
+    </p>
+</div>

header.html ADDED Viewed

	@@ -0,0 +1,18 @@

+<div style="text-align: center; max-width: 650px; margin: 0 auto;">
+    <p align="center">
+      <img src="https://user-images.githubusercontent.com/1371212/175816733-8ec25eb0-9af3-4380-9218-27c154518258.png" align="middle"  width="500" />
+    </p>
+    <div
+      style="
+        display: inline-flex;
+        align-items: center;
+        gap: 0.8rem;
+        font-size: 1.75rem;
+        margin-bottom: 10px;
+        justify-content: center;
+      ">
+    <a href="https://github.com/PaddlePaddle/PaddleNLP"><h1 style="font-weight: 900; align-items: center; margin-bottom: 7px;">
+      UIE-X
+    </h1></a>
+    </div>
+  </div>

invoice.jpeg ADDED Viewed

Git LFS Details

SHA256: a3afad8c016954d8f5b1e79cc9209ca54318c860e0228a812d3e75805cd50f4b
Pointer size: 132 Bytes
Size of remote file: 2.83 MB

license.jpeg ADDED Viewed

Git LFS Details

SHA256: 3fd243446a474f8c7de06b92da796e6a36d0604b4c83d7c30c027a8d3525a766
Pointer size: 131 Bytes
Size of remote file: 102 kB

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+numpy==1.21.6
+opencv-python
+paddlenlp>=2.4.8
+paddleocr
+-f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+paddlepaddle-gpu==2.4.1.post112

resume.png ADDED Viewed

Git LFS Details

SHA256: 7be8498397a59f6aedf3cbee96041aea96b5d8f1aa667cf1d3ac5e93a7716734
Pointer size: 131 Bytes
Size of remote file: 191 kB

statements.png ADDED Viewed

Git LFS Details

SHA256: 5397dde321cc290817cf74fc264de04a2de396ab3ce0b5fd271d0ddfe6bce485
Pointer size: 130 Bytes
Size of remote file: 80.9 kB