cxeep linjieccc committed on
Commit
bb7a774
·
0 Parent(s):

Duplicate from PaddlePaddle/UIE-X

Browse files

Co-authored-by: Linjie Chen <[email protected]>

Files changed (12) hide show
  1. .gitattributes +37 -0
  2. README.md +14 -0
  3. app.py +378 -0
  4. business_card.png +3 -0
  5. custom.jpeg +3 -0
  6. footer.html +4 -0
  7. header.html +18 -0
  8. invoice.jpeg +3 -0
  9. license.jpeg +3 -0
  10. requirements.txt +6 -0
  11. resume.png +3 -0
  12. statements.png +3 -0
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ *.psd filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: UIE-X
3
+ emoji: 📄
4
+ colorFrom: gray
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.4.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: PaddlePaddle/UIE-X
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #-*- coding: UTF-8 -*-
2
+ # Copyright 2022 the HuggingFace Team.
3
+ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import traceback
18
+ import base64
19
+
20
+ import gradio as gr
21
+ import cv2
22
+
23
+ from paddlenlp import Taskflow
24
+ from paddlenlp.utils.doc_parser import DocParser
25
+
26
+
27
# Shared document reader: used below to load images / PDFs for preview and to
# draw extraction results onto the page image.
doc_parser = DocParser()
# Single information-extraction pipeline shared by every request.
# NOTE(review): from_hf_hub=True presumably fetches the weights from the
# Hugging Face Hub repo named in task_path -- confirm against PaddleNLP docs.
task_instance = Taskflow(
    "information_extraction",
    model="uie-x-base",
    task_path="PaddlePaddle/uie-x-base",
    from_hf_hub=True)

# Rows shown by gr.Examples: [image file, schema string]. Within a schema,
# ';' separates extraction targets and '|' separates a subject from its
# relation (see get_schema).
examples = [
    [
        "business_card.png",
        "Name;Title;Web Link;Email;Address",
    ],
    [
        "license.jpeg",
        "Name;DOB;ISS;EXP",
    ],
    [
        "statements.png",
        "Date|Gross profit",
    ],
    [
        "invoice.jpeg",
        "名称;纳税人识别号;开票日期",
    ],
    [
        "custom.jpeg",
        "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号"
    ],
    [
        "resume.png",
        "职位;年龄;学校|时间;学校|专业",
    ],
]

# Reverse lookup used by load_example_document: example schema -> image file.
# Keys must stay identical to the schema strings in `examples`.
example_files = {
    "Name;Title;Web Link;Email;Address": "business_card.png",
    "Name;DOB;ISS;EXP": "license.jpeg",
    "Date|Gross profit": "statements.png",
    "职位;年龄;学校|时间;学校|专业": "resume.png",
    "收发货人;进口口岸;进口日期;运输方式;征免性质;境内目的地;运输工具名称;包装种类;件数;合同协议号": "custom.jpeg",
    "名称;纳税人识别号;开票日期": "invoice.jpeg",
}

# OCR language forced for each bundled example image ("ch" or "en"), overriding
# whatever the user selected in the OCR-language radio.
lang_map = {
    "resume.png": "ch",
    "custom.jpeg": "ch",
    "business_card.png": "en",
    "invoice.jpeg": "ch",
    "license.jpeg": "en",
    "statements.png": "en",
}
78
+
79
def dbc2sbc(s):
    """Convert full-width (DBC) characters in *s* to half-width (SBC) ones.

    Full-width ASCII variants (U+FF01..U+FF5E) are shifted down by 0xFEE0 to
    their ASCII counterparts, and the ideographic space (U+3000) becomes a
    plain space. Every other character is copied unchanged.

    The previous implementation mapped U+3000 to 0x20 but then rejected it
    with a range check that started at 0x21, so the full-width space was
    never converted.

    Args:
        s: Input text, e.g. a schema typed with a CJK input method.

    Returns:
        The text with full-width punctuation, letters, digits and spaces
        replaced by their half-width equivalents.
    """
    converted = []
    for char in s:
        code = ord(char)
        if code == 0x3000:
            # The ideographic space lives outside the FF01-FF5E block, so it
            # needs its own mapping rather than the constant offset.
            converted.append(" ")
        elif 0xFF01 <= code <= 0xFF5E:
            converted.append(chr(code - 0xFEE0))
        else:
            converted.append(char)
    return "".join(converted)
92
+
93
+
94
def np2base64(image_np):
    """Encode an image array as JPEG and return the bytes as base64 text.

    Args:
        image_np: Image array accepted by ``cv2.imencode``.

    Returns:
        The base64 representation of the JPEG buffer as a ``str``.
    """
    _, jpeg_buffer = cv2.imencode('.jpg', image_np)
    encoded = base64.b64encode(jpeg_buffer)
    # The base64 alphabet is pure ASCII, so decoding yields exactly the text
    # between the b'...' quotes of the bytes repr.
    return encoded.decode("ascii")
98
+
99
+
100
def process_path(path):
    """Load the document at *path* and build the UI updates that preview it.

    The returned tuple has exactly one entry per component the callers are
    wired to, in this order:
    ``(document, image, img_clear_button, output, url_error)``.

    Previously the function returned 6 values on success and 7 on failure for
    those 5 outputs. On failure the update carrying the error message sat in
    a position no output receives, so ``url_error`` always got a "hide"
    update and load errors were never shown.

    Args:
        path: File path (or URL text from the URL box); files ending in
            ``.pdf`` are read with ``doc_parser.read_pdf``, everything else
            with ``doc_parser.read_image``. Falsy values mean "nothing to load".

    Returns:
        A 5-tuple of values / ``gr.update`` objects. On success the document
        state is set to *path* and the gallery and clear button are shown. On
        failure or empty input the preview is reset, and ``url_error`` is
        made visible with the exception text if loading raised.
    """
    error = None
    if path:
        try:
            if path.endswith(".pdf"):
                images_list = [doc_parser.read_pdf(path)]
            else:
                images_list = [doc_parser.read_image(path)]
        except Exception as e:
            # UI boundary: any loader failure is logged and surfaced in the
            # error textbox instead of crashing the event handler.
            traceback.print_exc()
            error = str(e)
        else:
            return (
                path,
                gr.update(visible=True, value=images_list),
                gr.update(visible=True),
                gr.update(visible=False, value=None),
                gr.update(visible=False, value=None),
            )

    if error is None:
        error_update = gr.update(visible=False, value=None)
    else:
        error_update = gr.update(visible=True, value=error)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        error_update,
    )
128
+
129
+
130
def process_upload(file):
    """Handle a change of the upload widget.

    Args:
        file: The uploaded file object (anything exposing ``.name`` as its
            path on disk), or a falsy value when the upload was cleared.

    Returns:
        The result of ``process_path(file.name)`` when a file is present.
        Otherwise a reset tuple with exactly one entry per wired output:
        ``(document, image, img_clear_button, output, url_error)``. The
        previous reset tuple carried a sixth value that no output received.
    """
    if file:
        return process_path(file.name)
    return (
        None,
        gr.update(visible=False, value=None),
        gr.update(visible=False),
        gr.update(visible=False, value=None),
        gr.update(visible=False, value=None),
    )
142
+
143
def get_schema(schema_str):
    """Parse a user-typed schema string into the structure the task expects.

    Grammar: items are separated by ``;``. An item without ``|`` is an entity
    type. An item ``subject|relation[|relation...]`` adds relations to
    *subject*, and repeated subjects are merged into one entry.

    Whitespace around labels is stripped and empty pieces (e.g. from a
    trailing ``;`` or ``|``) are ignored, so ``"Name; Title;"`` no longer
    yields ``" Title"`` and ``""`` as extraction targets. A subject left with
    no relations is treated as a plain entity.

    Args:
        schema_str: Schema text using half-width ``;`` and ``|`` separators.

    Returns:
        ``(schema_list, schema_lang)`` where ``schema_list`` mixes entity
        strings and ``{subject: [relations]}`` dicts in first-seen order, and
        ``schema_lang`` is ``"ch"`` if the text contains any character in
        U+4E00..U+9FFF, else ``"en"``.
    """
    has_chinese = any("\u4e00" <= ch <= "\u9fff" for ch in schema_str)
    schema_lang = "ch" if has_chinese else "en"

    schema_list = []
    # subject -> the relation list already stored inside schema_list, so
    # later items for the same subject extend it in place.
    relation_groups = {}
    for item in schema_str.split(";"):
        parts = [part.strip() for part in item.split("|")]
        subject = parts[0]
        relations = [rel for rel in parts[1:] if rel]
        if not subject:
            continue
        if not relations:
            schema_list.append(subject)
        elif subject in relation_groups:
            relation_groups[subject].extend(relations)
        else:
            relation_groups[subject] = relations
            schema_list.append({subject: relations})
    return schema_list, schema_lang
170
+
171
+
172
def run_taskflow(document, schema, argument):
    """Configure the shared Taskflow instance and run it on one document.

    Args:
        document: Document path handed to the task under the ``'doc'`` key.
        schema: Parsed schema passed to ``task_instance.set_schema``.
        argument: Options dict passed to ``task_instance.set_argument``.

    Returns:
        Whatever the task instance returns for the single-document input.
    """
    # The instance is module-global, so schema/arguments are re-applied on
    # every call before inference.
    task_instance.set_schema(schema)
    task_instance.set_argument(argument)
    payload = {'doc': document}
    predictions = task_instance(payload)
    return predictions
176
+
177
+
178
def process_doc(document, schema, ocr_lang, layout_analysis):
    """Run extraction on *document* and build updates for the preview/output.

    Args:
        document: Path of the loaded document, or ``None`` if nothing is loaded.
        schema: Raw schema text from the UI; a built-in default is used when empty.
        ocr_lang: OCR language choice; replaced by the ``lang_map`` entry when
            the (document, schema) pair is one of the bundled examples.
        layout_analysis: ``"yes"`` enables layout analysis, anything else disables it.

    Returns:
        ``(None, None)`` when there is no document, otherwise a pair of
        ``gr.update`` objects: the annotated image list and the first prediction.
    """
    if document is None:
        return None, None

    # Bundled examples always run with their known OCR language.
    if [document, schema] in examples:
        ocr_lang = lang_map[document]

    schema = schema or '时间;组织机构;人物'

    # Normalise full-width separators before parsing the schema.
    parsed_schema, schema_lang = get_schema(dbc2sbc(schema))
    argument = {
        "ocr_lang": ocr_lang,
        "schema_lang": schema_lang,
        "layout_analysis": layout_analysis == "yes",
    }
    prediction = run_taskflow(document, parsed_schema, argument)[0]

    # Re-read the page so the results can be drawn on top of it.
    reader = doc_parser.read_pdf if document.endswith(".pdf") else doc_parser.read_image
    page_image = reader(document)
    annotated = doc_parser.write_image_with_results(
        np2base64(page_image),
        result=prediction,
        return_image=True)

    return (
        gr.update(visible=True, value=[annotated]),
        gr.update(visible=True, value=prediction),
    )
211
+
212
+
213
def load_example_document(img, schema, ocr_lang, layout_analysis):
    """Load the bundled example matching *schema* and run extraction on it.

    Args:
        img: Value of the hidden example image component; ``None`` means no
            example is selected.
        schema: Example schema string, used as the key into ``example_files``.
        ocr_lang: Ignored for examples; overwritten from ``lang_map``.
        layout_analysis: Forwarded unchanged to ``process_doc``.

    Returns:
        A 5-tuple for ``(document, schema, image, img_clear_button, output)``;
        all reset values when *img* is ``None``.
    """
    if img is None:
        return None, None, None, gr.update(visible=False), None

    document = example_files[schema]
    # Only the part before any '-' in the language tag is used.
    ocr_lang = lang_map[document].split("-")[0]
    preview, answer = process_doc(document, schema, ocr_lang, layout_analysis)
    return document, schema, preview, gr.update(visible=True), answer
222
+
223
+
224
def read_content(file_path: str) -> str:
    """Return the full text of *file_path*, decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()
231
+
232
+
233
# Build the Gradio UI: a file-selection column (URL box, upload, examples) and
# an extraction column (schema, OCR language, layout analysis, JSON output),
# followed by the event wiring between them.
with gr.Blocks() as demo:
    gr.HTML(read_content("header.html"))
    gr.Markdown(
        "Open-sourced by [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP), **UIE-X** is a universal information extraction engine for both scanned document and text inputs. It supports Entity Extraction, Relation Extraction and Event Extraction tasks. "
        "UIE-X performs well on a zero-shot settings, which is enabled by a flexible schema that allows you to specify extraction targets with simple natural language. "
        "Moreover, on [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP), we provide a comprehensive and easy-to-use fine-tuning and few-shot customization workflow. <br>"
        "Want to dive deeper? Check out our [AIStudio Notebook](https://aistudio.baidu.com/aistudio/projectdetail/5261592) and [Colab Notebook](https://colab.research.google.com/drive/1ZY_ELZgoemJNoa6baWpgtzebLgoCT8MK?usp=sharing). "
        "For more details, please visit our [GitHub](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/README_en.md)"
    )

    # State holding the path of the currently loaded document (set by
    # process_path / load_example_document, read by process_doc).
    document = gr.Variable()
    # Not referenced by any handler in this file.
    is_text = gr.Variable()
    # Hidden components that gr.Examples fills in; a change of example_image
    # triggers load_example_document below.
    example_schema = gr.Textbox(visible=False)
    example_image = gr.Image(visible=False)
    with gr.Row(equal_height=True):
        # Left column: choose a document.
        with gr.Column():
            with gr.Row():
                gr.Markdown("## 1. Select a file 选择文件", elem_id="select-a-file")
                img_clear_button = gr.Button(
                    "Clear", variant="secondary", elem_id="file-clear", visible=False
                )
            # Preview gallery; hidden until a document has been loaded.
            image = gr.Gallery(visible=False)
            with gr.Row(equal_height=True):
                with gr.Column():
                    with gr.Row():
                        url = gr.Textbox(
                            show_label=False,
                            placeholder="URL",
                            lines=1,
                            max_lines=1,
                            elem_id="url-textbox",
                        )
                        submit = gr.Button("Get")
                    # Read-only box intended to show load errors.
                    url_error = gr.Textbox(
                        visible=False,
                        elem_id="url-error",
                        max_lines=1,
                        interactive=False,
                        label="Error",
                    )
            gr.Markdown("## <center> — or — </center>")
            upload = gr.File(label=None, interactive=True, elem_id="short-upload-box")
            gr.Examples(
                examples=examples,
                inputs=[example_image, example_schema],
            )

        # Right column: configure and run the extraction.
        with gr.Column():
            gr.Markdown("## 2. Information Extraction 信息抽取 ")
            gr.Markdown("### 👉 Set a schema 设置schema")
            gr.Markdown("Entity extraction: entity type should be separated by ';', e.g. **Person;Organization**")
            gr.Markdown("实体抽取:实体类别之间以';'分割,例如 **人物;组织机构**")
            gr.Markdown("Relation extraction: set the subject and relation type, separated by '|', e.g. **Person|Date;Person|Email**")
            gr.Markdown("关系抽取:需配置主体和关系类别,中间以'|'分割,例如 **人物|出生时间;人物|邮箱**")
            gr.Markdown("### 👉 Model customization 模型定制")
            gr.Markdown("We recommend to further improve the extraction performance in specific domain through the process of [data annotation & fine-tuning](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)")
            gr.Markdown("我们建议通过[数据标注+微调](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/applications/information_extraction/document/README_en.md)的流程进一步增强模型在特定场景的效果")

            schema = gr.Textbox(
                label="Schema",
                placeholder="e.g. Name|Company;Name|Position;Email;Phone Number",
                lines=1,
                max_lines=1,
            )

            ocr_lang = gr.Radio(
                choices=["ch", "en"],
                value="en",
                label="OCR语言 / OCR Language (Please choose ch for Chinese images.)",
            )

            layout_analysis = gr.Radio(
                choices=["yes", "no"],
                value="no",
                label="版面分析 / Layout analysis (Better extraction for multi-line text)",
            )

            with gr.Row():
                clear_button = gr.Button("Clear", variant="secondary")
                submit_button = gr.Button(
                    "Submit", variant="primary", elem_id="submit-button"
                )
            with gr.Column():
                output = gr.JSON(label="Output", visible=False)

    # Both "Clear" buttons reset the same nine components; the lambda returns
    # one value per entry of `outputs`, in the same order. It takes a single
    # ignored argument because one input component is declared.
    for cb in [img_clear_button, clear_button]:
        cb.click(
            lambda _: (
                gr.update(visible=False, value=None),
                None,
                gr.update(visible=False, value=None),
                gr.update(visible=False),
                None,
                None,
                None,
                gr.update(visible=False, value=None),
                None,
            ),
            inputs=clear_button,
            outputs=[
                image,
                document,
                output,
                img_clear_button,
                example_image,
                upload,
                url,
                url_error,
                schema,
            ],
        )

    # Loading a document, either by upload or from the URL box.
    upload.change(
        fn=process_upload,
        inputs=[upload],
        outputs=[document, image, img_clear_button, output, url_error],
    )
    submit.click(
        fn=process_path,
        inputs=[url],
        outputs=[document, image, img_clear_button, output, url_error],
    )

    # Running extraction: pressing Enter in the schema box or clicking Submit.
    schema.submit(
        fn=process_doc,
        inputs=[document, schema, ocr_lang, layout_analysis],
        outputs=[image, output],
    )

    submit_button.click(
        fn=process_doc,
        inputs=[document, schema, ocr_lang, layout_analysis],
        outputs=[image, output],
    )

    # Selecting a bundled example loads it and runs extraction immediately.
    example_image.change(
        fn=load_example_document,
        inputs=[example_image, example_schema, ocr_lang, layout_analysis],
        outputs=[document, schema, image, img_clear_button, output],
    )

    gr.HTML(read_content("footer.html"))


if __name__ == "__main__":
    # Enable request queuing before starting the server.
    demo.queue().launch()
business_card.png ADDED

Git LFS Details

  • SHA256: 68aa93a2b4122a517fac752507a4c65218fbaccbf16385afec02dbac0ecdbbdc
  • Pointer size: 131 Bytes
  • Size of remote file: 313 kB
custom.jpeg ADDED

Git LFS Details

  • SHA256: b0d83ab6cac4747e00192474a2e8636285bddfcab884c4083ad30c6284f13b10
  • Pointer size: 131 Bytes
  • Size of remote file: 520 kB
footer.html ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <div class="footer">
2
+ <p>Model by <a href="https://github.com/PaddlePaddle/PaddleNLP" style="text-decoration: underline;" target="_blank">PaddleNLP</a> and Gradio Demo by 🤗 Hugging Face
3
+ </p>
4
+ </div>
header.html ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div style="text-align: center; max-width: 650px; margin: 0 auto;">
2
+ <p align="center">
3
+ <img src="https://user-images.githubusercontent.com/1371212/175816733-8ec25eb0-9af3-4380-9218-27c154518258.png" align="middle" width="500" />
4
+ </p>
5
+ <div
6
+ style="
7
+ display: inline-flex;
8
+ align-items: center;
9
+ gap: 0.8rem;
10
+ font-size: 1.75rem;
11
+ margin-bottom: 10px;
12
+ justify-content: center;
13
+ ">
14
+ <a href="https://github.com/PaddlePaddle/PaddleNLP"><h1 style="font-weight: 900; align-items: center; margin-bottom: 7px;">
15
+ UIE-X
16
+ </h1></a>
17
+ </div>
18
+ </div>
invoice.jpeg ADDED

Git LFS Details

  • SHA256: a3afad8c016954d8f5b1e79cc9209ca54318c860e0228a812d3e75805cd50f4b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.83 MB
license.jpeg ADDED

Git LFS Details

  • SHA256: 3fd243446a474f8c7de06b92da796e6a36d0604b4c83d7c30c027a8d3525a766
  • Pointer size: 131 Bytes
  • Size of remote file: 102 kB
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ numpy==1.21.6
2
+ opencv-python
3
+ paddlenlp>=2.4.8
4
+ paddleocr
5
+ -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
6
+ paddlepaddle-gpu==2.4.1.post112
resume.png ADDED

Git LFS Details

  • SHA256: 7be8498397a59f6aedf3cbee96041aea96b5d8f1aa667cf1d3ac5e93a7716734
  • Pointer size: 131 Bytes
  • Size of remote file: 191 kB
statements.png ADDED

Git LFS Details

  • SHA256: 5397dde321cc290817cf74fc264de04a2de396ab3ce0b5fd271d0ddfe6bce485
  • Pointer size: 130 Bytes
  • Size of remote file: 80.9 kB