NiamaLynn committed on
Commit e01c044 · 1 Parent(s): 0cdb2b1

Delete app.py

Files changed (1)
  1. app.py +0 -176
app.py DELETED
@@ -1,176 +0,0 @@
- import os
- import gradio as gr
- import re
- import string
-
- from operator import itemgetter
- import collections
-
- import pypdf
- from pypdf import PdfReader
- from pypdf.errors import PdfReadError
-
- import pdf2image
- from pdf2image import convert_from_path
- import langdetect
- from langdetect import detect_langs
-
- import pandas as pd
- import numpy as np
- import random
- import tempfile
- import itertools
-
- from matplotlib import font_manager
- from PIL import Image, ImageDraw, ImageFont
- import cv2
- ## files
-
- import sys
- sys.path.insert(0, 'files/')
-
- import functions
- from functions import *
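- # functions.py (in files/) supplies the helpers used below: pdf_to_images,
- # extraction_data_from_image, prepare_inference_features, CustomDataset,
- # predictions_token_level, predictions_line_level, get_labeled_images,
- # save_image, open_image, save_csv, plus the constants max_imgboxes and image_blank.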
-
- # update pip
- os.system('python -m pip install --upgrade pip')
-
- # model
- from transformers import AutoTokenizer, AutoModelForTokenClassification
-
- import torch
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- model_id = "NiamaLynn/lilt-roberta-DocLayNet-base_lines_ml256-v1"
-
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForTokenClassification.from_pretrained(model_id)
- model.to(device)
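- # The model classifies each token (with its bounding box) into one of the
- # 11 DocLayNet layout labels listed in the app description below.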
-
- # APP outputs
- def app_outputs(uploaded_pdf):
-     filename, msg, images = pdf_to_images(uploaded_pdf)
-     num_images = len(images)
-
-     if not msg.startswith("Error with the PDF"):
-
-         # Extraction of image data (text and bounding boxes)
-         dataset, lines, row_indexes, par_boxes, line_boxes = extraction_data_from_image(images)
-         # Prepare the data in the format expected by the model
-         encoded_dataset = dataset.map(prepare_inference_features, batched=True, batch_size=64, remove_columns=dataset.column_names)
-         custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
-         # Get predictions (token level)
-         outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
-         # Get predictions (line level)
-         probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
-         # Get labeled images with line bounding boxes
-         images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
-
-         # Save the labeled images
-         img_files = list()
-         for i in range(num_images):
-             if filename != "files/blank.png":
-                 img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
-             else:
-                 img_file = filename.replace(".pdf", ".png")
-             save_image(images[i], img_file)
-             img_files.append(img_file)
-
-         # Save the labeled data as CSV files
-         csv_files = list()
-         for i in range(max_imgboxes):
-             csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
-             csv_files.append(gr.File.update(value=csv_file, visible=True))
-             df[i].to_csv(csv_file, encoding="utf-8", index=False)
-
-     else:
-         img_files, images, csv_files = [""] * max_imgboxes, [""] * max_imgboxes, [""] * max_imgboxes
-         img_files[0], img_files[1] = image_blank, image_blank
-         images[0], images[1] = open_image(image_blank), open_image(image_blank)
-         csv_file = "csv_wo_content.csv"
-         csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
-         df, df_empty = dict(), pd.DataFrame()
-         df[0], df[1] = save_csv(df_empty, csv_file), save_csv(df_empty, csv_file)
-
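-     # The return values must line up with `outputboxes` defined below:
-     # message, two image file paths, two images, two CSV files, two dataframes.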
-     return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
-
-
- # gradio APP
- with gr.Blocks(title="Inference APP for Document Understanding at line level (v1 - LiLT base)", css=".gradio-container") as demo:
-     gr.HTML("""
-     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at line level (v1 - LiLT base)</h1></div>
-     <div style="margin-top: 40px"><p>(02/12/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-linelevel-ml384" target="_blank">model LiLT base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at line level</a> (chunk size of 384 tokens).</p></div>
-     <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2202.13669" target="_blank">LiLT (Language-Independent Layout Transformer)</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) into 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
-     <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. This APP therefore first runs an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then runs LiLT (already fine-tuned on the dataset DocLayNet base at line level) on the individual tokens, and finally visualizes the results at line level!</p></div>
-     <div><p><b>It lets you get all pages of any PDF (in any language) with bounding boxes labeled at line level, together with the associated dataframes of labeled data (bounding boxes, texts, labels) :-)</b></p></div>
-     <div><p>However, the inference time per page can be high when running the model on CPU due to the number of line predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in a Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb" target="_blank">Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)</a>" on your own platform as it does not have this limit.</p></div><div style="margin-top: 20px"><p>Links to Document Understanding APPs:</p><ul><li>Line level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2" target="_blank">v2 (LayoutXLM base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-LiLT-base-LayoutXLM-base-v1" target="_blank">v1 (LilT base vs LayoutXLM base)</a></li><li>Paragraph level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">v1 (LiLT base)</a></li></ul></div><div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP in the following blog posts:</p><ul><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
-     """)
-     with gr.Row():
-         pdf_file = gr.File(label="PDF")
-     with gr.Row():
-         submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
-         reset_btn = gr.Button(value="Clear")
-     with gr.Row():
-         output_msg = gr.Textbox(label="Output message")
-     with gr.Row():
-         fileboxes = []
-         for num_page in range(max_imgboxes):
-             file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
-             fileboxes.append(file_path)
-     with gr.Row():
-         imgboxes = []
-         for num_page in range(max_imgboxes):
-             img = gr.Image(label=f"Image of the PDF page n°{num_page}")
-             imgboxes.append(img)
-     with gr.Row():
-         csvboxes = []
-         for num_page in range(max_imgboxes):
-             csv = gr.File(visible=True, label=f"CSV file at line level (page {num_page})")
-             csvboxes.append(csv)
-     with gr.Row():
-         dfboxes = []
-         for num_page in range(max_imgboxes):
-             df = gr.Dataframe(
-                 headers=["bounding boxes", "texts", "labels"],
-                 datatype=["str", "str", "str"],
-                 col_count=(3, "fixed"),
-                 visible=True,
-                 label=f"Data of page {num_page}",
-                 type="pandas",
-                 wrap=True
-             )
-             dfboxes.append(df)
-
-     outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
-     submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
-     reset_btn.click(
-         lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
-         inputs=[],
-         outputs=[pdf_file, output_msg] + fileboxes + csvboxes + dfboxes,
-     )
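-     # Note that the reset button clears the uploaded PDF, the message, the image
-     # files, the CSV files and the dataframes, but not the displayed images.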
-
-     gr.Examples(
-         [["files/example.pdf"]],
-         [pdf_file],
-         outputboxes,
-         fn=app_outputs,
-         cache_examples=True,
-     )
-
- demo.launch()
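
For reference, here is a minimal sketch of querying the model that this app loaded, directly through transformers and without the app's helper pipeline. Assumptions (not taken from the deleted file): the checkpoint is still available on the Hub and, like other LiLT checkpoints, expects one bounding box per token normalized to the 0-1000 range; the words and boxes below are invented for illustration.

    import torch
    from transformers import AutoTokenizer, AutoModelForTokenClassification

    model_id = "NiamaLynn/lilt-roberta-DocLayNet-base_lines_ml256-v1"  # same id as in app.py
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForTokenClassification.from_pretrained(model_id)

    # One made-up word per box; boxes are normalized to 0-1000 as LiLT expects.
    words = ["Invoice", "Total:", "100.00"]
    word_boxes = [[60, 40, 210, 70], [60, 800, 160, 830], [400, 800, 520, 830]]

    encoding = tokenizer(words, is_split_into_words=True, return_tensors="pt")
    # Repeat each word's box for all of its sub-tokens; special tokens get [0, 0, 0, 0].
    bbox = [word_boxes[i] if i is not None else [0, 0, 0, 0] for i in encoding.word_ids(0)]

    with torch.no_grad():
        outputs = model(**encoding, bbox=torch.tensor([bbox]))
    predictions = outputs.logits.argmax(-1).squeeze(0).tolist()
    print([model.config.id2label[p] for p in predictions])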