File size: 1,441 Bytes
6581de9
5279e45
6581de9
 
 
 
 
 
df73b43
6581de9
 
 
 
 
 
4bf6412
 
 
 
 
 
 
 
 
df73b43
4bf6412
64da888
1f662e3
f7fe7ff
df73b43
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import torch
import streamlit as st

from PIL import Image
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig  # , DonutProcessor


def demo_process(input_img):
    """Run the module-level model on one image and return its first prediction.

    Reads the module globals ``pretrained_model`` and ``task_prompt``; neither
    is modified here.

    Args:
        input_img: Image forwarded unchanged as the ``image`` argument.

    Returns:
        The first element of the ``"predictions"`` entry in the model's result.
    """
    # NOTE(review): `.inference(image=..., prompt=...)` looks like the
    # donut-python `DonutModel` API -- confirm that the object bound to
    # `pretrained_model` actually provides this method.
    result = pretrained_model.inference(image=input_img, prompt=task_prompt)
    predictions = result["predictions"]
    return predictions[0]

# Decoder prompt handed to the model by demo_process().
task_prompt = "<s>"

# Introductory text rendered at the top of the page.
st.text('''
This is OCR-free Document Understanding Transformer nicknamed 🍩. It was fine-tuned with 1000 receipt images -> SROIE dataset.
The original 🍩 implementation can be found on: https://github.com/clovaai/donut
''')

# Sidebar controls: the extraction mode and which sample receipt to parse.
with st.sidebar:
    # NOTE(review): `information` is only echoed in the status text below; it
    # does not currently influence the prompt or the model call.
    information = st.radio(
        "What information inside the receipt are you interested in?",
        ('Receipt Summary', 'Receipt Menu Details', 'Extract all!'),
    )
    # `index` is a zero-based integer position into the options list, so 5
    # preselects the last entry, receipt '6'.
    receipt = st.selectbox('Pick one receipt', ['1', '2', '3', '4', '5', '6'], index=5)

# Build the path once so the status message always names the file that is
# actually opened.
receipt_path = f"./img/receipt-{receipt}.jpg"
st.text(f'{information} mode is ON!\nTarget receipt: {receipt}\n(opening image @:{receipt_path})')

image = Image.open(receipt_path)
st.image(image, caption='Your target receipt')

st.text('baking the 🍩...')
pretrained_model = VisionEncoderDecoderModel.from_pretrained("unstructured/donut-base-sroie")
# NOTE(review): only the encoder is cast to bfloat16; the decoder keeps the
# dtype it was loaded with -- confirm this mixed precision is intended.
pretrained_model.encoder.to(torch.bfloat16)
pretrained_model.eval()

st.text('parsing receipt..')
parsed_receipt_info = demo_process(image)
st.text(f'\nRaw output:\n{parsed_receipt_info}')