import torch
import streamlit as st
from PIL import Image
from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig  # , DonutProcessor


def demo_process(input_img):
    """Return the first prediction of the module-level model for one image.

    Args:
        input_img: Image passed through unchanged as the ``image=`` argument
            of ``pretrained_model.inference``.

    Returns:
        Element 0 of the ``"predictions"`` entry of the inference result.

    Raises:
        NameError: If ``pretrained_model`` has not been bound at module level
            by the time this function is called.
    """
    # NOTE(review): `pretrained_model` is not defined in the visible part of
    # this script, and `.inference(image=..., prompt=...)` looks like the API of
    # the original Donut `DonutModel` rather than of the imported
    # `VisionEncoderDecoderModel` -- confirm where the model is loaded.
    # Module-level names are only read here, so no `global` statement is needed.
    return pretrained_model.inference(image=input_img, prompt=task_prompt)["predictions"][0]


# Prompt handed to the model by demo_process; currently empty.
task_prompt = ""

# Introductory blurb rendered as preformatted text at the top of the page.
st.text('''
This is OCR-free Document Understanding Transformer nicknamed 🍩.
It was fine-tuned with 1000 receipt images -> SROIE dataset.
The original 🍩 implementation can be found on:
https://github.com/clovaai/donut
''')

# Sidebar controls: which kind of information to extract and which bundled
# sample receipt to look at.
with st.sidebar:
    information = st.radio(
        "What information inside the receipt are you interested in?",
        ('Receipt Summary', 'Receipt Menu Details', 'Extract all!'),
    )
    receipt = st.selectbox('Pick one receipt', ['1', '2', '3', '4', '5', '6'])

# Build the path once so the status text and the file actually opened cannot
# drift apart.
image_path = f"./img/receipt-{receipt}.png"

st.text(f'{information} mode is ON!\nTarget receipt: {receipt}\n(opening image @:{image_path})')

# Left open on purpose (no `with`): `image` stays a usable module-level name
# for any later code that needs the loaded receipt.
image = Image.open(image_path)
st.image(image, caption='Your target receipt')