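# Spaces: Running / Running
# NOTE(review): the line above appears to be status text captured from the
# hosting page header rather than program text -- confirm and drop if so.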
# import re | |
# import gradio as gr | |
# import torch | |
# from transformers import DonutProcessor, VisionEncoderDecoderModel | |
# processor = DonutProcessor.from_pretrained("pacman2223/univ-docu-model-v3") | |
# model = VisionEncoderDecoderModel.from_pretrained("pacman2223/univ-docu-model-v3") | |
# device = "cuda" if torch.cuda.is_available() else "cpu" | |
# model.to(device) | |
# def process_document(image, question): | |
# # prepare encoder inputs | |
# pixel_values = processor(image, return_tensors="pt").pixel_values | |
# # prepare decoder inputs | |
# task_prompt = "{user_input}" | |
# prompt = task_prompt.replace("{user_input}", question) | |
# decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids | |
# # generate answer | |
# outputs = model.generate( | |
# pixel_values.to(device), | |
# decoder_input_ids=decoder_input_ids.to(device), | |
# max_length=model.decoder.config.max_position_embeddings, | |
# early_stopping=True, | |
# pad_token_id=processor.tokenizer.pad_token_id, | |
# eos_token_id=processor.tokenizer.eos_token_id, | |
# use_cache=True, | |
# num_beams=1, | |
# bad_words_ids=[[processor.tokenizer.unk_token_id]], | |
# return_dict_in_generate=True, | |
# ) | |
# # postprocess | |
# sequence = processor.batch_decode(outputs.sequences)[0] | |
# sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") | |
# sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token | |
# return processor.token2json(sequence) | |
# description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below." | |
# article = "<p style='text-align: center'>Model-V3</p>" | |
# demo = gr.Interface( | |
# fn=process_document, | |
# inputs=["image", "text"], | |
# outputs="json", | |
# title="Demo: Model-V3 for Document Analysis", | |
# description=description, | |
# article=article, | |
# examples=[["example_1.png", "What is the title shown?"], ["example_2.png", "When is mid semester exams?"]], | |
# cache_examples=False) | |
# demo.queue(max_size=5) | |
# demo.launch() | |
# import re | |
# import gradio as gr | |
# import torch | |
# from transformers import DonutProcessor, VisionEncoderDecoderModel | |
# import fitz # PyMuPDF | |
# from PIL import Image | |
# import io | |
# processor = DonutProcessor.from_pretrained("pacman2223/univ-docu-model-v3") | |
# model = VisionEncoderDecoderModel.from_pretrained("pacman2223/univ-docu-model-v3") | |
# device = "cuda" if torch.cuda.is_available() else "cpu" | |
# model.to(device) | |
# def pdf_to_images(pdf_file): | |
# if pdf_file is None: | |
# return None | |
# pdf_path = pdf_file.name # Get the file path | |
# images = [] | |
# try: | |
# doc = fitz.open(pdf_path) | |
# for page in doc: | |
# pix = page.get_pixmap() | |
# img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) | |
# images.append(img) | |
# return images | |
# except Exception as e: | |
# print(f"Error converting PDF: {e}") | |
# return None | |
# def process_document(pdf_file, page_number, question): | |
# if pdf_file is None: | |
# return "Please upload a PDF file." | |
# images = pdf_to_images(pdf_file) | |
# if images is None: | |
# return "Failed to process the PDF file." | |
# if page_number < 1 or page_number > len(images): | |
# return f"Invalid page number. The PDF has {len(images)} pages." | |
# image = images[page_number - 1] | |
# # prepare encoder inputs | |
# pixel_values = processor(image, return_tensors="pt").pixel_values | |
# # prepare decoder inputs | |
# task_prompt = "{user_input}" | |
# prompt = task_prompt.replace("{user_input}", question) | |
# decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids | |
# # generate answer | |
# outputs = model.generate( | |
# pixel_values.to(device), | |
# decoder_input_ids=decoder_input_ids.to(device), | |
# max_length=model.decoder.config.max_position_embeddings, | |
# early_stopping=True, | |
# pad_token_id=processor.tokenizer.pad_token_id, | |
# eos_token_id=processor.tokenizer.eos_token_id, | |
# use_cache=True, | |
# num_beams=1, | |
# bad_words_ids=[[processor.tokenizer.unk_token_id]], | |
# return_dict_in_generate=True, | |
# ) | |
# # postprocess | |
# sequence = processor.batch_decode(outputs.sequences)[0] | |
# sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") | |
# sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token | |
# return processor.token2json(sequence) | |
# def update_page_preview(pdf_file, page_number): | |
# if pdf_file is None: | |
# return None | |
# images = pdf_to_images(pdf_file) | |
# if images is None or page_number < 1 or page_number > len(images): | |
# return None | |
# return images[page_number - 1] | |
# # def update_page_slider(pdf_file): | |
# # if pdf_file is None: | |
# # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number") | |
# # images = pdf_to_images(pdf_file) | |
# # if images is None: | |
# # return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number") | |
# # return gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number") | |
# description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF file, select a page number, type a question, and click 'submit'." | |
# article = "<p style='text-align: center'>Model-V3</p>" | |
# with gr.Blocks() as demo: | |
# gr.Markdown("# Demo: Model-V3 for Document Analysis") | |
# gr.Markdown(description) | |
# with gr.Row(): | |
# with gr.Column(scale=1): | |
# pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"]) | |
# page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number") | |
# with gr.Column(scale=2): | |
# page_preview = gr.Image(label="Page Preview") | |
# question_input = gr.Textbox(label="Question") | |
# submit_button = gr.Button("Submit") | |
# output = gr.JSON(label="Output") | |
# def update_interface(pdf_file): | |
# if pdf_file is None: | |
# return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None | |
# images = pdf_to_images(pdf_file) | |
# if images is None: | |
# return gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number"), None | |
# return ( | |
# gr.Slider(minimum=1, maximum=len(images), value=1, step=1, label="Page Number"), | |
# images[0] # Show the first page by default | |
# ) | |
# pdf_input.change(update_interface, inputs=[pdf_input], outputs=[page_slider, page_preview]) | |
# page_slider.change(update_page_preview, inputs=[pdf_input, page_slider], outputs=[page_preview]) | |
# submit_button.click(process_document, inputs=[pdf_input, page_slider, question_input], outputs=[output]) | |
# demo.launch() | |
import io
import re

import fitz  # PyMuPDF
import gradio as gr
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
# Checkpoint identifier shared by the processor and the model, so the two
# can never drift apart when the checkpoint is changed.
MODEL_ID = "pacman2223/univ-docu-model-v3"

# Loaded once at import time and reused by every request.
processor = DonutProcessor.from_pretrained(MODEL_ID)
model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
def pdf_to_images(pdf_file):
    """Render every page of a PDF into a list of RGB PIL images.

    Args:
        pdf_file: The uploaded PDF. Either an object that exposes its path
            as ``.name`` or a plain path string. ``None`` means that
            nothing was uploaded.

    Returns:
        A list with one ``PIL.Image`` per page, in document order, or
        ``None`` when ``pdf_file`` is ``None`` or the PDF could not be
        opened or rendered.
    """
    if pdf_file is None:
        return None

    # Accept both upload wrappers (path stored in `.name`) and bare paths.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    try:
        # The context manager closes the document even if rendering fails,
        # so no file handle is leaked per request.
        with fitz.open(pdf_path) as doc:
            images = []
            for page in doc:
                # alpha=False keeps three bytes per pixel, which is what
                # the "RGB" mode passed to Image.frombytes expects.
                pix = page.get_pixmap(alpha=False)
                images.append(
                    Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                )
            return images
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any failure is
        # reported to the caller as "no images" rather than crashing.
        print(f"Error converting PDF: {e}")
        return None
def process_document(file, page_number, question, input_type):
    """Answer a question about one PDF page or about an uploaded image.

    Args:
        file: The uploaded file (object exposing its path as ``.name`` or a
            plain path string), or ``None`` when nothing was uploaded.
        page_number: 1-based page to read. Only used when ``input_type`` is
            ``"PDF"``.
        question: Text used as the decoder prompt.
        input_type: ``"PDF"`` selects PDF handling; any other value is
            treated as a single image.

    Returns:
        The value produced by ``processor.token2json`` for the generated
        sequence, or ``{"error": message}`` when the input is unusable.
        Errors are returned as a dict because the result is wired to a
        ``gr.JSON`` output, which tries to parse plain strings as JSON and
        would fail on a bare sentence.
    """
    if file is None:
        return {"error": "Please upload a file."}

    if input_type == "PDF":
        images = pdf_to_images(file)
        if images is None:
            return {"error": "Failed to process the PDF file."}
        # The slider value may arrive as a float; list indices must be int.
        page_index = int(page_number) - 1
        if not 0 <= page_index < len(images):
            return {"error": f"Invalid page number. The PDF has {len(images)} pages."}
        image = images[page_index]
    else:  # Image
        image_path = getattr(file, "name", file)
        try:
            with Image.open(image_path) as opened:
                # Normalise palette / grayscale / alpha images to three
                # channels before they reach the image processor.
                image = opened.convert("RGB")
        except OSError as e:
            print(f"Error opening image: {e}")
            return {"error": "Failed to process the image file."}

    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # prepare decoder inputs: the prompt is the question itself
    task_prompt = "{user_input}"
    prompt = task_prompt.replace("{user_input}", question or "")
    decoder_input_ids = processor.tokenizer(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # generate answer
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        # never emit the unknown token
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # postprocess: strip EOS/PAD markers, then the first <...> tag
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(
        processor.tokenizer.pad_token, ""
    )
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    return processor.token2json(sequence)
def update_page_preview(file, page_number, input_type):
    """Return the image to show in the preview pane.

    Args:
        file: The uploaded file (object exposing its path as ``.name`` or a
            plain path string), or ``None`` when nothing was uploaded.
        page_number: 1-based page to show. Only used when ``input_type`` is
            ``"PDF"``.
        input_type: ``"PDF"`` selects PDF handling; any other value is
            treated as a single image.

    Returns:
        A ``PIL.Image`` for the requested page or uploaded image, or
        ``None`` when there is nothing valid to preview.
    """
    if file is None:
        return None

    if input_type != "PDF":  # Image
        image_path = getattr(file, "name", file)
        try:
            return Image.open(image_path)
        except OSError as e:
            # The preview is best-effort: an unreadable file shows nothing.
            print(f"Error opening image: {e}")
            return None

    pages = pdf_to_images(file)
    if pages is None:
        return None
    # The slider value may arrive as a float; list indices must be int.
    page_index = int(page_number) - 1
    if not 0 <= page_index < len(pages):
        return None
    return pages[page_index]
description = "Gradio Demo for Model-V3, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, upload a PDF or image file, select a page number (for PDF), type a question, and click 'submit'."
# Not rendered by the Blocks layout below; kept so the module-level name
# stays available.
article = "<p style='text-align: center'>Model-V3</p>"

# NOTE(review): the nesting below is reconstructed from the statement order
# (the source lost its indentation) -- confirm that question/submit/output
# belong below the row rather than inside the right-hand column.
with gr.Blocks() as demo:
    gr.Markdown("# Demo: Model-V3 for Document Analysis")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column(scale=1):
            input_type = gr.Radio(["PDF", "Image"], label="Input Type", value="PDF")
            # Start with the PDF filter because "PDF" is preselected above.
            file_input = gr.File(label="Upload File", file_types=[".pdf"])
            page_slider = gr.Slider(minimum=1, maximum=1, value=1, step=1, label="Page Number (PDF only)")
        with gr.Column(scale=2):
            page_preview = gr.Image(label="Page/Image Preview")

    question_input = gr.Textbox(label="Question")
    submit_button = gr.Button("Submit")
    output = gr.JSON(label="Output")

    def _page_slider(visible, page_count=1):
        """Build the slider update: reset to page 1 with `page_count` pages."""
        return gr.Slider(
            visible=visible,
            minimum=1,
            maximum=page_count,
            value=1,
            step=1,
            label="Page Number (PDF only)",
        )

    def update_file_filter(selected_type):
        """Restrict uploads to the chosen input type and drop any stale file."""
        # "image" is the documented category name for image uploads.
        accepted = [".pdf"] if selected_type == "PDF" else ["image"]
        # value=None clears a previously uploaded file of the other type.
        return gr.File(label="Upload File", file_types=accepted, value=None)

    def update_interface(file, input_type):
        """Return (slider update, preview image) for a newly uploaded file."""
        if file is None:
            return _page_slider(False), None
        if input_type == "PDF":
            images = pdf_to_images(file)
            # Covers both a failed conversion (None) and an empty page list.
            if not images:
                return _page_slider(False), None
            # Show the first page by default
            return _page_slider(True, len(images)), images[0]
        # Image: no paging; reuse the preview helper to load the picture.
        return _page_slider(False), update_page_preview(file, 1, input_type)

    input_type.change(update_file_filter, inputs=[input_type], outputs=[file_input])
    file_input.change(update_interface, inputs=[file_input, input_type], outputs=[page_slider, page_preview])
    page_slider.change(update_page_preview, inputs=[file_input, page_slider, input_type], outputs=[page_preview])
    submit_button.click(process_document, inputs=[file_input, page_slider, question_input, input_type], outputs=[output])

demo.launch()