"""Gradio demo for the OCR reorder pipeline.

Loads a LayoutLMv3 checkpoint from the Hugging Face Hub and exposes a small
web UI that accepts an image plus OCR words/boxes (as JSON strings) and
returns the reordered text produced by ``OcrReorderPipeline``.
"""
import base64
import json
from io import BytesIO

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, LayoutLMv3Model

from inference import OcrReorderPipeline

# 1) Load from the model repo, pointing at the `preprocessor/` folder.
repo = "Uddipan107/ocr-layoutlmv3-base-t5-small"
model = LayoutLMv3Model.from_pretrained(repo)
tokenizer = AutoTokenizer.from_pretrained(repo, subfolder="preprocessor")
processor = AutoProcessor.from_pretrained(
    repo, subfolder="preprocessor", apply_ocr=False
)

# 2) Instantiate the pipeline.
# FIX: the original hard-coded device=0 (first CUDA GPU), which crashes on
# CPU-only hosts (e.g. a free Hugging Face Space). Fall back to CPU (-1)
# when no GPU is available; this also gives the `torch` import a purpose.
device = 0 if torch.cuda.is_available() else -1
pipe = OcrReorderPipeline(model, tokenizer, processor, device=device)


def infer(image, words_json, boxes_json):
    """Run the reorder pipeline on a single image.

    Args:
        image: PIL image supplied by the Gradio ``Image`` widget.
        words_json: JSON-encoded list of OCR words.
        boxes_json: JSON-encoded list of word bounding boxes.

    Returns:
        The first (and only) output string from the pipeline.

    Raises:
        json.JSONDecodeError: If either JSON textbox holds invalid JSON.
    """
    words = json.loads(words_json)
    boxes = json.loads(boxes_json)

    # Encode PIL image -> PNG -> base64, the transport format the
    # pipeline expects for its image argument.
    buf = BytesIO()
    image.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode()

    return pipe(b64, words, boxes)[0]


# 3) Gradio UI
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Image(type="pil", label="Image"),
        gr.Textbox(label="Words (JSON list)"),
        gr.Textbox(label="Boxes (JSON list)"),
    ],
    outputs="text",
    title="OCR Reorder Pipeline",
)

if __name__ == "__main__":
    demo.launch()