"""Gradio demo: reorder OCR words for an uploaded image.

The user uploads an image plus a JSON file holding a list of entries. The
entry whose ``img_name`` equals the uploaded image's file name supplies the
word list and word boxes that are passed, together with the image, to
``OcrReorderPipeline``.
"""

import os
import json
import base64
from io import BytesIO

from PIL import Image
import gradio as gr
from transformers import AutoProcessor, LayoutLMv3Model, AutoTokenizer

from inference import OcrReorderPipeline

# 1) Load the model, tokenizer and processor from the Hub repository.
repo = "Uddipan107/ocr-layoutlmv3-base-t5-small"
model = LayoutLMv3Model.from_pretrained(repo)
tokenizer = AutoTokenizer.from_pretrained(repo, subfolder="preprocessor")
# apply_ocr=False: words and boxes come from the uploaded JSON, not from OCR.
processor = AutoProcessor.from_pretrained(
    repo, subfolder="preprocessor", apply_ocr=False
)
# NOTE(review): device=0 presumably selects the first GPU -- confirm against
# OcrReorderPipeline and make it configurable if CPU-only hosts are expected.
pipe = OcrReorderPipeline(model, tokenizer, processor, device=0)


def infer(image_path, json_file):
    """Run the reorder pipeline for one uploaded image.

    Args:
        image_path: Path of the uploaded image as provided by
            ``gr.Image(type="filepath")``; ``None`` when nothing was uploaded.
        json_file: The uploaded JSON file as provided by ``gr.File``. Either
            a path string or an object exposing the path as ``.name``;
            ``None`` when nothing was uploaded.

    Returns:
        The first element of the pipeline output, or a message starting with
        "❌" when an input is missing, the JSON cannot be parsed, or no entry
        matches the image's file name.
    """
    # Guard against a submit with a missing upload instead of crashing on None.
    if not image_path or json_file is None:
        return "❌ Please upload both an image and a JSON file."

    # 2) Extract the file name the user uploaded.
    img_name = os.path.basename(image_path)

    # gr.File hands over either a plain path or a file-like wrapper with
    # ``.name`` depending on the Gradio version/configuration; accept both.
    if isinstance(json_file, (str, os.PathLike)):
        json_path = json_file
    else:
        json_path = json_file.name

    # 3) Load the entire JSON; it is expected to be a list of entries.
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as err:
        return f"❌ Could not parse JSON file: {err}"

    # 4) Find the entry matching this image. Elements that are not dicts or
    #    that lack "img_name" are skipped rather than raising.
    entry = next(
        (
            e
            for e in data
            if isinstance(e, dict) and e.get("img_name") == img_name
        ),
        None,
    )
    if entry is None:
        return f"❌ No JSON entry found for image '{img_name}'"

    words = entry["src_word_list"]
    boxes = entry["src_wordbox_list"]

    # 5) Read the image and encode it as base64 PNG for the pipeline.
    img = Image.open(image_path).convert("RGB")
    buf = BytesIO()
    img.save(buf, format="PNG")
    b64 = base64.b64encode(buf.getvalue()).decode()

    # 6) Call the pipeline and return the first result (the reordered text).
    return pipe(b64, words, boxes)[0]


demo = gr.Interface(
    fn=infer,
    inputs=[
        # Get the file path so the file name can be matched against the JSON.
        gr.Image(type="filepath", label="Upload Image"),
        # The JSON file containing a list of entries.
        gr.File(label="Upload JSON file"),
    ],
    outputs="text",
    title="OCR Reorder (match image → JSON entry)",
)

if __name__ == "__main__":
    demo.launch()