"""Gradio demo that runs Florence-2 vision tasks on an uploaded image."""

import ast
import os
from unittest.mock import patch

import gradio as gr
import requests
from PIL import Image, ImageDraw
from transformers import AutoModelForCausalLM, AutoProcessor
from transformers.dynamic_module_utils import get_imports

MODEL_ID = "microsoft/Florence-2-large-ft"
GROUNDING_TASK = "CAPTION_TO_PHRASE_GROUNDING"
OCR_REGION_TASK = "OCR_WITH_REGION"


def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
    """Return the imports of *filename*, dropping ``flash_attn`` for Florence-2.

    Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72.
    Only files whose path ends in ``/modeling_florence2.py`` are altered;
    every other file gets the unmodified result of ``get_imports``.
    """
    imports = get_imports(filename)
    is_florence_modeling = str(filename).endswith("/modeling_florence2.py")
    # Guard the removal: ``list.remove`` raises ValueError when the entry is
    # absent, which would abort model loading for no good reason.
    if is_florence_modeling and "flash_attn" in imports:
        imports.remove("flash_attn")
    return imports


# Patch the import scanner only while the remote code is being loaded.
# NOTE(review): the original file's indentation was lost, so whether the
# processor load sat inside this context is a guess; fixed_get_imports
# passes non-Florence-modeling files through unchanged either way.
with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)


def draw_boxes(image, quad_boxes):
    """Outline every polygon in *quad_boxes* on *image* in red.

    The image is drawn on in place and also returned, so callers that want to
    keep the original must pass a copy.

    Args:
        image: PIL image to draw on.
        quad_boxes: Iterable of coordinate sequences, each passed as-is to
            ``ImageDraw.polygon`` (presumably flat ``[x1, y1, ..., x4, y4]``
            lists as produced by Florence-2 -- confirm against the processor).

    Returns:
        The same *image* object, with the outlines drawn.
    """
    draw = ImageDraw.Draw(image)
    for box in quad_boxes:
        draw.polygon(box, outline="red", width=2)
    return image


def run_example(image, task, additional_text=""):
    """Run one Florence-2 task on *image* and return its text and image output.

    Args:
        image: PIL image from the UI, or ``None`` when nothing was uploaded.
        task: Task name without angle brackets, e.g. ``"CAPTION"``.
        additional_text: Caption to ground; only used for
            ``CAPTION_TO_PHRASE_GROUNDING``.

    Returns:
        A ``(result_text, result_image)`` tuple. ``result_text`` is the string
        form of the post-processed answer. ``result_image`` is a copy of the
        input, with OCR regions outlined for ``OCR_WITH_REGION``. When no
        image was supplied the tuple is ``("Please upload an image.", None)``.
    """
    if image is None:
        return "Please upload an image.", None

    task_prompt = f"<{task}>"
    prompt = task_prompt
    if task == GROUNDING_TASK and additional_text:
        # Florence-2 takes the extra text appended to the task token inside
        # ``text``; the processor call has no separate ``text_input`` keyword.
        prompt = task_prompt + additional_text

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3,
    )
    # Special tokens are kept because post-processing parses them.
    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False
    )[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )

    result_text = str(parsed_answer)
    result_image = image.copy()

    if task == OCR_REGION_TASK:
        try:
            # The parsed answer is keyed by the task token, so index it
            # directly instead of re-parsing its string representation.
            quad_boxes = parsed_answer[task_prompt]["quad_boxes"]
            result_image = draw_boxes(result_image, quad_boxes)
        except (KeyError, TypeError, ValueError) as exc:
            # Best effort: still return the text result if drawing fails.
            print("Failed to draw bounding boxes.", exc)

    return result_text, result_image


def update_additional_text_visibility(task):
    """Show the caption textbox only for the phrase-grounding task."""
    return gr.update(visible=(task == GROUNDING_TASK))


# Define the Gradio interface
# NOTE(review): layout nesting is reconstructed from a whitespace-mangled
# original; adjust the Row/Column grouping if the intended layout differs.
with gr.Blocks() as iface:
    gr.Markdown("# Florence-2 Image Analysis")

    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            task_dropdown = gr.Dropdown(
                choices=[
                    "CAPTION",
                    "DETAILED_CAPTION",
                    "MORE_DETAILED_CAPTION",
                    "CAPTION_TO_PHRASE_GROUNDING",
                    "OD",
                    "DENSE_REGION_CAPTION",
                    "REGION_PROPOSAL",
                    "OCR",
                    "OCR_WITH_REGION",
                ],
                label="Select Task",
                value="CAPTION",
            )
            additional_text = gr.Textbox(
                label="Additional Text (for Caption to Phrase Grounding)",
                placeholder="Enter caption here",
                visible=False,
            )
            submit_button = gr.Button("Analyze Image")

    with gr.Row():
        text_output = gr.Textbox(label="Result")
        image_output = gr.Image(label="Processed Image")

    task_dropdown.change(
        fn=update_additional_text_visibility,
        inputs=task_dropdown,
        outputs=additional_text,
    )
    submit_button.click(
        fn=run_example,
        inputs=[image_input, task_dropdown, additional_text],
        outputs=[text_output, image_output],
    )

# Launch the interface
iface.launch()