import os
import subprocess

# Upgrade transformers *before* importing it, so the upgraded version is the
# one actually loaded in this process; upgrading after the import would only
# take effect on the next restart.
try:
    subprocess.run('pip install --upgrade transformers', check=True, shell=True)
    print("Successfully upgraded transformers.")
except subprocess.CalledProcessError as e:
    print(f"Error upgrading transformers: {e}")
    print("Continuing with the current version, but this may cause issues.")

# Attempt to install flash-attn (optional, for performance). Extend the
# existing environment rather than replacing it: passing a bare env dict
# would drop PATH and break the pip invocation.
try:
    subprocess.run(
        'pip install flash-attn --no-build-isolation',
        env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'},
        check=True,
        shell=True,
    )
    print("Successfully installed flash-attn.")
except subprocess.CalledProcessError as e:
    print(f"Error installing flash-attn: {e}")
    print("Continuing without flash-attn.")

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Determine the device to use
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Helper to load a Florence-2 checkpoint and its processor, returning
# (None, None) on failure so the UI can report the error at request time.
def load_model_and_processor(model_name):
    try:
        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device).eval()
        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
        print(f"Loaded {model_name} successfully.")
        return model, processor
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        return None, None

# Load the base and large models and their processors
vision_language_model_base, vision_language_processor_base = load_model_and_processor('microsoft/Florence-2-base')
vision_language_model_large, vision_language_processor_large = load_model_and_processor('microsoft/Florence-2-large')

def describe_image(uploaded_image, model_choice):
    """
    Generates a detailed description of the input image using the selected model.

    Args:
        uploaded_image (PIL.Image.Image): The image to describe.
        model_choice (str): The model to use, either "Base" or "Large".

    Returns:
        str: A detailed textual description of the image or an error message.
    """
    if uploaded_image is None:
        return "Please upload an image."

    if model_choice == "Base":
        if vision_language_model_base is None:
            return "Base model failed to load."
        model = vision_language_model_base
        processor = vision_language_processor_base
    elif model_choice == "Large":
        if vision_language_model_large is None:
            return "Large model failed to load."
        model = vision_language_model_large
        processor = vision_language_processor_large
    else:
        return "Invalid model choice."

    # Gradio delivers a PIL image (type="pil"), but guard against raw arrays
    # and normalize to RGB, since PNG uploads may arrive as RGBA or palette images.
    if not isinstance(uploaded_image, Image.Image):
        uploaded_image = Image.fromarray(uploaded_image)
    uploaded_image = uploaded_image.convert("RGB")

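    # "<MORE_DETAILED_CAPTION>" is a Florence-2 task prompt; the same tag is
    # passed to post_process_generation below to pull the caption out of the output.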
    inputs = processor(text="<MORE_DETAILED_CAPTION>", images=uploaded_image, return_tensors="pt").to(device)
    with torch.no_grad():
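        # Deterministic decoding: beam search (3 beams) with sampling disabled;
        # max_new_tokens bounds the caption length.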
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
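    # Decode with special tokens kept: Florence-2's post_process_generation
    # expects the raw generated string, including task and terminator tokens.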
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    processed_description = processor.post_process_generation(
        generated_text,
        task="<MORE_DETAILED_CAPTION>",
        image_size=(uploaded_image.width, uploaded_image.height)
    )
    image_description = processed_description["<MORE_DETAILED_CAPTION>"]
    print("\nImage description generated:", image_description)
    return image_description
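
# Example usage outside the Gradio UI (assumes a local file "example.jpg" exists):
#     caption = describe_image(Image.open("example.jpg"), "Base")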

# Description for the interface
description = "Select the model to use for generating the image description. 'Base' is smaller and faster, while 'Large' is more accurate but slower."
if device == "cpu":
    description += " Note: running on CPU; generation may be slow, especially with the Large model."

# Create the Gradio interface
image_description_interface = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(label="Upload Image", type="pil"),
        gr.Radio(["Base", "Large"], label="Model Choice", value="Base")
    ],
    outputs=gr.Textbox(label="Generated Caption", lines=4, show_copy_button=True),
    live=False,
    title="Florence-2 Image Captioning",
    description=description
)

# Launch the interface
if __name__ == "__main__":
    image_description_interface.launch(debug=True, ssr_mode=False)