import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import io
import requests
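# `spaces` is the Hugging Face Spaces helper package; its GPU decorator
# (used below) requests GPU time on ZeroGPU hardware for a function call
import spaces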

# Initialize model and processor globally for caching
model_id = "CohereForAI/aya-vision-8b"
processor = None
model = None

def load_model():
    global processor, model
    if processor is None or model is None:
        try:
            processor = AutoProcessor.from_pretrained(model_id)
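            # device_map="auto" lets accelerate place the weights on the
            # available GPU(s); float16 halves memory use versus float32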
            model = AutoModelForImageTextToText.from_pretrained(
                model_id, device_map="auto", torch_dtype=torch.float16
            )
            return "Model loaded successfully!"
        except Exception as e:
            return f"Error loading model: {e}\nMake sure to install the correct version of transformers with: pip install 'git+https://github.com/huggingface/[email protected]'"
    return "Model already loaded!"
@spaces.GPU
def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_tokens=300):
    global processor, model
    
    # Ensure model is loaded
    if processor is None or model is None:
        return "Please load the model first using the 'Load Model' button."
    
    # Process image input (either uploaded or from URL)
    if image is not None:
        img = Image.fromarray(image)
    elif image_url and image_url.strip():
        try:
            response = requests.get(image_url, timeout=15)
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))
        except Exception as e:
            return f"Error loading image from URL: {e}"
    else:
        return "Please provide either an image or an image URL."
    
    # Format message with the aya-vision chat template
    messages = [
        {"role": "user",
         "content": [
           {"type": "image", "source": img},
            {"type": "text", "text": prompt},
        ]},
    ]

    # Process input
    try:
        inputs = processor.apply_chat_template(
            messages, 
            padding=True, 
            add_generation_prompt=True, 
            tokenize=True, 
            return_dict=True, 
            return_tensors="pt"
        ).to(model.device)
        
        # Generate response
        gen_tokens = model.generate(
            **inputs, 
            max_new_tokens=int(max_tokens), 
            do_sample=True, 
            temperature=float(temperature),
        )

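        # Decode only the newly generated tokens, skipping the prompt portion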
        response = processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error generating response: {e}"

# Define example inputs
examples = [
    [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
    [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
    [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
]

# Create Gradio application
with gr.Blocks(title="Aya Vision 8B Demo") as demo:
    gr.Markdown("# Aya Vision 8B Model Demo")
    gr.Markdown("""
    This app demonstrates the C4AI Aya Vision 8B model, an 8-billion parameter vision-language model with capabilities including:
    - OCR (reading text from images)
    - Image captioning
    - Visual reasoning
    - Question answering
    - Support for 23 languages
    
    Upload an image or provide a URL, and enter a prompt to get started!
    """)
    
    with gr.Row():
        with gr.Column():
            load_button = gr.Button("Load Model", variant="primary")
            status = gr.Textbox(label="Model Status", placeholder="Model not loaded yet. Click 'Load Model' to start.")
            
            gr.Markdown("### Upload an image or provide an image URL:")
            with gr.Tab("Upload Image"):
                image_input = gr.Image(label="Upload Image", type="numpy")
                image_url_input = gr.Textbox(label="Image URL", placeholder="Leave blank if uploading an image", visible=False)
            
            with gr.Tab("Image URL"):
                image_url_visible = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
                image_input_url = gr.Image(label="Upload Image", type="numpy", visible=False)
                
            prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
            
            with gr.Accordion("Generation Settings", open=False):
                temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
                max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
            
            generate_button = gr.Button("Generate Response", variant="primary")
        
        with gr.Column():
            output = gr.Textbox(label="Model Response", lines=10)
    
    # Add examples section
    gr.Markdown("### Examples")
    gr.Examples(
        examples=examples,
        inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
        outputs=output,
        fn=process_image_and_prompt
    )

    # Define button click behavior
    load_button.click(load_model, inputs=None, outputs=status)

    # Generate from either the uploaded image or the URL field
    generate_button.click(
        process_image_and_prompt,
        inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
        outputs=output
    )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()
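
# To run locally (a sketch; assumes a CUDA-capable GPU and that this file is
# saved as app.py -- the package list beyond the pinned transformers branch
# is inferred from the imports above, not taken from upstream docs):
#   pip install gradio torch accelerate pillow requests spaces
#   pip install "git+https://github.com/huggingface/[email protected]"
#   python app.py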