# Hugging Face Space: Aya Vision 8B demo (runs on ZeroGPU hardware).
# (Web-scrape residue — Space status badges, file size, commit hashes, and a
# line-number gutter — removed from the top of this file.)
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import io
import requests
import spaces
# Initialize model and processor globally for caching
# Hugging Face Hub id of the Aya Vision 8B vision-language model.
model_id = "CohereForAI/aya-vision-8b"
# Lazily populated by load_model(); kept as module globals so every
# request reuses the same loaded weights instead of reloading per call.
processor = None
model = None
def load_model():
    """Load the Aya Vision processor and model into the module globals.

    Idempotent: if both globals are already populated, nothing is reloaded.
    Returns a human-readable status string for display in the UI.
    """
    global processor, model
    # Guard clause: both pieces already loaded — nothing to do.
    if processor is not None and model is not None:
        return "Model already loaded!"
    try:
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForImageTextToText.from_pretrained(
            model_id, device_map="auto", torch_dtype=torch.float16
        )
    except Exception as e:
        return f"Error loading model: {e}\nMake sure to install the correct version of transformers with: pip install 'git+https://github.com/huggingface/[email protected]'"
    return "Model loaded successfully!"
@spaces.GPU  # BUG FIX: the ZeroGPU decorator is spaces.GPU (capitalized); spaces.gpu does not exist
def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_tokens=300):
    """Run Aya Vision on an image plus a text prompt and return the reply.

    Args:
        image: numpy array from the gr.Image upload widget, or None.
        image_url: URL string to fetch the image from when no upload is given.
        prompt: the user's text prompt.
        temperature: sampling temperature for generation.
        max_tokens: maximum number of new tokens to generate.

    Returns:
        The decoded model response, or a human-readable error message string
        (errors are returned, not raised, so they render in the output box).
    """
    global processor, model
    # Ensure model is loaded
    if processor is None or model is None:
        return "Please load the model first using the 'Load Model' button."
    # Process image input: an uploaded array takes priority over a URL
    if image is not None:
        img = Image.fromarray(image)
    elif image_url and image_url.strip():
        try:
            # Bounded timeout + explicit HTTP status check so a dead link or
            # 404 surfaces as a readable error instead of hanging the worker
            # or handing a non-image payload to PIL.
            response = requests.get(image_url, timeout=30)
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))
        except Exception as e:
            return f"Error loading image from URL: {e}"
    else:
        return "Please provide either an image or an image URL."
    # Format message with the aya-vision chat template
    messages = [
        {"role": "user",
         "content": [
             {"type": "image", "source": img},
             {"type": "text", "text": prompt},
         ]},
    ]
    # Process input and generate; any model-side failure is reported as text
    try:
        inputs = processor.apply_chat_template(
            messages,
            padding=True,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)
        # Generate response
        gen_tokens = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
        )
        # Decode only the newly generated tokens (slice off the prompt part)
        response = processor.tokenizer.decode(
            gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        )
        return response
    except Exception as e:
        return f"Error generating response: {e}"
# Define example inputs for gr.Examples:
# [uploaded image (None -> use URL), image URL, prompt, temperature, max_tokens]
examples = [
    [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
    [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
    # Non-English prompt: exercises the model's multilingual support.
    [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
]
# Create Gradio application: two-column layout with tabbed image input on the
# left and the model's text response on the right.
with gr.Blocks(title="Aya Vision 8B Demo") as demo:
    gr.Markdown("# Aya Vision 8B Model Demo")
    gr.Markdown("""
This app demonstrates the C4AI Aya Vision 8B model, an 8-billion parameter vision-language model with capabilities including:
- OCR (reading text from images)
- Image captioning
- Visual reasoning
- Question answering
- Support for 23 languages
Upload an image or provide a URL, and enter a prompt to get started!
""")
    with gr.Row():
        with gr.Column():
            load_button = gr.Button("Load Model", variant="primary")
            status = gr.Textbox(label="Model Status", placeholder="Model not loaded yet. Click 'Load Model' to start.")
            gr.Markdown("### Upload an image or provide an image URL:")
            # Two tabs give two input routes; each tab also holds a hidden
            # copy of the other route's widget.
            with gr.Tab("Upload Image"):
                image_input = gr.Image(label="Upload Image", type="numpy")
                image_url_input = gr.Textbox(label="Image URL", placeholder="Leave blank if uploading an image", visible=False)
            with gr.Tab("Image URL"):
                image_url_visible = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
                image_input_url = gr.Image(label="Upload Image", type="numpy", visible=False)
            prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
            with gr.Accordion("Generation Settings", open=False):
                temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
                max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
            generate_button = gr.Button("Generate Response", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Model Response", lines=10)
    # Add examples section; clicking an example populates the inputs and
    # runs process_image_and_prompt.
    gr.Markdown("### Examples")
    gr.Examples(
        examples=examples,
        inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
        outputs=output,
        fn=process_image_and_prompt
    )
    # Set up tab switching logic - hide appropriate inputs depending on tab
    # NOTE(review): these two helpers are defined but never attached to any
    # event below — tab switching currently has no effect on visibility.
    # Confirm whether they should be wired to the tabs' select events.
    def update_image_tab():
        return {image_url_input: gr.update(visible=False), image_input: gr.update(visible=True)}
    def update_url_tab():
        return {image_url_visible: gr.update(visible=True), image_input_url: gr.update(visible=False)}
    # Define button click behavior
    load_button.click(load_model, inputs=None, outputs=status)
    # Handle generation from either image or URL: thin pass-through wrapper
    # around process_image_and_prompt.
    def generate_response(image, image_url_visible, prompt, temperature, max_tokens):
        return process_image_and_prompt(image, image_url_visible, prompt, temperature, max_tokens)
    generate_button.click(
        generate_response,
        inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
        outputs=output
    )
# Launch the Gradio app (guarded so importing this module has no side effect)
if __name__ == "__main__":
    demo.launch()