mjavaid committed on
Commit
28691d0
·
1 Parent(s): ed11a3f

first commit

Browse files
Files changed (1) hide show
  1. app.py +39 -44
app.py CHANGED
@@ -2,15 +2,13 @@ import spaces
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoProcessor, AutoModelForImageTextToText
5
- from PIL import Image
6
- import io
7
  import requests
8
  import os
9
 
10
  hf_token = os.environ.get("HF_TOKEN")
11
  model_id = "CohereForAI/aya-vision-8b"
12
 
13
- # Load model and processor immediately on startup.
14
  try:
15
  processor = AutoProcessor.from_pretrained(model_id)
16
  model = AutoModelForImageTextToText.from_pretrained(
@@ -26,36 +24,34 @@ except Exception as e:
26
  )
27
 
28
  @spaces.GPU
29
- def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_tokens=300):
30
  global processor, model
31
 
32
- # Check if the model is loaded
33
  if processor is None or model is None:
34
  return "Model failed to load. Please check the logs."
35
-
36
- # Process image input (either uploaded or from URL)
37
- if image is not None:
38
- img = Image.fromarray(image)
 
 
 
39
  elif image_url and image_url.strip():
40
- try:
41
- response = requests.get(image_url)
42
- img = Image.open(io.BytesIO(response.content))
43
- except Exception as e:
44
- return f"Error loading image from URL: {e}"
45
  else:
46
- return "Please provide either an image or an image URL."
47
-
48
- # Format the message using the Aya Vision chat template
49
  messages = [
50
  {
51
  "role": "user",
52
  "content": [
53
- {"type": "image", "source": img},
54
  {"type": "text", "text": prompt},
55
  ],
56
  },
57
  ]
58
-
59
  try:
60
  inputs = processor.apply_chat_template(
61
  messages,
@@ -65,29 +61,30 @@ def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_toke
65
  return_dict=True,
66
  return_tensors="pt"
67
  ).to(model.device)
68
-
69
  gen_tokens = model.generate(
70
  **inputs,
71
  max_new_tokens=int(max_tokens),
72
  do_sample=True,
73
  temperature=float(temperature),
74
  )
75
-
76
  response = processor.tokenizer.decode(
77
- gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
 
78
  )
79
  return response
80
  except Exception as e:
81
  return f"Error generating response: {e}"
82
 
83
- # Example inputs for testing
84
  examples = [
85
  [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
86
  [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
87
  [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
88
  ]
89
 
90
- # Create the Gradio application
91
  with gr.Blocks(title="Aya Vision 8B Demo") as demo:
92
  gr.Markdown("# Aya Vision 8B Model Demo")
93
  gr.Markdown("""
@@ -101,46 +98,44 @@ with gr.Blocks(title="Aya Vision 8B Demo") as demo:
101
  Upload an image or provide a URL, and enter a prompt to get started!
102
  """)
103
 
104
- # Display model loading status
105
  gr.Markdown(f"**Model Status:** {model_status}")
106
-
107
- gr.Markdown("### Upload an image or provide an image URL:")
108
  with gr.Tab("Upload Image"):
109
- image_input = gr.Image(label="Upload Image", type="numpy")
110
- image_url_input = gr.Textbox(label="Image URL", placeholder="Leave blank if uploading an image", visible=False)
111
  with gr.Tab("Image URL"):
112
- image_url_visible = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
113
- image_input_url = gr.Image(label="Upload Image", type="numpy", visible=False)
114
-
115
  prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
116
-
117
  with gr.Accordion("Generation Settings", open=False):
118
  temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
119
  max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
120
-
121
  generate_button = gr.Button("Generate Response", variant="primary")
122
-
123
  with gr.Column():
124
  output = gr.Textbox(label="Model Response", lines=10)
125
-
126
  gr.Markdown("### Examples")
127
  gr.Examples(
128
  examples=examples,
129
- inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
130
  outputs=output,
131
  fn=process_image_and_prompt
132
  )
133
-
134
- # Handle generation from either image or URL
135
- def generate_response(image, image_url_visible, prompt, temperature, max_tokens):
136
- return process_image_and_prompt(image, image_url_visible, prompt, temperature, max_tokens)
137
-
138
  generate_button.click(
139
  generate_response,
140
- inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
141
  outputs=output
142
  )
143
 
144
- # Launch the Gradio app
145
  if __name__ == "__main__":
146
  demo.launch()
 
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoProcessor, AutoModelForImageTextToText
 
 
5
  import requests
6
  import os
7
 
8
  hf_token = os.environ.get("HF_TOKEN")
9
  model_id = "CohereForAI/aya-vision-8b"
10
 
11
+ # Load the model and processor during startup.
12
  try:
13
  processor = AutoProcessor.from_pretrained(model_id)
14
  model = AutoModelForImageTextToText.from_pretrained(
 
24
  )
25
 
26
  @spaces.GPU
27
+ def process_image_and_prompt(uploaded_image, image_url, prompt, temperature=0.3, max_tokens=300):
28
  global processor, model
29
 
 
30
  if processor is None or model is None:
31
  return "Model failed to load. Please check the logs."
32
+
33
+ # Determine which image to use:
34
+ # If an image is uploaded, it is returned as a file path.
35
+ if uploaded_image is not None:
36
+ # If the file path does not start with "http", prefix with '/file/' so that
37
+ # the Hugging Face Space can serve it via an HTTP URL.
38
+ img_url = uploaded_image if uploaded_image.startswith("http") else f"/file/{uploaded_image}"
39
  elif image_url and image_url.strip():
40
+ img_url = image_url.strip()
 
 
 
 
41
  else:
42
+ return "Please provide either an image upload or an image URL."
43
+
44
+ # Build the message using the Aya Vision chat template.
45
  messages = [
46
  {
47
  "role": "user",
48
  "content": [
49
+ {"type": "image", "url": img_url},
50
  {"type": "text", "text": prompt},
51
  ],
52
  },
53
  ]
54
+
55
  try:
56
  inputs = processor.apply_chat_template(
57
  messages,
 
61
  return_dict=True,
62
  return_tensors="pt"
63
  ).to(model.device)
64
+
65
  gen_tokens = model.generate(
66
  **inputs,
67
  max_new_tokens=int(max_tokens),
68
  do_sample=True,
69
  temperature=float(temperature),
70
  )
71
+
72
  response = processor.tokenizer.decode(
73
+ gen_tokens[0][inputs.input_ids.shape[1]:],
74
+ skip_special_tokens=True
75
  )
76
  return response
77
  except Exception as e:
78
  return f"Error generating response: {e}"
79
 
80
+ # Example inputs for testing.
81
  examples = [
82
  [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
83
  [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
84
  [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
85
  ]
86
 
87
+ # Build the Gradio interface.
88
  with gr.Blocks(title="Aya Vision 8B Demo") as demo:
89
  gr.Markdown("# Aya Vision 8B Model Demo")
90
  gr.Markdown("""
 
98
  Upload an image or provide a URL, and enter a prompt to get started!
99
  """)
100
 
101
+ # Display model loading status.
102
  gr.Markdown(f"**Model Status:** {model_status}")
103
+
104
+ gr.Markdown("### Provide an image (upload or URL):")
105
  with gr.Tab("Upload Image"):
106
+ # Set type to 'filepath' to get the file path from the upload.
107
+ image_input = gr.Image(label="Upload Image", type="filepath")
108
  with gr.Tab("Image URL"):
109
+ image_url_input = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
110
+
 
111
  prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
112
+
113
  with gr.Accordion("Generation Settings", open=False):
114
  temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
115
  max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
116
+
117
  generate_button = gr.Button("Generate Response", variant="primary")
118
+
119
  with gr.Column():
120
  output = gr.Textbox(label="Model Response", lines=10)
121
+
122
  gr.Markdown("### Examples")
123
  gr.Examples(
124
  examples=examples,
125
+ inputs=[image_input, image_url_input, prompt, temperature, max_tokens],
126
  outputs=output,
127
  fn=process_image_and_prompt
128
  )
129
+
130
+ # Determine which image input to use when generating the response.
131
+ def generate_response(uploaded_image, image_url, prompt, temperature, max_tokens):
132
+ return process_image_and_prompt(uploaded_image, image_url, prompt, temperature, max_tokens)
133
+
134
  generate_button.click(
135
  generate_response,
136
+ inputs=[image_input, image_url_input, prompt, temperature, max_tokens],
137
  outputs=output
138
  )
139
 
 
140
  if __name__ == "__main__":
141
  demo.launch()