prithivMLmods committed (verified)
Commit 65928b6 · Parent(s): 8a2ba41

Update app.py

Files changed (1): app.py (+239 -173)
app.py CHANGED
@@ -4,10 +4,7 @@ import uuid
 import json
 import time
 import asyncio
- import re
 from threading import Thread
- from io import BytesIO
- import subprocess
 
 import gradio as gr
 import spaces
@@ -16,57 +13,98 @@ import numpy as np
 from PIL import Image
 import edge_tts
 
- # Install flash-attn without building CUDA kernels (if needed)
- subprocess.run(
-     'pip install flash-attn --no-build-isolation',
-     env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
-     shell=True
 )
-
- from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
 from diffusers import DiffusionPipeline
 
- # ------------------------------------------------------------------------------
- # Global Configurations
- # ------------------------------------------------------------------------------
- DESCRIPTION = "# SmolVLM2 with Flux.1 Integration 📺"
- if not torch.cuda.is_available():
-     DESCRIPTION += "\n<p>⚠️Running on CPU, This may not work on CPU.</p>"
 
 css = '''
 h1 {
   text-align: center;
   display: block;
 }
 '''
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
- # ------------------------------------------------------------------------------
- # FLUX.1 IMAGE GENERATION SETUP
- # ------------------------------------------------------------------------------
- MAX_SEED = np.iinfo(np.int32).max
 
- def save_image(img: Image.Image) -> str:
-     """Save a PIL image with a unique filename and return the path."""
-     unique_name = str(uuid.uuid4()) + ".png"
-     img.save(unique_name)
-     return unique_name
 
- def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-     return seed
 
- # Initialize Flux.1 pipeline
 base_model = "black-forest-labs/FLUX.1-dev"
 pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
 lora_repo = "strangerzonehf/Flux-Super-Realism-LoRA"
- trigger_word = "Super Realism"  # Leave blank if no trigger word is needed.
 pipe.load_lora_weights(lora_repo)
 pipe.to("cuda")
 
- # Define style prompts for Flux.1
 style_list = [
     {
         "name": "3840 x 2160",
@@ -85,14 +123,48 @@ style_list = [
         "prompt": "{prompt}",
     },
 ]
- styles = {s["name"]: s["prompt"] for s in style_list}
 DEFAULT_STYLE_NAME = "3840 x 2160"
 STYLE_NAMES = list(styles.keys())
 
 def apply_style(style_name: str, positive: str) -> str:
     return styles.get(style_name, styles[DEFAULT_STYLE_NAME]).replace("{prompt}", positive)
 
- def generate_image_flux(
     prompt: str,
     seed: int = 0,
     width: int = 1024,
@@ -100,8 +172,9 @@ def generate_image_flux(
     guidance_scale: float = 3,
     randomize_seed: bool = False,
     style_name: str = DEFAULT_STYLE_NAME,
 ):
-     """Generate an image using the Flux.1 pipeline with style prompts."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     positive_prompt = apply_style(style_name, prompt)
     if trigger_word:
@@ -118,38 +191,36 @@ def generate_image_flux(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
- # ------------------------------------------------------------------------------
- # SMOLVLM2 MODEL SETUP
- # ------------------------------------------------------------------------------
- processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
- model = AutoModelForImageTextToText.from_pretrained(
-     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-     _attn_implementation="flash_attention_2",
-     torch_dtype=torch.bfloat16
- ).to("cuda:0")
-
- # ------------------------------------------------------------------------------
- # CHAT / INFERENCE FUNCTION
- # ------------------------------------------------------------------------------
 @spaces.GPU
- def model_inference(input_dict, history, max_tokens):
     """
-     Implements a chat interface using SmolVLM2.
-
-     Special behavior:
-     - If the query text starts with "@image", the Flux.1 pipeline is used to generate an image.
-     - Otherwise, the query is processed with SmolVLM2.
-     - In the SmolVLM2 branch, a progress message "Processing with SmolVLM2..." is yielded.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-
-     # If the text begins with "@image", use Flux.1 image generation.
     if text.strip().lower().startswith("@image"):
-         prompt = text[len("@image"):].strip()
-         yield "Hold Tight Generating Flux.1 Image..."
-         image_paths, used_seed = generate_image_flux(
-             prompt=prompt,
             seed=1,
             width=1024,
             height=1024,
@@ -157,126 +228,121 @@ def model_inference(input_dict, history, max_tokens):
             randomize_seed=True,
             style_name=DEFAULT_STYLE_NAME,
         )
         yield gr.Image(image_paths[0])
-         return
-
-     # Default: Use SmolVLM2 inference.
-     yield "Processing with SmolVLM2..."
-
-     user_content = []
-     media_queue = []
 
-     # If no conversation history, process current input.
-     if not history:
-         text = text.strip()
-         for file in files:
-             if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-                 media_queue.append({"type": "image", "path": file})
-             elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
-                 media_queue.append({"type": "video", "path": file})
-         if "<image>" in text or "<video>" in text:
-             parts = re.split(r'(<image>|<video>)', text)
-             for part in parts:
-                 if part == "<image>" and media_queue:
-                     user_content.append(media_queue.pop(0))
-                 elif part == "<video>" and media_queue:
-                     user_content.append(media_queue.pop(0))
-                 elif part.strip():
-                     user_content.append({"type": "text", "text": part.strip()})
         else:
-             user_content.append({"type": "text", "text": text})
-             for media in media_queue:
-                 user_content.append(media)
-         resulting_messages = [{"role": "user", "content": user_content}]
     else:
-         resulting_messages = []
-         user_content = []
-         media_queue = []
-         for hist in history:
-             if hist["role"] == "user" and isinstance(hist["content"], tuple):
-                 file_name = hist["content"][0]
-                 if file_name.endswith((".png", ".jpg", ".jpeg")):
-                     media_queue.append({"type": "image", "path": file_name})
-                 elif file_name.endswith(".mp4"):
-                     media_queue.append({"type": "video", "path": file_name})
-         for hist in history:
-             if hist["role"] == "user" and isinstance(hist["content"], str):
-                 text = hist["content"]
-                 parts = re.split(r'(<image>|<video>)', text)
-                 for part in parts:
-                     if part == "<image>" and media_queue:
-                         user_content.append(media_queue.pop(0))
-                     elif part == "<video>" and media_queue:
-                         user_content.append(media_queue.pop(0))
-                     elif part.strip():
-                         user_content.append({"type": "text", "text": part.strip()})
-             elif hist["role"] == "assistant":
-                 resulting_messages.append({
-                     "role": "user",
-                     "content": user_content
-                 })
-                 resulting_messages.append({
-                     "role": "assistant",
-                     "content": [{"type": "text", "text": hist["content"]}]
-                 })
-                 user_content = []
-         if user_content:
-             resulting_messages.append({"role": "user", "content": user_content})
-
-     if text == "" and not files:
-         yield gr.Error("Please input a query and optionally image(s).")
-         return
-     if text == "" and files:
-         yield gr.Error("Please input a text query along with the image(s).")
-         return
-
-     print("resulting_messages", resulting_messages)
-     inputs = processor.apply_chat_template(
-         resulting_messages,
-         add_generation_prompt=True,
-         tokenize=True,
-         return_dict=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to(model.device)
-
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
-
-     thread = Thread(target=model.generate, kwargs=generation_args)
-     thread.start()
-
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         time.sleep(0.01)
-         yield buffer
-
- # ------------------------------------------------------------------------------
- # GRADIO CHAT INTERFACE
- # ------------------------------------------------------------------------------
- examples = [
-     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-     [{"text": "What art era does this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
-     [{"text": "Describe this image.", "files": ["example_images/mosque.jpg"]}],
-     [{"text": "When was this purchase made and how much did it cost?", "files": ["example_images/fiche.jpg"]}],
-     [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
-     [{"text": "What is happening in the video?", "files": ["example_images/short.mp4"]}],
-     [{"text": "@image A futuristic cityscape with vibrant neon lights"}],
- ]
 
 demo = gr.ChatInterface(
-     fn=model_inference,
-     title="SmolVLM2 with Flux.1 Integration 📺",
-     description="Play with SmolVLM2 (HuggingFaceTB/SmolVLM2-2.2B-Instruct) with integrated Flux.1 image generation. Use the '@image' prefix to generate images with Flux.1.",
-     examples=examples,
-     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
-     cache_examples=False,
-     additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
-     type="messages"
 )
 
 if __name__ == "__main__":
-     demo.launch(debug=True)
 
 import json
 import time
 import asyncio
 from threading import Thread
 
 import gradio as gr
 import spaces
 
 from PIL import Image
 import edge_tts
 
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     Qwen2VLForConditionalGeneration,
+     AutoProcessor,
 )
+ from transformers.image_utils import load_image
 from diffusers import DiffusionPipeline
 
+ DESCRIPTION = """
+ # QwQ Edge 💬 with Flux.1
+ """
 
 css = '''
 h1 {
   text-align: center;
   display: block;
 }
+ 
+ #duplicate-button {
+   margin: auto;
+   color: #fff;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
 '''
 
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+ 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+ # --------------------------
+ # Text Generation Components
+ # --------------------------
 
+ # Load text-only model and tokenizer
+ model_id = "prithivMLmods/FastThink-0.5B-Tiny"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+ )
+ model.eval()
 
+ TTS_VOICES = [
+     "en-US-JennyNeural",  # @tts1
+     "en-US-GuyNeural",    # @tts2
+ ]
 
+ # Multimodal model (text+vision)
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to("cuda").eval()
+ 
+ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+     """Convert text to speech using Edge TTS and save as MP3"""
+     communicate = edge_tts.Communicate(text, voice)
+     await communicate.save(output_file)
+     return output_file
+ 
+ def clean_chat_history(chat_history):
+     """
+     Filter out any chat entries whose "content" is not a string.
+     This helps prevent errors when concatenating previous messages.
+     """
+     cleaned = []
+     for msg in chat_history:
+         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
+             cleaned.append(msg)
+     return cleaned
+ 
+ # --------------------------
+ # Flux.1 Image Generation
+ # --------------------------
+ 
+ # Set up the Flux.1 pipeline
 base_model = "black-forest-labs/FLUX.1-dev"
 pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
 lora_repo = "strangerzonehf/Flux-Super-Realism-LoRA"
+ trigger_word = "Super Realism"  # Leave trigger_word blank if not used.
 pipe.load_lora_weights(lora_repo)
 pipe.to("cuda")
 
+ # Define style prompts
 style_list = [
     {
         "name": "3840 x 2160",
 
         "prompt": "{prompt}",
     },
 ]
+ styles = {k["name"]: k["prompt"] for k in style_list}
 DEFAULT_STYLE_NAME = "3840 x 2160"
 STYLE_NAMES = list(styles.keys())
 
 def apply_style(style_name: str, positive: str) -> str:
     return styles.get(style_name, styles[DEFAULT_STYLE_NAME]).replace("{prompt}", positive)
 
+ MAX_SEED = np.iinfo(np.int32).max
+ 
+ def save_image(img: Image.Image) -> str:
+     """Save a PIL image with a unique filename and return the path."""
+     unique_name = str(uuid.uuid4()) + ".png"
+     img.save(unique_name)
+     return unique_name
+ 
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+ 
+ def progress_bar_html(label: str) -> str:
+     """
+     Returns an HTML snippet for a thin progress bar with a label.
+     The progress bar is styled as a dark red animated bar.
+     """
+     return f'''
+ <div style="display: flex; align-items: center;">
+     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+     <div style="width: 110px; height: 5px; background-color: #f0f0f0; border-radius: 2px; overflow: hidden;">
+         <div style="width: 100%; height: 100%; background-color: #ff5900; animation: loading 1.5s linear infinite;"></div>
+     </div>
+ </div>
+ <style>
+ @keyframes loading {{
+     0% {{ transform: translateX(-100%); }}
+     100% {{ transform: translateX(100%); }}
+ }}
+ </style>
+     '''
+ 
+ @spaces.GPU(duration=60, enable_queue=True)
+ def generate_image_fn(
     prompt: str,
     seed: int = 0,
     width: int = 1024,
 
     guidance_scale: float = 3,
     randomize_seed: bool = False,
     style_name: str = DEFAULT_STYLE_NAME,
+     progress=gr.Progress(track_tqdm=True),
 ):
+     """Generate images using the Flux.1 pipeline."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     positive_prompt = apply_style(style_name, prompt)
     if trigger_word:
 
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
+ # --------------------------
+ # Chat and Multimodal Generation
+ # --------------------------
+ 
 @spaces.GPU
+ def generate(
+     input_dict: dict,
+     chat_history: list[dict],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ):
     """
+     Generates chatbot responses with support for multimodal input, TTS, and image generation using Flux.1.
+     Special commands:
+     - "@tts1" or "@tts2": triggers text-to-speech.
+     - "@image": triggers image generation using the Flux.1 pipeline.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+ 
     if text.strip().lower().startswith("@image"):
+         # Remove the "@image" tag and use the rest as prompt
+         prompt_img = text[len("@image"):].strip()
+         # Show animated progress bar for image generation
+         yield progress_bar_html("Generating Image")
+         image_paths, used_seed = generate_image_fn(
+             prompt=prompt_img,
             seed=1,
             width=1024,
             height=1024,
 
             randomize_seed=True,
             style_name=DEFAULT_STYLE_NAME,
         )
+         # Once done, yield the generated image
         yield gr.Image(image_paths[0])
+         return  # Exit early
+ 
+     tts_prefix = "@tts"
+     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
 
+     if is_tts and voice_index:
+         voice = TTS_VOICES[voice_index - 1]
+         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+         # Clear previous chat history for a fresh TTS request.
+         conversation = [{"role": "user", "content": text}]
+     else:
+         voice = None
+         # Remove any stray @tts tags and build the conversation history.
+         text = text.replace(tts_prefix, "").strip()
+         conversation = clean_chat_history(chat_history)
+         conversation.append({"role": "user", "content": text})
+ 
+     if files:
+         if len(files) > 1:
+             images = [load_image(image) for image in files]
+         elif len(files) == 1:
+             images = [load_image(files[0])]
         else:
+             images = []
+         messages = [{
+             "role": "user",
+             "content": [
+                 *[{"type": "image", "image": image} for image in images],
+                 {"type": "text", "text": text},
+             ]
+         }]
+         prompt_multimodal = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(text=[prompt_multimodal], images=images, return_tensors="pt", padding=True).to("cuda")
+         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+         thread.start()
+ 
+         buffer = ""
+         # Show animated progress bar for multimodal generation
+         yield progress_bar_html("Thinking...")
+         for new_text in streamer:
+             buffer += new_text
+             buffer = buffer.replace("<|im_end|>", "")
+             time.sleep(0.01)
+             yield buffer
     else:
+         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+         input_ids = input_ids.to(model.device)
+         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "streamer": streamer,
+             "max_new_tokens": max_new_tokens,
+             "do_sample": True,
+             "top_p": top_p,
+             "top_k": top_k,
+             "temperature": temperature,
+             "num_beams": 1,
+             "repetition_penalty": repetition_penalty,
+         }
+         t = Thread(target=model.generate, kwargs=generation_kwargs)
+         t.start()
+ 
+         outputs = []
+         # Show animated progress bar for text generation
+         yield progress_bar_html("Thinking...")
+         for new_text in streamer:
+             outputs.append(new_text)
+             yield "".join(outputs)
+ 
+         final_response = "".join(outputs)
+         yield final_response
+ 
+         # If TTS was requested, convert the final response to speech.
+         if is_tts and voice:
+             output_file = asyncio.run(text_to_speech(final_response, voice))
+             yield gr.Audio(output_file, autoplay=True)
+ 
+ # --------------------------
+ # Gradio Chat Interface
+ # --------------------------
 
 demo = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+         gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+     ],
+     examples=[
+         ["@image A futuristic cityscape at sunset with vibrant colors"],
+         ["Python Program for Array Rotation"],
+         ["@tts1 Who is Nikola Tesla, and why did he die?"],
+         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
+         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+         ["@tts2 What causes rainbows to form?"],
+     ],
+     cache_examples=False,
+     type="messages",
+     description=DESCRIPTION,
+     css=css,
+     fill_height=True,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1, @tts2-voices, @image-image gen, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
 
 if __name__ == "__main__":
+     demo.queue(max_size=20).launch(share=True)
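
For reference, the prefix routing that the updated generate() applies to incoming queries can be exercised outside Gradio. The sketch below is illustrative only: route_query and speak are hypothetical helper names that are not part of this commit, the Flux.1 and Qwen2-VL branches are reduced to labels, and only the edge_tts calls mirror the ones used in app.py.

# Minimal sketch (not part of the commit): mirrors how generate() in app.py
# dispatches "@image", "@tts1"/"@tts2", and plain text/vision queries.
import asyncio

import edge_tts

TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]  # @tts1, @tts2

async def speak(text: str, voice: str, output_file: str = "output.mp3") -> str:
    # Same Edge TTS calls as text_to_speech() in app.py.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

def route_query(text: str) -> tuple[str, str]:
    """Return (branch, payload) the way generate() picks a branch."""
    stripped = text.strip()
    lowered = stripped.lower()
    if lowered.startswith("@image"):
        return "flux_image", stripped[len("@image"):].strip()
    for i, voice in enumerate(TTS_VOICES, start=1):
        if lowered.startswith(f"@tts{i}"):
            return f"tts:{voice}", stripped[len(f"@tts{i}"):].strip()
    return "text", stripped

if __name__ == "__main__":
    print(route_query("@image A futuristic cityscape at sunset"))  # Flux.1 branch
    print(route_query("Python Program for Array Rotation"))        # text branch
    branch, payload = route_query("@tts2 What causes rainbows to form?")
    if branch.startswith("tts:"):
        asyncio.run(speak(payload, branch.split(":", 1)[1]))

Running the sketch writes output.mp3 for the @tts2 example, matching how the @tts branch of the Space converts its final text response to speech.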