salma-remyx committed
Commit 707a904 · 1 Parent(s): de240ef

update inputs

Files changed (1)
  1. app.py +17 -68
app.py CHANGED
@@ -22,9 +22,7 @@ def load_model():
     return model, processor
 
 def process_image(image_path_or_obj):
-    """Loads, resizes, and preprocesses an image path or Pillow Image."""
     if isinstance(image_path_or_obj, str):
-        # Path on disk or from history
         image = Image.open(image_path_or_obj).convert("RGB")
     elif isinstance(image_path_or_obj, Image.Image):
         image = image_path_or_obj.convert("RGB")
@@ -36,45 +34,24 @@ def process_image(image_path_or_obj):
     aspect_ratio = image.height / image.width
     new_height = int(max_width * aspect_ratio)
     image = image.resize((max_width, new_height), Image.Resampling.LANCZOS)
-    print(f"Resized image to: {max_width}x{new_height}")
     return image
 
 def get_latest_image(history):
-    """
-    Look from the end to find the last user-uploaded image (stored as (file_path,) ).
-    Return None if not found.
-    """
-    for user_msg, _assistant_msg in reversed(history):
-        if isinstance(user_msg, tuple) and len(user_msg) > 0:
-            return user_msg[0]
+    for item in reversed(history):
+        if item["role"] == "user" and isinstance(item["content"], tuple):
+            return item["content"][0]
     return None
 
 def only_assistant_text(full_text: str) -> str:
-    """
-    Helper to strip out any lines containing 'system', 'user', etc.,
-    and return only the final assistant answer.
-    Adjust this parsing if your model's output format differs.
-    """
-    # Example output might look like:
-    # system
-    # ...
-    # user
-    # ...
-    # assistant
-    # The final answer
-    #
-    # We'll just split on 'assistant' and return everything after it.
     if "assistant" in full_text:
         parts = full_text.split("assistant", 1)
         result = parts[-1].strip()
-        # Remove any leading punctuation (like a colon)
         result = result.lstrip(":").strip()
         return result
     return full_text.strip()
 
 def run_inference(image, prompt):
     model, processor = load_model()
-    """Runs Qwen2.5-VL inference on a single image and text prompt."""
     system_msg = (
         "You are VL-Thinking 🤔, a helpful assistant with excellent reasoning ability. "
         "You should first think about the reasoning process and then provide the answer. "
@@ -100,100 +77,73 @@ def run_inference(image, prompt):
     inputs = processor(text=[text_input], images=[image], return_tensors="pt").to(model.device)
     generated_ids = model.generate(**inputs, max_new_tokens=1024)
     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    # Parse out only the final assistant text
     return only_assistant_text(output_text)
 
 def add_message(history, user_input):
-    """
-    Step 1 (triggered by user's 'Submit' or 'Send'):
-      - Save new text or images into `history`.
-      - The Chatbot display uses pairs: [user_text_or_image, assistant_reply].
-    """
     if not isinstance(history, list):
         history = []
 
     files = user_input.get("files", [])
     text = user_input.get("text", "")
 
-    # Store images
     for f in files:
-        # Each image is stored as `[(file_path,), None]`
-        history.append([(f,), None])
+        history.append({"role": "user", "content": (f,)})
 
-    # Store text
     if text:
-        history.append([text, None])
+        history.append({"role": "user", "content": text})
 
     return history, gr.MultimodalTextbox(value=None)
 
 def inference_interface(history):
-    """
-    Step 2: Use the most recent text + the most recent image to run Qwen2.5-VL.
-    Instead of adding another entry, we fill the assistant's answer into
-    the last user text entry.
-    """
     if not history:
         return history, gr.MultimodalTextbox(value=None)
 
-    # 1) Get the user's most recent text
     user_text = ""
-    # We'll search from the end for the first str we find
+    user_idx = -1
     for idx in range(len(history) - 1, -1, -1):
-        user_msg, assistant_msg = history[idx]
-        if isinstance(user_msg, str):
-            user_text = user_msg
-            # We'll also keep track of this index so we can fill in the assistant reply
+        msg = history[idx]
+        if msg["role"] == "user" and isinstance(msg["content"], str):
+            user_text = msg["content"]
             user_idx = idx
             break
-    else:
-        # No user text found
-        print("No user text found in history. Skipping inference.")
+
+    if user_idx == -1:
         return history, gr.MultimodalTextbox(value=None)
 
-    # 2) Get the latest image from the entire conversation
     latest_image = get_latest_image(history)
     if not latest_image:
-        # No image found => can't run the model
-        print("No image found in history. Skipping inference.")
        return history, gr.MultimodalTextbox(value=None)
 
-    # 3) Process the image
     pil_image = process_image(latest_image)
-
-    # 4) Run inference
     assistant_reply = run_inference(pil_image, user_text)
 
-    # 5) Fill that assistant reply back into the last user text entry
-    history[user_idx][1] = assistant_reply
+    history.append({"role": "assistant", "content": assistant_reply})
     return history, gr.MultimodalTextbox(value=None)
 
 def build_demo():
     with gr.Blocks() as demo:
         gr.Markdown("# SpaceThinker-Qwen2.5VL-3B Image Prompt Chatbot")
 
-        chatbot = gr.Chatbot([], line_breaks=True)
+        chatbot = gr.Chatbot([], type="messages", line_breaks=True)
+
         chat_input = gr.MultimodalTextbox(
             interactive=True,
             file_types=["image"],
             placeholder="Enter text and upload an image.",
-            show_label=True,
-            preprocess=False  # 👈 prevent gradio from parsing input prematurely
+            show_label=True
         )
 
-        # When the user presses Enter in the MultimodalTextbox:
         submit_event = chat_input.submit(
-            fn=add_message,  # Step 1: store user data
+            fn=add_message,
             inputs=[chatbot, chat_input],
             outputs=[chatbot, chat_input]
         )
-        # After storing, run inference
         submit_event.then(
-            fn=inference_interface,  # Step 2: run Qwen2.5-VL
+            fn=inference_interface,
             inputs=[chatbot],
             outputs=[chatbot, chat_input]
         )
 
-        # Same logic for a "Send" button
        with gr.Row():
            send_button = gr.Button("Send")
            clear_button = gr.ClearButton([chatbot, chat_input])
@@ -209,7 +159,6 @@ def build_demo():
             outputs=[chatbot, chat_input]
         )
 
-        # Example
         gr.Examples(
             examples=[
                 {
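The change above moves the chat history from Gradio's legacy tuple-pair format to the messages format (`type="messages"` on `gr.Chatbot`): each entry is a `{"role": ..., "content": ...}` dict, image uploads are stored as a one-element tuple holding the file path, and the model's reply is appended as its own assistant message instead of filling the second slot of a pair. Below is a minimal, standalone sketch of those conventions, assuming no Gradio or model is loaded; `add_message` is simplified (it drops the `MultimodalTextbox` reset in its return value), the `latest_user_text` helper is illustrative rather than part of app.py, and the file name and prompt are placeholder values.

import json

def add_message(history, user_input):
    # user_input mirrors the MultimodalTextbox payload: {"text": str, "files": [paths]}
    for f in user_input.get("files", []):
        history.append({"role": "user", "content": (f,)})   # image upload as a 1-tuple of the path
    if user_input.get("text", ""):
        history.append({"role": "user", "content": user_input["text"]})  # plain text prompt
    return history

def get_latest_image(history):
    # Newest user-uploaded image, i.e. the last user entry whose content is a tuple.
    for item in reversed(history):
        if item["role"] == "user" and isinstance(item["content"], tuple):
            return item["content"][0]
    return None

def latest_user_text(history):
    # Newest plain-text user message (the lookup inference_interface performs inline).
    for item in reversed(history):
        if item["role"] == "user" and isinstance(item["content"], str):
            return item["content"]
    return None

history = add_message([], {"text": "How many boxes are in the scene?", "files": ["photo.jpg"]})
print(get_latest_image(history))   # -> photo.jpg
print(latest_user_text(history))   # -> How many boxes are in the scene?

# The reply from run_inference is appended as a separate assistant message:
history.append({"role": "assistant", "content": "There are three boxes."})
print(json.dumps(history, indent=2, default=str))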