Spaces: Running on Zero
Update app.py
Browse files
app.py CHANGED
@@ -38,7 +38,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
|
|
38 |
seed = random.randint(0, MAX_SEED)
|
39 |
return seed
|
40 |
|
41 |
-
# Load Flux.1 pipeline and LoRA weights
|
42 |
from diffusers import DiffusionPipeline
|
43 |
|
44 |
base_model = "black-forest-labs/FLUX.1-dev"
|
@@ -106,7 +105,7 @@ def generate_image_flux(
|
|
106 |
# SMOLVLM2 SETUP (Default Text/Multimodal Model)
|
107 |
# -------------------------------
|
108 |
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
|
109 |
-
|
110 |
smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
|
111 |
smol_model = AutoModelForImageTextToText.from_pretrained(
|
112 |
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
@@ -136,7 +135,6 @@ def progress_bar_html(label: str) -> str:
|
|
136 |
</style>
|
137 |
'''
|
138 |
|
139 |
-
# TTS voices (if using TTS commands)
|
140 |
TTS_VOICES = [
|
141 |
"en-US-JennyNeural", # @tts1
|
142 |
"en-US-GuyNeural", # @tts2
|
@@ -193,7 +191,7 @@ def generate(
|
|
193 |
voice = TTS_VOICES[voice_index - 1]
|
194 |
text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
|
195 |
|
196 |
-
#
|
197 |
yield "Processing with SmolVLM2"
|
198 |
|
199 |
# Build conversation messages based on input and history.
|
@@ -270,6 +268,9 @@ def generate(
|
|
270 |
return_dict=True,
|
271 |
return_tensors="pt",
|
272 |
)
|
|
|
|
|
|
|
273 |
inputs = inputs.to(smol_model.device)
|
274 |
|
275 |
streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)
|
|
|
38 |
seed = random.randint(0, MAX_SEED)
|
39 |
return seed
|
40 |
|
|
|
41 |
from diffusers import DiffusionPipeline
|
42 |
|
43 |
base_model = "black-forest-labs/FLUX.1-dev"
|
|
|
105 |
# SMOLVLM2 SETUP (Default Text/Multimodal Model)
|
106 |
# -------------------------------
|
107 |
from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
|
108 |
+
|
109 |
smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
|
110 |
smol_model = AutoModelForImageTextToText.from_pretrained(
|
111 |
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
|
|
|
135 |
</style>
|
136 |
'''
|
137 |
|
|
|
138 |
TTS_VOICES = [
|
139 |
"en-US-JennyNeural", # @tts1
|
140 |
"en-US-GuyNeural", # @tts2
|
|
|
191 |
voice = TTS_VOICES[voice_index - 1]
|
192 |
text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
|
193 |
|
194 |
+
# Use SmolVLM2 for chat/multimodal text generation.
|
195 |
yield "Processing with SmolVLM2"
|
196 |
|
197 |
# Build conversation messages based on input and history.
|
|
|
268 |
return_dict=True,
|
269 |
return_tensors="pt",
|
270 |
)
|
271 |
+
# Explicitly cast pixel values to bfloat16 to match model weights.
|
272 |
+
if "pixel_values" in inputs:
|
273 |
+
inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
|
274 |
inputs = inputs.to(smol_model.device)
|
275 |
|
276 |
streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)
|