prithivMLmods committed on
Commit
ad4e69f
·
verified ·
1 Parent(s): fe6cb74

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -4
app.py CHANGED
@@ -38,7 +38,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
38
  seed = random.randint(0, MAX_SEED)
39
  return seed
40
 
41
- # Load Flux.1 pipeline and LoRA weights
42
  from diffusers import DiffusionPipeline
43
 
44
  base_model = "black-forest-labs/FLUX.1-dev"
@@ -106,7 +105,7 @@ def generate_image_flux(
106
  # SMOLVLM2 SETUP (Default Text/Multimodal Model)
107
  # -------------------------------
108
  from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
109
- # Load the SmolVLM2 processor and model with flash attention enabled.
110
  smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
111
  smol_model = AutoModelForImageTextToText.from_pretrained(
112
  "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
@@ -136,7 +135,6 @@ def progress_bar_html(label: str) -> str:
136
  </style>
137
  '''
138
 
139
- # TTS voices (if using TTS commands)
140
  TTS_VOICES = [
141
  "en-US-JennyNeural", # @tts1
142
  "en-US-GuyNeural", # @tts2
@@ -193,7 +191,7 @@ def generate(
193
  voice = TTS_VOICES[voice_index - 1]
194
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
195
 
196
- # Now use SmolVLM2 for chat/multimodal text generation.
197
  yield "Processing with SmolVLM2"
198
 
199
  # Build conversation messages based on input and history.
@@ -270,6 +268,9 @@ def generate(
270
  return_dict=True,
271
  return_tensors="pt",
272
  )
 
 
 
273
  inputs = inputs.to(smol_model.device)
274
 
275
  streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)
 
38
  seed = random.randint(0, MAX_SEED)
39
  return seed
40
 
 
41
  from diffusers import DiffusionPipeline
42
 
43
  base_model = "black-forest-labs/FLUX.1-dev"
 
105
  # SMOLVLM2 SETUP (Default Text/Multimodal Model)
106
  # -------------------------------
107
  from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
108
+
109
  smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
110
  smol_model = AutoModelForImageTextToText.from_pretrained(
111
  "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
 
135
  </style>
136
  '''
137
 
 
138
  TTS_VOICES = [
139
  "en-US-JennyNeural", # @tts1
140
  "en-US-GuyNeural", # @tts2
 
191
  voice = TTS_VOICES[voice_index - 1]
192
  text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
193
 
194
+ # Use SmolVLM2 for chat/multimodal text generation.
195
  yield "Processing with SmolVLM2"
196
 
197
  # Build conversation messages based on input and history.
 
268
  return_dict=True,
269
  return_tensors="pt",
270
  )
271
+ # Explicitly cast pixel values to bfloat16 to match model weights.
272
+ if "pixel_values" in inputs:
273
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
274
  inputs = inputs.to(smol_model.device)
275
 
276
  streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)