Spaces:

prithivMLmods
/

FLUX-REALISM

Running on Zero

App Files Community

prithivMLmods committed on Mar 6

Commit

ad4e69f

verified ·

1 Parent(s): fe6cb74

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -4

app.py CHANGED Viewed

@@ -38,7 +38,6 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
         seed = random.randint(0, MAX_SEED)
     return seed
-# Load Flux.1 pipeline and LoRA weights
 from diffusers import DiffusionPipeline
 base_model = "black-forest-labs/FLUX.1-dev"
@@ -106,7 +105,7 @@ def generate_image_flux(
 # SMOLVLM2 SETUP (Default Text/Multimodal Model)
 # -------------------------------
 from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
-# Load the SmolVLM2 processor and model with flash attention enabled.
 smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
 smol_model = AutoModelForImageTextToText.from_pretrained(
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
@@ -136,7 +135,6 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-# TTS voices (if using TTS commands)
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
@@ -193,7 +191,7 @@ def generate(
             voice = TTS_VOICES[voice_index - 1]
             text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-    # Now use SmolVLM2 for chat/multimodal text generation.
     yield "Processing with SmolVLM2"
     # Build conversation messages based on input and history.
@@ -270,6 +268,9 @@ def generate(
         return_dict=True,
         return_tensors="pt",
     )
     inputs = inputs.to(smol_model.device)
     streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)

         seed = random.randint(0, MAX_SEED)
     return seed
 from diffusers import DiffusionPipeline
 base_model = "black-forest-labs/FLUX.1-dev"
 # SMOLVLM2 SETUP (Default Text/Multimodal Model)
 # -------------------------------
 from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
 smol_processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
 smol_model = AutoModelForImageTextToText.from_pretrained(
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
 </style>
     '''
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
             voice = TTS_VOICES[voice_index - 1]
             text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+    # Use SmolVLM2 for chat/multimodal text generation.
     yield "Processing with SmolVLM2"
     # Build conversation messages based on input and history.
         return_dict=True,
         return_tensors="pt",
     )
+    # Explicitly cast pixel values to bfloat16 to match model weights.
+    if "pixel_values" in inputs:
+        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
     inputs = inputs.to(smol_model.device)
     streamer = TextIteratorStreamer(smol_processor, skip_prompt=True, skip_special_tokens=True)