Spaces:

prithivMLmods
/

FLUX-LoRA-DLC2

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 15

Commit

2d99b82

verified ·

1 Parent(s): eed6cef

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -155

app.py CHANGED Viewed

@@ -12,7 +12,6 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image
-import edge_tts
 import cv2
 from transformers import (
@@ -24,7 +23,6 @@ from transformers import (
     Gemma3ForConditionalGeneration,
 )
 from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 # Constants
 MAX_MAX_NEW_TOKENS = 2048
@@ -51,7 +49,7 @@ def progress_bar_html(label: str) -> str:
 </style>
     '''
-# TEXT & TTS MODELS
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -62,11 +60,6 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-TTS_VOICES = [
-    "en-US-JennyNeural",  # @tts1
-    "en-US-GuyNeural",    # @tts2
-]
 # MULTIMODAL (OCR) MODELS
 MODEL_ID_VL = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
@@ -77,11 +70,6 @@ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     torch_dtype=torch.float16
 ).to("cuda").eval()
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
 def clean_chat_history(chat_history):
     cleaned = []
     for msg in chat_history:
@@ -114,46 +102,9 @@ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
 dtype = torch.float16 if device.type == "cuda" else torch.float32
-# STABLE DIFFUSION IMAGE GENERATION MODEL (Lightning 5 only)
-if torch.cuda.is_available():
-    pipe = StableDiffusionXLPipeline.from_pretrained(
-        "SG161222/RealVisXL_V5.0_Lightning",
-        torch_dtype=dtype,
-        use_safetensors=True,
-        add_watermarker=False
-    ).to(device)
-    pipe.text_encoder = pipe.text_encoder.half()
-    if ENABLE_CPU_OFFLOAD:
-        pipe.enable_model_cpu_offload()
-    else:
-        pipe.to(device)
-        print("Loaded RealVisXL_V5.0_Lightning on Device!")
-    if USE_TORCH_COMPILE:
-        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-        print("Model RealVisXL_V5.0_Lightning Compiled!")
-else:
-    pipe = StableDiffusionXLPipeline.from_pretrained(
-        "SG161222/RealVisXL_V5.0_Lightning",
-        torch_dtype=dtype,
-        use_safetensors=True,
-        add_watermarker=False
-    ).to(device)
-    print("Running on CPU; model loaded in float32.")
-DEFAULT_MODEL = "Lightning 5"
-models = {
-    "Lightning 5": pipe
-}
-def save_image(img: Image.Image) -> str:
-    unique_name = str(uuid.uuid4()) + ".png"
-    img.save(unique_name)
-    return unique_name
 # GEMMA3-4B MULTIMODAL MODEL
-gemma3_model_id = "google/gemma-3-4b-it" #alter google/gemma-3-12b-it
 gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
     gemma3_model_id, device_map="auto"
 ).eval()
@@ -196,91 +147,51 @@ def generate(
     lower_text = text.lower().strip()
-    # IMAGE GENERATION BRANCH (Stable Diffusion model using @lightningv5)
-    if lower_text.startswith("@lightningv5"):
-        # Remove the model flag from the prompt.
-        prompt_clean = re.sub(r"@lightningv5", "", text, flags=re.IGNORECASE).strip().strip('"')
-        # Default parameters for single image generation.
-        width = 1024
-        height = 1024
-        guidance_scale = 6.0
-        seed_val = 0
-        randomize_seed_flag = True
-        seed_val = int(randomize_seed_fn(seed_val, randomize_seed_flag))
-        generator = torch.Generator(device=device).manual_seed(seed_val)
-        options = {
-            "prompt": prompt_clean,
-            "negative_prompt": default_negative,
-            "width": width,
-            "height": height,
-            "guidance_scale": guidance_scale,
-            "num_inference_steps": 30,
-            "generator": generator,
-            "num_images_per_prompt": 1,
-            "use_resolution_binning": True,
-            "output_type": "pil",
-        }
-        if device.type == "cuda":
-            torch.cuda.empty_cache()
-        yield progress_bar_html("Processing Image Generation")
-        images = models["Lightning 5"](**options).images
-        image_path = save_image(images[0])
-        yield gr.Image(image_path)
-        return
     # GEMMA3-4B TEXT & MULTIMODAL (image) Branch
     if lower_text.startswith("@gemma3"):
-        # If it is video, let the dedicated branch handle it.
-        if lower_text.startswith("@video-infer"):
-            pass  # video branch is handled below.
-        else:
-            # Remove the gemma3 flag from the prompt.
-            prompt_clean = re.sub(r"@gemma3", "", text, flags=re.IGNORECASE).strip().strip('"')
-            if files:
-                # If image files are provided, load them.
-                images = [load_image(f) for f in files]
-                messages = [{
-                    "role": "user",
-                    "content": [
-                        *[{"type": "image", "image": image} for image in images],
-                        {"type": "text", "text": prompt_clean},
-                    ]
-                }]
-            else:
-                messages = [
-                    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
-                    {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
                 ]
-            inputs = gemma3_processor.apply_chat_template(
-                messages, add_generation_prompt=True, tokenize=True,
-                return_dict=True, return_tensors="pt"
-            ).to(gemma3_model.device, dtype=torch.bfloat16)
-            streamer = TextIteratorStreamer(
-                gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-            )
-            generation_kwargs = {
-                **inputs,
-                "streamer": streamer,
-                "max_new_tokens": max_new_tokens,
-                "do_sample": True,
-                "temperature": temperature,
-                "top_p": top_p,
-                "top_k": top_k,
-                "repetition_penalty": repetition_penalty,
-            }
-            thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
-            thread.start()
-            buffer = ""
-            yield progress_bar_html("Processing with Gemma3")
-            for new_text in streamer:
-                buffer += new_text
-                time.sleep(0.01)
-                yield buffer
-            return
     # GEMMA3-4B VIDEO Branch
     if lower_text.startswith("@video-infer"):
@@ -333,20 +244,9 @@ def generate(
             yield buffer
         return
-    # Otherwise, handle text/chat (and TTS) generation.
-    tts_prefix = "@tts"
-    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
-    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-    if is_tts and voice_index:
-        voice = TTS_VOICES[voice_index - 1]
-        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        conversation = [{"role": "user", "content": text}]
-    else:
-        voice = None
-        text = text.replace(tts_prefix, "").strip()
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
     if files:
         images = [load_image(image) for image in files] if len(files) > 1 else [load_image(files[0])]
@@ -400,10 +300,6 @@ def generate(
         final_response = "".join(outputs)
         yield final_response
-        if is_tts and voice:
-            output_file = asyncio.run(text_to_speech(final_response, voice))
-            yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -422,16 +318,13 @@ demo = gr.ChatInterface(
         [{"text": "@video-infer Explain what is happening in this video ?", "files": ["examples/oreo.mp4"]}],
         [{"text": "@video-infer Summarize the events in this video", "files": ["examples/sky.mp4"]}],
         [{"text": "@video-infer What is in the video ?", "files": ["examples/redlight.mp4"]}],
-        ['@lightningv5 Chocolate dripping from a donut'],
         ["Python Program for Array Rotation"],
-        ["@tts1 Who is Nikola Tesla, and why did he die?"],
-        ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",
     description="# **Gemma 3 `@gemma3, @video-infer for video understanding`**",
     fill_height=True,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="@gemma3 for multimodal, @video-infer for video, @lightningv5 for image gen !"),
     stop_btn="Stop Generation",
     multimodal=True,
 )

 import torch
 import numpy as np
 from PIL import Image
 import cv2
 from transformers import (
     Gemma3ForConditionalGeneration,
 )
 from transformers.image_utils import load_image
 # Constants
 MAX_MAX_NEW_TOKENS = 2048
 </style>
     '''
+# TEXT MODEL
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 )
 model.eval()
 # MULTIMODAL (OCR) MODELS
 MODEL_ID_VL = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
     torch_dtype=torch.float16
 ).to("cuda").eval()
 def clean_chat_history(chat_history):
     cleaned = []
     for msg in chat_history:
 dtype = torch.float16 if device.type == "cuda" else torch.float32
 # GEMMA3-4B MULTIMODAL MODEL
+gemma3_model_id = "google/gemma-3-4b-it"  # alternative: google/gemma-3-12b-it
 gemma3_model = Gemma3ForConditionalGeneration.from_pretrained(
     gemma3_model_id, device_map="auto"
 ).eval()
     lower_text = text.lower().strip()
     # GEMMA3-4B TEXT & MULTIMODAL (image) Branch
     if lower_text.startswith("@gemma3"):
+        # Remove the gemma3 flag from the prompt.
+        prompt_clean = re.sub(r"@gemma3", "", text, flags=re.IGNORECASE).strip().strip('"')
+        if files:
+            # If image files are provided, load them.
+            images = [load_image(f) for f in files]
+            messages = [{
+                "role": "user",
+                "content": [
+                    *[{"type": "image", "image": image} for image in images],
+                    {"type": "text", "text": prompt_clean},
                 ]
+            }]
+        else:
+            messages = [
+                {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+                {"role": "user", "content": [{"type": "text", "text": prompt_clean}]}
+            ]
+        inputs = gemma3_processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True,
+            return_dict=True, return_tensors="pt"
+        ).to(gemma3_model.device, dtype=torch.bfloat16)
+        streamer = TextIteratorStreamer(
+            gemma3_processor.tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+        )
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        thread = Thread(target=gemma3_model.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing with Gemma3")
+        for new_text in streamer:
+            buffer += new_text
+            time.sleep(0.01)
+            yield buffer
+        return
     # GEMMA3-4B VIDEO Branch
     if lower_text.startswith("@video-infer"):
             yield buffer
         return
+    # Otherwise, handle text/chat generation.
+    conversation = clean_chat_history(chat_history)
+    conversation.append({"role": "user", "content": text})
     if files:
         images = [load_image(image) for image in files] if len(files) > 1 else [load_image(files[0])]
         final_response = "".join(outputs)
         yield final_response
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         [{"text": "@video-infer Explain what is happening in this video ?", "files": ["examples/oreo.mp4"]}],
         [{"text": "@video-infer Summarize the events in this video", "files": ["examples/sky.mp4"]}],
         [{"text": "@video-infer What is in the video ?", "files": ["examples/redlight.mp4"]}],
         ["Python Program for Array Rotation"],
     ],
     cache_examples=False,
     type="messages",
     description="# **Gemma 3 `@gemma3, @video-infer for video understanding`**",
     fill_height=True,
+    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple", placeholder="@gemma3 for multimodal, @video-infer for video !"),
     stop_btn="Stop Generation",
     multimodal=True,
 )