prithivMLmods committed (verified)
Commit 65928b6 · Parent(s): 8a2ba41

Update app.py

Files changed (1): app.py (+239 -173)
app.py CHANGED
@@ -4,10 +4,7 @@ import uuid
 import json
 import time
 import asyncio
- import re
 from threading import Thread
- from io import BytesIO
- import subprocess
 
 import gradio as gr
 import spaces
@@ -16,57 +13,98 @@ import numpy as np
 from PIL import Image
 import edge_tts
 
- # Install flash-attn without building CUDA kernels (if needed)
- subprocess.run(
-     'pip install flash-attn --no-build-isolation',
-     env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
-     shell=True
 )
-
- from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
 from diffusers import DiffusionPipeline
 
- # ------------------------------------------------------------------------------
- # Global Configurations
- # ------------------------------------------------------------------------------
- DESCRIPTION = "# SmolVLM2 with Flux.1 Integration 📺"
- if not torch.cuda.is_available():
-     DESCRIPTION += "\n<p>⚠️Running on CPU, This may not work on CPU.</p>"
 
 css = '''
 h1 {
   text-align: center;
   display: block;
 }
 '''
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
- # ------------------------------------------------------------------------------
- # FLUX.1 IMAGE GENERATION SETUP
- # ------------------------------------------------------------------------------
- MAX_SEED = np.iinfo(np.int32).max
 
- def save_image(img: Image.Image) -> str:
-     """Save a PIL image with a unique filename and return the path."""
-     unique_name = str(uuid.uuid4()) + ".png"
-     img.save(unique_name)
-     return unique_name
 
- def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-     return seed
 
- # Initialize Flux.1 pipeline
 base_model = "black-forest-labs/FLUX.1-dev"
 pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
 lora_repo = "strangerzonehf/Flux-Super-Realism-LoRA"
- trigger_word = "Super Realism"  # Leave blank if no trigger word is needed.
 pipe.load_lora_weights(lora_repo)
 pipe.to("cuda")
 
- # Define style prompts for Flux.1
 style_list = [
     {
         "name": "3840 x 2160",
@@ -85,14 +123,48 @@ style_list = [
         "prompt": "{prompt}",
     },
 ]
- styles = {s["name"]: s["prompt"] for s in style_list}
 DEFAULT_STYLE_NAME = "3840 x 2160"
 STYLE_NAMES = list(styles.keys())
 
 def apply_style(style_name: str, positive: str) -> str:
     return styles.get(style_name, styles[DEFAULT_STYLE_NAME]).replace("{prompt}", positive)
 
- def generate_image_flux(
     prompt: str,
     seed: int = 0,
     width: int = 1024,
@@ -100,8 +172,9 @@ def generate_image_flux(
     guidance_scale: float = 3,
     randomize_seed: bool = False,
     style_name: str = DEFAULT_STYLE_NAME,
 ):
-     """Generate an image using the Flux.1 pipeline with style prompts."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     positive_prompt = apply_style(style_name, prompt)
     if trigger_word:
@@ -118,38 +191,36 @@ def generate_image_flux(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
- # ------------------------------------------------------------------------------
- # SMOLVLM2 MODEL SETUP
- # ------------------------------------------------------------------------------
- processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
- model = AutoModelForImageTextToText.from_pretrained(
-     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-     _attn_implementation="flash_attention_2",
-     torch_dtype=torch.bfloat16
- ).to("cuda:0")
-
- # ------------------------------------------------------------------------------
- # CHAT / INFERENCE FUNCTION
- # ------------------------------------------------------------------------------
 @spaces.GPU
- def model_inference(input_dict, history, max_tokens):
     """
-     Implements a chat interface using SmolVLM2.
-
-     Special behavior:
-     - If the query text starts with "@image", the Flux.1 pipeline is used to generate an image.
-     - Otherwise, the query is processed with SmolVLM2.
-     - In the SmolVLM2 branch, a progress message "Processing with SmolVLM2..." is yielded.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-
-     # If the text begins with "@image", use Flux.1 image generation.
     if text.strip().lower().startswith("@image"):
-         prompt = text[len("@image"):].strip()
-         yield "Hold Tight Generating Flux.1 Image..."
-         image_paths, used_seed = generate_image_flux(
-             prompt=prompt,
             seed=1,
             width=1024,
             height=1024,
@@ -157,126 +228,121 @@ def model_inference(input_dict, history, max_tokens):
             randomize_seed=True,
             style_name=DEFAULT_STYLE_NAME,
         )
         yield gr.Image(image_paths[0])
-         return
-
-     # Default: Use SmolVLM2 inference.
-     yield "Processing with SmolVLM2..."
-
-     user_content = []
-     media_queue = []
 
-     # If no conversation history, process current input.
-     if not history:
-         text = text.strip()
-         for file in files:
-             if file.endswith((".png", ".jpg", ".jpeg", ".gif", ".bmp")):
-                 media_queue.append({"type": "image", "path": file})
-             elif file.endswith((".mp4", ".mov", ".avi", ".mkv", ".flv")):
-                 media_queue.append({"type": "video", "path": file})
-         if "<image>" in text or "<video>" in text:
-             parts = re.split(r'(<image>|<video>)', text)
-             for part in parts:
-                 if part == "<image>" and media_queue:
-                     user_content.append(media_queue.pop(0))
-                 elif part == "<video>" and media_queue:
-                     user_content.append(media_queue.pop(0))
-                 elif part.strip():
-                     user_content.append({"type": "text", "text": part.strip()})
         else:
-             user_content.append({"type": "text", "text": text})
-             for media in media_queue:
-                 user_content.append(media)
-         resulting_messages = [{"role": "user", "content": user_content}]
     else:
-         resulting_messages = []
-         user_content = []
-         media_queue = []
-         for hist in history:
-             if hist["role"] == "user" and isinstance(hist["content"], tuple):
-                 file_name = hist["content"][0]
-                 if file_name.endswith((".png", ".jpg", ".jpeg")):
-                     media_queue.append({"type": "image", "path": file_name})
-                 elif file_name.endswith(".mp4"):
-                     media_queue.append({"type": "video", "path": file_name})
-         for hist in history:
-             if hist["role"] == "user" and isinstance(hist["content"], str):
-                 text = hist["content"]
-                 parts = re.split(r'(<image>|<video>)', text)
-                 for part in parts:
-                     if part == "<image>" and media_queue:
-                         user_content.append(media_queue.pop(0))
-                     elif part == "<video>" and media_queue:
-                         user_content.append(media_queue.pop(0))
-                     elif part.strip():
-                         user_content.append({"type": "text", "text": part.strip()})
-             elif hist["role"] == "assistant":
-                 resulting_messages.append({
-                     "role": "user",
-                     "content": user_content
-                 })
-                 resulting_messages.append({
-                     "role": "assistant",
-                     "content": [{"type": "text", "text": hist["content"]}]
-                 })
-                 user_content = []
-         if user_content:
-             resulting_messages.append({"role": "user", "content": user_content})
-
-     if text == "" and not files:
-         yield gr.Error("Please input a query and optionally image(s).")
-         return
-     if text == "" and files:
-         yield gr.Error("Please input a text query along with the image(s).")
-         return
-
-     print("resulting_messages", resulting_messages)
-     inputs = processor.apply_chat_template(
-         resulting_messages,
-         add_generation_prompt=True,
-         tokenize=True,
-         return_dict=True,
-         return_tensors="pt",
-     )
-     inputs = inputs.to(model.device)
-
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_args = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
-
-     thread = Thread(target=model.generate, kwargs=generation_args)
-     thread.start()
-
-     buffer = ""
-     for new_text in streamer:
-         buffer += new_text
-         time.sleep(0.01)
-         yield buffer
-
- # ------------------------------------------------------------------------------
- # GRADIO CHAT INTERFACE
- # ------------------------------------------------------------------------------
- examples = [
-     [{"text": "Where do the severe droughts happen according to this diagram?", "files": ["example_images/examples_weather_events.png"]}],
-     [{"text": "What art era does this artpiece <image> and this artpiece <image> belong to?", "files": ["example_images/rococo.jpg", "example_images/rococo_1.jpg"]}],
-     [{"text": "Describe this image.", "files": ["example_images/mosque.jpg"]}],
-     [{"text": "When was this purchase made and how much did it cost?", "files": ["example_images/fiche.jpg"]}],
-     [{"text": "What is the date in this document?", "files": ["example_images/document.jpg"]}],
-     [{"text": "What is happening in the video?", "files": ["example_images/short.mp4"]}],
-     [{"text": "@image A futuristic cityscape with vibrant neon lights"}],
- ]
 
 demo = gr.ChatInterface(
-     fn=model_inference,
-     title="SmolVLM2 with Flux.1 Integration 📺",
-     description="Play with SmolVLM2 (HuggingFaceTB/SmolVLM2-2.2B-Instruct) with integrated Flux.1 image generation. Use the '@image' prefix to generate images with Flux.1.",
-     examples=examples,
-     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", ".mp4"], file_count="multiple"),
     stop_btn="Stop Generation",
     multimodal=True,
-     cache_examples=False,
-     additional_inputs=[gr.Slider(minimum=100, maximum=500, step=50, value=200, label="Max Tokens")],
-     type="messages"
 )
 
 if __name__ == "__main__":
-     demo.launch(debug=True)
 
 import json
 import time
 import asyncio
 from threading import Thread
 
 import gradio as gr
 import spaces
 
 from PIL import Image
 import edge_tts
 
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     Qwen2VLForConditionalGeneration,
+     AutoProcessor,
 )
+ from transformers.image_utils import load_image
 from diffusers import DiffusionPipeline
 
+ DESCRIPTION = """
+ # QwQ Edge 💬 with Flux.1
+ """
 
 css = '''
 h1 {
   text-align: center;
   display: block;
 }
+ 
+ #duplicate-button {
+   margin: auto;
+   color: #fff;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
 '''
 
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+ 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
+ # --------------------------
+ # Text Generation Components
+ # --------------------------
 
+ # Load text-only model and tokenizer
+ model_id = "prithivMLmods/FastThink-0.5B-Tiny"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="auto",
+     torch_dtype=torch.bfloat16,
+ )
+ model.eval()
 
+ TTS_VOICES = [
+     "en-US-JennyNeural",  # @tts1
+     "en-US-GuyNeural",    # @tts2
+ ]
 
+ # Multimodal model (text+vision)
+ MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ model_m = Qwen2VLForConditionalGeneration.from_pretrained(
+     MODEL_ID,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to("cuda").eval()
+ 
+ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
+     """Convert text to speech using Edge TTS and save as MP3"""
+     communicate = edge_tts.Communicate(text, voice)
+     await communicate.save(output_file)
+     return output_file
+ 
+ def clean_chat_history(chat_history):
+     """
+     Filter out any chat entries whose "content" is not a string.
+     This helps prevent errors when concatenating previous messages.
+     """
+     cleaned = []
+     for msg in chat_history:
+         if isinstance(msg, dict) and isinstance(msg.get("content"), str):
+             cleaned.append(msg)
+     return cleaned
+ 
+ # --------------------------
+ # Flux.1 Image Generation
+ # --------------------------
+ 
+ # Set up the Flux.1 pipeline
 base_model = "black-forest-labs/FLUX.1-dev"
 pipe = DiffusionPipeline.from_pretrained(base_model, torch_dtype=torch.bfloat16)
 lora_repo = "strangerzonehf/Flux-Super-Realism-LoRA"
+ trigger_word = "Super Realism"  # Leave trigger_word blank if not used.
 pipe.load_lora_weights(lora_repo)
 pipe.to("cuda")
 
+ # Define style prompts
 style_list = [
     {
         "name": "3840 x 2160",
 
         "prompt": "{prompt}",
     },
 ]
+ styles = {k["name"]: k["prompt"] for k in style_list}
 DEFAULT_STYLE_NAME = "3840 x 2160"
 STYLE_NAMES = list(styles.keys())
 
 def apply_style(style_name: str, positive: str) -> str:
     return styles.get(style_name, styles[DEFAULT_STYLE_NAME]).replace("{prompt}", positive)
 
+ MAX_SEED = np.iinfo(np.int32).max
+ 
+ def save_image(img: Image.Image) -> str:
+     """Save a PIL image with a unique filename and return the path."""
+     unique_name = str(uuid.uuid4()) + ".png"
+     img.save(unique_name)
+     return unique_name
+ 
+ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+     return seed
+ 
+ def progress_bar_html(label: str) -> str:
+     """
+     Returns an HTML snippet for a thin progress bar with a label.
+     The progress bar is styled as a dark red animated bar.
+     """
+     return f'''
+ <div style="display: flex; align-items: center;">
+     <span style="margin-right: 10px; font-size: 14px;">{label}</span>
+     <div style="width: 110px; height: 5px; background-color: #f0f0f0; border-radius: 2px; overflow: hidden;">
+         <div style="width: 100%; height: 100%; background-color: #ff5900; animation: loading 1.5s linear infinite;"></div>
+     </div>
+ </div>
+ <style>
+ @keyframes loading {{
+     0% {{ transform: translateX(-100%); }}
+     100% {{ transform: translateX(100%); }}
+ }}
+ </style>
+     '''
+ 
+ @spaces.GPU(duration=60, enable_queue=True)
+ def generate_image_fn(
     prompt: str,
     seed: int = 0,
     width: int = 1024,
 
     guidance_scale: float = 3,
     randomize_seed: bool = False,
     style_name: str = DEFAULT_STYLE_NAME,
+     progress=gr.Progress(track_tqdm=True),
 ):
+     """Generate images using the Flux.1 pipeline."""
     seed = int(randomize_seed_fn(seed, randomize_seed))
     positive_prompt = apply_style(style_name, prompt)
     if trigger_word:
 
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 
+ # --------------------------
+ # Chat and Multimodal Generation
+ # --------------------------
+ 
 @spaces.GPU
+ def generate(
+     input_dict: dict,
+     chat_history: list[dict],
+     max_new_tokens: int = 1024,
+     temperature: float = 0.6,
+     top_p: float = 0.9,
+     top_k: int = 50,
+     repetition_penalty: float = 1.2,
+ ):
     """
+     Generates chatbot responses with support for multimodal input, TTS, and image generation using Flux.1.
+     Special commands:
+     - "@tts1" or "@tts2": triggers text-to-speech.
+     - "@image": triggers image generation using the Flux.1 pipeline.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+ 
     if text.strip().lower().startswith("@image"):
+         # Remove the "@image" tag and use the rest as prompt
+         prompt_img = text[len("@image"):].strip()
+         # Show animated progress bar for image generation
+         yield progress_bar_html("Generating Image")
+         image_paths, used_seed = generate_image_fn(
+             prompt=prompt_img,
             seed=1,
             width=1024,
             height=1024,
 
             randomize_seed=True,
             style_name=DEFAULT_STYLE_NAME,
         )
+         # Once done, yield the generated image
         yield gr.Image(image_paths[0])
+         return  # Exit early
+ 
+     tts_prefix = "@tts"
+     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
+     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
 
+     if is_tts and voice_index:
+         voice = TTS_VOICES[voice_index - 1]
+         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+         # Clear previous chat history for a fresh TTS request.
+         conversation = [{"role": "user", "content": text}]
+     else:
+         voice = None
+         # Remove any stray @tts tags and build the conversation history.
+         text = text.replace(tts_prefix, "").strip()
+         conversation = clean_chat_history(chat_history)
+         conversation.append({"role": "user", "content": text})
+ 
+     if files:
+         if len(files) > 1:
+             images = [load_image(image) for image in files]
+         elif len(files) == 1:
+             images = [load_image(files[0])]
         else:
+             images = []
+         messages = [{
+             "role": "user",
+             "content": [
+                 *[{"type": "image", "image": image} for image in images],
+                 {"type": "text", "text": text},
+             ]
+         }]
+         prompt_multimodal = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(text=[prompt_multimodal], images=images, return_tensors="pt", padding=True).to("cuda")
+         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+         thread.start()
+ 
+         buffer = ""
+         # Show animated progress bar for multimodal generation
+         yield progress_bar_html("Thinking...")
+         for new_text in streamer:
+             buffer += new_text
+             buffer = buffer.replace("<|im_end|>", "")
+             time.sleep(0.01)
+             yield buffer
     else:
+         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+             gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+         input_ids = input_ids.to(model.device)
+         streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "streamer": streamer,
+             "max_new_tokens": max_new_tokens,
+             "do_sample": True,
+             "top_p": top_p,
+             "top_k": top_k,
+             "temperature": temperature,
+             "num_beams": 1,
+             "repetition_penalty": repetition_penalty,
+         }
+         t = Thread(target=model.generate, kwargs=generation_kwargs)
+         t.start()
+ 
+         outputs = []
+         # Show animated progress bar for text generation
+         yield progress_bar_html("Thinking...")
+         for new_text in streamer:
+             outputs.append(new_text)
+             yield "".join(outputs)
+ 
+         final_response = "".join(outputs)
+         yield final_response
+ 
+         # If TTS was requested, convert the final response to speech.
+         if is_tts and voice:
+             output_file = asyncio.run(text_to_speech(final_response, voice))
+             yield gr.Audio(output_file, autoplay=True)
+ 
+ # --------------------------
+ # Gradio Chat Interface
+ # --------------------------
 
 demo = gr.ChatInterface(
+     fn=generate,
+     additional_inputs=[
+         gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
+         gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
+         gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+         gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
+         gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
+     ],
+     examples=[
+         ["@image A futuristic cityscape at sunset with vibrant colors"],
+         ["Python Program for Array Rotation"],
+         ["@tts1 Who is Nikola Tesla, and why did he die?"],
+         [{"text": "Extract JSON from the image", "files": ["examples/document.jpg"]}],
+         [{"text": "summarize the letter", "files": ["examples/1.png"]}],
+         ["@tts2 What causes rainbows to form?"],
+     ],
+     cache_examples=False,
+     type="messages",
+     description=DESCRIPTION,
+     css=css,
+     fill_height=True,
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple", placeholder="@tts1, @tts2-voices, @image-image gen, default [text, vision]"),
     stop_btn="Stop Generation",
     multimodal=True,
 )
 
 if __name__ == "__main__":
+     demo.queue(max_size=20).launch(share=True)
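
For reference, the prefix routing that the updated generate() applies to incoming queries can be exercised outside Gradio. The sketch below is illustrative only: route_query and speak are hypothetical helper names that are not part of this commit, the Flux.1 and Qwen2-VL branches are reduced to labels, and only the edge_tts calls mirror the ones used in app.py.

# Minimal sketch (not part of the commit): mirrors how generate() in app.py
# dispatches "@image", "@tts1"/"@tts2", and plain text/vision queries.
import asyncio

import edge_tts

TTS_VOICES = ["en-US-JennyNeural", "en-US-GuyNeural"]  # @tts1, @tts2

async def speak(text: str, voice: str, output_file: str = "output.mp3") -> str:
    # Same Edge TTS calls as text_to_speech() in app.py.
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_file)
    return output_file

def route_query(text: str) -> tuple[str, str]:
    """Return (branch, payload) the way generate() picks a branch."""
    stripped = text.strip()
    lowered = stripped.lower()
    if lowered.startswith("@image"):
        return "flux_image", stripped[len("@image"):].strip()
    for i, voice in enumerate(TTS_VOICES, start=1):
        if lowered.startswith(f"@tts{i}"):
            return f"tts:{voice}", stripped[len(f"@tts{i}"):].strip()
    return "text", stripped

if __name__ == "__main__":
    print(route_query("@image A futuristic cityscape at sunset"))  # Flux.1 branch
    print(route_query("Python Program for Array Rotation"))        # text branch
    branch, payload = route_query("@tts2 What causes rainbows to form?")
    if branch.startswith("tts:"):
        asyncio.run(speak(payload, branch.split(":", 1)[1]))

Running the sketch writes output.mp3 for the @tts2 example, matching how the @tts branch of the Space converts its final text response to speech.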