Spaces:

prithivMLmods
/

DocScope-R1

Running on Zero

App Files Files Community

prithivMLmods committed on Apr 21

Commit

f5e2b63

verified ·

1 Parent(s): 8419dc4

Update app.py

Browse files

Files changed (1) hide show

app.py +428 -736

app.py CHANGED Viewed

@@ -1,756 +1,448 @@
-import os
-import random
-import uuid
-import json
-import time
-import asyncio
-import tempfile
-from threading import Thread
-import base64
-import shutil
-import re
 import gradio as gr
 import spaces
 import torch
 import numpy as np
-from PIL import Image
-import edge_tts
-import trimesh
-import soundfile as sf  # New import for audio file reading
-import supervision as sv
-from ultralytics import YOLO as YOLODetector
-from huggingface_hub import hf_hub_download
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
-    Qwen2VLForConditionalGeneration,
-    AutoProcessor,
 )
-from transformers.image_utils import load_image
-from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
-from diffusers.utils import export_to_ply
-os.system('pip install backoff')
-# Global constants and helper functions
-MAX_SEED = np.iinfo(np.int32).max
-def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    return seed
-def glb_to_data_url(glb_path: str) -> str:
-    """
-    Reads a GLB file from disk and returns a data URL with a base64 encoded representation.
-    (Appears to be unused: nothing in the visible code calls this helper.)
-    """
-    with open(glb_path, "rb") as f:
-        data = f.read()
-    b64_data = base64.b64encode(data).decode("utf-8")
-    return f"data:model/gltf-binary;base64,{b64_data}"
-def progress_bar_html(label: str) -> str:
-    """
-    Returns an HTML snippet for a thin progress bar with a label.
-    The bar is an animated cyan (#00FFFF) strip sliding across a pale-turquoise (#AFEEEE) track.
-    """
-    return f'''
-<div style="display: flex; align-items: center;">
-    <span style="margin-right: 10px; font-size: 14px;">{label}</span>
-    <div style="width: 110px; height: 5px; background-color: #AFEEEE; border-radius: 2px; overflow: hidden;">
-        <div style="width: 100%; height: 100%; background-color: #00FFFF; animation: loading 1.5s linear infinite;"></div>
-    </div>
-</div>
-<style>
-@keyframes loading {{
-    0% {{ transform: translateX(-100%); }}
-    100% {{ transform: translateX(100%); }}
-}}
-</style>
-    '''
-# Model class for Text-to-3D Generation (ShapE)
-class Model:
-    def __init__(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
-        self.pipe.to(self.device)
-        # Ensure the text encoder is in half precision to avoid dtype mismatches.
-        if torch.cuda.is_available():
-            try:
-                self.pipe.text_encoder = self.pipe.text_encoder.half()
-            except AttributeError:
-                pass
-        self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
-        self.pipe_img.to(self.device)
-        # Use getattr with a default value to avoid AttributeError if text_encoder is missing.
-        if torch.cuda.is_available():
-            text_encoder_img = getattr(self.pipe_img, "text_encoder", None)
-            if text_encoder_img is not None:
-                self.pipe_img.text_encoder = text_encoder_img.half()
-    def to_glb(self, ply_path: str) -> str:
-        mesh = trimesh.load(ply_path)
-        # Rotate the mesh for proper orientation
-        rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
-        mesh.apply_transform(rot)
-        rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
-        mesh.apply_transform(rot)
-        mesh_path = tempfile.NamedTemporaryFile(suffix=".glb", delete=False)
-        mesh.export(mesh_path.name, file_type="glb")
-        return mesh_path.name
-    def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        images = self.pipe(
-            prompt,
-            generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=num_steps,
-            output_type="mesh",
-        ).images
-        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
-        export_to_ply(images[0], ply_path.name)
-        return self.to_glb(ply_path.name)
-    def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        images = self.pipe_img(
-            image,
-            generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=num_steps,
-            output_type="mesh",
-        ).images
-        ply_path = tempfile.NamedTemporaryFile(suffix=".ply", delete=False, mode="w+b")
-        export_to_ply(images[0], ply_path.name)
-        return self.to_glb(ply_path.name)
-# New Tools for Web Functionality using DuckDuckGo and smolagents
-from typing import Any, Optional
-from smolagents.tools import Tool
-import duckduckgo_search
-class DuckDuckGoSearchTool(Tool):
-    name = "web_search"
-    description = "Performs a duckduckgo web search based on your query (think a Google search) then returns the top search results."
-    inputs = {'query': {'type': 'string', 'description': 'The search query to perform.'}}
-    output_type = "string"
-    def __init__(self, max_results=10, **kwargs):
-        super().__init__()
-        self.max_results = max_results
-        try:
-            from duckduckgo_search import DDGS
-        except ImportError as e:
-            raise ImportError(
-                "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
-            ) from e
-        self.ddgs = DDGS(**kwargs)
-    def forward(self, query: str) -> str:
-        results = self.ddgs.text(query, max_results=self.max_results)
-        if len(results) == 0:
-            raise Exception("No results found! Try a less restrictive/shorter query.")
-        postprocessed_results = [
-            f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
-        ]
-        return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
-class VisitWebpageTool(Tool):
-    name = "visit_webpage"
-    description = "Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages."
-    inputs = {'url': {'type': 'string', 'description': 'The url of the webpage to visit.'}}
-    output_type = "string"
-    def __init__(self, *args, **kwargs):
-        self.is_initialized = False
-    def forward(self, url: str) -> str:
-        try:
-            import requests
-            from markdownify import markdownify
-            from requests.exceptions import RequestException
-            from smolagents.utils import truncate_content
-        except ImportError as e:
-            raise ImportError(
-                "You must install packages `markdownify` and `requests` to run this tool: for instance run `pip install markdownify requests`."
-            ) from e
-        try:
-            # Send a GET request to the URL with a 20-second timeout
-            response = requests.get(url, timeout=20)
-            response.raise_for_status()  # Raise an exception for bad status codes
-            # Convert the HTML content to Markdown
-            markdown_content = markdownify(response.text).strip()
-            # Remove multiple line breaks
-            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
-            return truncate_content(markdown_content, 10000)
-        except requests.exceptions.Timeout:
-            return "The request timed out. Please try again later or check the URL."
-        except RequestException as e:
-            return f"Error fetching the webpage: {str(e)}"
-        except Exception as e:
-            return f"An unexpected error occurred: {str(e)}"
-# rAgent reasoning: a Llama model queried through the OpenAI-compatible client
-from openai import OpenAI
-ACCESS_TOKEN = os.getenv("HF_TOKEN")
-ragent_client = OpenAI(
-    base_url="https://api-inference.huggingface.co/v1/",
-    api_key=ACCESS_TOKEN,
 )
-SYSTEM_PROMPT = """
-        "You are an expert assistant who solves tasks using Python code. Follow these steps:\n"
-        "1. **Thought**: Explain your reasoning and plan for solving the task.\n"
-        "2. **Code**: Write Python code to implement your solution.\n"
-        "3. **Observation**: Analyze the output of the code and summarize the results.\n"
-        "4. **Final Answer**: Provide a concise conclusion or final result.\n\n"
-        f"Task: {{task}}"
 """
-def ragent_reasoning(prompt: str, history: list[dict], max_tokens: int = 2048, temperature: float = 0.7, top_p: float = 0.95):
-    """
-    Streams a structured reasoning chain from a Llama model via the OpenAI-compatible client.
-    """
-    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    # Incorporate conversation history (if any)
-    for msg in history:
-        if msg.get("role") == "user":
-            messages.append({"role": "user", "content": msg["content"]})
-        elif msg.get("role") == "assistant":
-            messages.append({"role": "assistant", "content": msg["content"]})
-    messages.append({"role": "user", "content": prompt})
-    response = ""
-    stream = ragent_client.chat.completions.create(
-         model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-         messages=messages,
-    )
-    for message in stream:
-         token = message.choices[0].delta.content
-         response += token
-         yield response
-# ------------------------------------------------------------------------------
-# New Phi-4 Multimodal Feature (Image & Audio)
-# ------------------------------------------------------------------------------
-# Define prompt structure for Phi-4
-phi4_user_prompt = '<|user|>'
-phi4_assistant_prompt = '<|assistant|>'
-phi4_prompt_suffix = '<|end|>'
-# Load Phi-4 multimodal model and processor using unique variable names
-phi4_model_path = "microsoft/Phi-4-multimodal-instruct"
-phi4_processor = AutoProcessor.from_pretrained(phi4_model_path, trust_remote_code=True)
-phi4_model = AutoModelForCausalLM.from_pretrained(
-    phi4_model_path,
-    device_map="auto",
-    torch_dtype="auto",
-    trust_remote_code=True,
-    _attn_implementation="eager",
-)
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load Models and Pipelines for Chat, Image, and Multimodal Processing
-# Load the text-only model and tokenizer (for pure text chat)
-model_id = "prithivMLmods/Ganymede-Llama-3.3-3B-Preview" #prithivMLmods/FastThink-0.5B-Tiny
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.bfloat16,
-)
-model.eval()
-# Voices for text-to-speech
-TTS_VOICES = [
-    "en-US-JennyNeural",  # @tts1
-    "en-US-GuyNeural",    # @tts2
-]
-# Load multimodal processor and model (e.g. for OCR and image processing)
-MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model_m = Qwen2VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to("cuda").eval()
-# Asynchronous text-to-speech
-async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
-    """Convert text to speech using Edge TTS and save as MP3"""
-    communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(output_file)
-    return output_file
-# Utility function to clean conversation history
-def clean_chat_history(chat_history):
-    """
-    Filter out any chat entries whose "content" is not a string.
-    This helps prevent errors when concatenating previous messages.
-    """
-    cleaned = []
-    for msg in chat_history:
-        if isinstance(msg, dict) and isinstance(msg.get("content"), str):
-            cleaned.append(msg)
-    return cleaned
-# Stable Diffusion XL Pipeline for Image Generation
-# Model In Use : SG161222/RealVisXL_V5.0_Lightning
-MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
-MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
-USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
-ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
-BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For batched image generation
-sd_pipe = StableDiffusionXLPipeline.from_pretrained(
-    MODEL_ID_SD,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    use_safetensors=True,
-    add_watermarker=False,
-).to(device)
-sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
-if torch.cuda.is_available():
-    sd_pipe.text_encoder = sd_pipe.text_encoder.half()
-if USE_TORCH_COMPILE:
-    sd_pipe.compile()
-if ENABLE_CPU_OFFLOAD:
-    sd_pipe.enable_model_cpu_offload()
-def save_image(img: Image.Image) -> str:
-    """Save a PIL image with a unique filename and return the path."""
-    unique_name = str(uuid.uuid4()) + ".png"
-    img.save(unique_name)
-    return unique_name
-@spaces.GPU(duration=60, enable_queue=True)
-# SG161222/RealVisXL_V5.0_Lightning
-def generate_image_fn(
-    prompt: str,
-    negative_prompt: str = "",
-    use_negative_prompt: bool = False,
-    seed: int = 1,
-    width: int = 1024,
-    height: int = 1024,
-    guidance_scale: float = 3,
-    num_inference_steps: int = 25,
-    randomize_seed: bool = False,
-    use_resolution_binning: bool = True,
-    num_images: int = 1,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """Generate images using the SDXL pipeline."""
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    generator = torch.Generator(device=device).manual_seed(seed)
-    options = {
-        "prompt": [prompt] * num_images,
-        "negative_prompt": [negative_prompt] * num_images if use_negative_prompt else None,
-        "width": width,
-        "height": height,
-        "guidance_scale": guidance_scale,
-        "num_inference_steps": num_inference_steps,
-        "generator": generator,
-        "output_type": "pil",
-    }
-    if use_resolution_binning:
-        options["use_resolution_binning"] = True
-    images = []
-    # Process in batches
-    for i in range(0, num_images, BATCH_SIZE):
-        batch_options = options.copy()
-        batch_options["prompt"] = options["prompt"][i:i+BATCH_SIZE]
-        if "negative_prompt" in batch_options and batch_options["negative_prompt"] is not None:
-            batch_options["negative_prompt"] = options["negative_prompt"][i:i+BATCH_SIZE]
-        if device.type == "cuda":
-            with torch.autocast("cuda", dtype=torch.float16):
-                outputs = sd_pipe(**batch_options)
-        else:
-            outputs = sd_pipe(**batch_options)
-        images.extend(outputs.images)
-    image_paths = [save_image(img) for img in images]
-    return image_paths, seed
-# Text-to-3D Generation using the ShapE Pipeline
-@spaces.GPU(duration=120, enable_queue=True)
-def generate_3d_fn(
-    prompt: str,
-    seed: int = 1,
-    guidance_scale: float = 15.0,
-    num_steps: int = 64,
-    randomize_seed: bool = False,
-):
-    """
-    Generate a 3D model from text using the ShapE pipeline.
-    Returns a tuple of (glb_file_path, used_seed).
-    """
-    seed = int(randomize_seed_fn(seed, randomize_seed))
-    model3d = Model()
-    glb_path = model3d.run_text(prompt, seed=seed, guidance_scale=guidance_scale, num_steps=num_steps)
-    return glb_path, seed
-# YOLO Object Detection Setup
-YOLO_MODEL_REPO = "strangerzonehf/Flux-Ultimate-LoRA-Collection"
-YOLO_CHECKPOINT_NAME = "images/demo.pt"
-yolo_model_path = hf_hub_download(repo_id=YOLO_MODEL_REPO, filename=YOLO_CHECKPOINT_NAME)
-yolo_detector = YOLODetector(yolo_model_path)
-def detect_objects(image: np.ndarray):
-    """Runs object detection on the input image."""
-    results = yolo_detector(image, verbose=False)[0]
-    detections = sv.Detections.from_ultralytics(results).with_nms()
-    box_annotator = sv.BoxAnnotator()
-    label_annotator = sv.LabelAnnotator()
-    annotated_image = image.copy()
-    annotated_image = box_annotator.annotate(scene=annotated_image, detections=detections)
-    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
-    return Image.fromarray(annotated_image)
-# Chat Generation Function with support for @tts, @image, @3d, @web, @rAgent, @yolo, and now @phi4 commands
-@spaces.GPU
-def generate(
-    input_dict: dict,
-    chat_history: list[dict],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-):
-    """
-    Generates chatbot responses with support for multimodal input and special commands:
-      - "@tts1" or "@tts2": triggers text-to-speech.
-      - "@image": triggers image generation using the SDXL pipeline.
-      - "@3d": triggers 3D model generation using the ShapE pipeline.
-      - "@web": triggers a web search or webpage visit.
-      - "@rAgent": initiates a reasoning chain using a Llama model.
-      - "@yolo": triggers object detection using YOLO.
-      - **"@phi4": triggers multimodal (image/audio) processing using the Phi-4 model.**
-    """
-    text = input_dict["text"]
-    files = input_dict.get("files", [])
-    # --- 3D Generation branch ---
-    if text.strip().lower().startswith("@3d"):
-        prompt = text[len("@3d"):].strip()
-        yield progress_bar_html("Processing 3D Mesh Generation")
-        glb_path, used_seed = generate_3d_fn(
-            prompt=prompt,
-            seed=1,
-            guidance_scale=15.0,
-            num_steps=64,
-            randomize_seed=True,
-        )
-        # Copy the GLB file to a static folder.
-        yield progress_bar_html("Finalizing 3D Mesh Generation")
-        static_folder = os.path.join(os.getcwd(), "static")
-        if not os.path.exists(static_folder):
-            os.makedirs(static_folder)
-        new_filename = f"mesh_{uuid.uuid4()}.glb"
-        new_filepath = os.path.join(static_folder, new_filename)
-        shutil.copy(glb_path, new_filepath)
-        yield gr.File(new_filepath)
-        return
-    # --- Image Generation branch ---
-    if text.strip().lower().startswith("@image"):
-        prompt = text[len("@image"):].strip()
-        yield progress_bar_html("Generating Image")
-        image_paths, used_seed = generate_image_fn(
-            prompt=prompt,
-            negative_prompt="",
-            use_negative_prompt=False,
-            seed=1,
-            width=1024,
-            height=1024,
-            guidance_scale=3,
-            num_inference_steps=25,
-            randomize_seed=True,
-            use_resolution_binning=True,
-            num_images=1,
-        )
-        yield gr.Image(image_paths[0])
-        return
-    # --- Web Search/Visit branch ---
-    if text.strip().lower().startswith("@web"):
-        web_command = text[len("@web"):].strip()
-        # If the command starts with "visit", then treat the rest as a URL
-        if web_command.lower().startswith("visit"):
-            url = web_command[len("visit"):].strip()
-            yield progress_bar_html("Visiting Webpage")
-            visitor = VisitWebpageTool()
-            content = visitor.forward(url)
-            yield content
-        else:
-            # Otherwise, treat the rest as a search query.
-            query = web_command
-            yield progress_bar_html("Performing Web Search")
-            searcher = DuckDuckGoSearchTool()
-            results = searcher.forward(query)
-            yield results
-        return
-    # --- rAgent Reasoning branch ---
-    if text.strip().lower().startswith("@ragent"):
-        prompt = text[len("@ragent"):].strip()
-        yield progress_bar_html("Processing Reasoning Chain")
-        # Pass the current chat history (cleaned) to help inform the chain.
-        for partial in ragent_reasoning(prompt, clean_chat_history(chat_history)):
-            yield partial
-        return
-    # --- YOLO Object Detection branch ---
-    if text.strip().lower().startswith("@yolo"):
-        yield progress_bar_html("Performing Object Detection")
-        if not files or len(files) == 0:
-            yield "Error: Please attach an image for YOLO object detection."
-            return
-        # Use the first attached image
-        input_file = files[0]
-        try:
-            if isinstance(input_file, str):
-                pil_image = Image.open(input_file)
-            else:
-                pil_image = input_file
-        except Exception as e:
-            yield f"Error loading image: {str(e)}"
-            return
-        np_image = np.array(pil_image)
-        result_img = detect_objects(np_image)
-        yield gr.Image(result_img)
-        return
-    # --- Phi-4 Multimodal branch (Image/Audio) with Streaming ---
-    if text.strip().lower().startswith("@phi4"):
-        question = text[len("@phi4"):].strip()
-        if not files:
-            yield "Error: Please attach an image or audio file for @phi4 multimodal processing."
-            return
-        if not question:
-            yield "Error: Please provide a question after @phi4."
-            return
-        # Determine input type (Image or Audio) from the first file
-        input_file = files[0]
-        try:
-            # If file is already a PIL Image, treat as image
-            if isinstance(input_file, Image.Image):
-                input_type = "Image"
-                file_for_phi4 = input_file
-            else:
-                # Try opening as image; if it fails, assume audio
-                try:
-                    file_for_phi4 = Image.open(input_file)
-                    input_type = "Image"
-                except Exception:
-                    input_type = "Audio"
-                    file_for_phi4 = input_file
-        except Exception:
-            input_type = "Audio"
-            file_for_phi4 = input_file
-        if input_type == "Image":
-            phi4_prompt = f'{phi4_user_prompt}<|image_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
-            inputs = phi4_processor(text=phi4_prompt, images=file_for_phi4, return_tensors='pt').to(phi4_model.device)
-        elif input_type == "Audio":
-            phi4_prompt = f'{phi4_user_prompt}<|audio_1|>{question}{phi4_prompt_suffix}{phi4_assistant_prompt}'
-            audio, samplerate = sf.read(file_for_phi4)
-            inputs = phi4_processor(text=phi4_prompt, audios=[(audio, samplerate)], return_tensors='pt').to(phi4_model.device)
-        else:
-            yield "Invalid file type for @phi4 multimodal processing."
-            return
-        # Initialize the streamer
-        streamer = TextIteratorStreamer(phi4_processor, skip_prompt=True, skip_special_tokens=True)
-        # Prepare generation kwargs
-        generation_kwargs = {
-            **inputs,
-            "streamer": streamer,
-            "max_new_tokens": 200,
-            "num_logits_to_keep": 0,
-        }
-        # Start generation in a separate thread
-        thread = Thread(target=phi4_model.generate, kwargs=generation_kwargs)
-        thread.start()
-        # Stream the response
-        buffer = ""
-        yield progress_bar_html("Processing Phi-4 Multimodal")
-        for new_text in streamer:
-            buffer += new_text
-            time.sleep(0.01)  # Small delay to simulate real-time streaming
-            yield buffer
-        return
-    # --- Text and TTS branch ---
-    tts_prefix = "@tts"
-    is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
-    voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-    if is_tts and voice_index:
-        voice = TTS_VOICES[voice_index - 1]
-        text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        conversation = [{"role": "user", "content": text}]
-    else:
-        voice = None
-        text = text.replace(tts_prefix, "").strip()
-        conversation = clean_chat_history(chat_history)
-        conversation.append({"role": "user", "content": text})
-    if files:
-        if len(files) > 1:
-            images = [load_image(image) for image in files]
-        elif len(files) == 1:
-            images = [load_image(files[0])]
-        else:
-            images = []
-        messages = [{
-            "role": "user",
-            "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
-            ]
-        }]
-        prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
-        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
-        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
-        thread.start()
-        buffer = ""
-        yield progress_bar_html("Processing with Qwen2VL OCR")
-        for new_text in streamer:
-            buffer += new_text
-            buffer = buffer.replace("<|im_end|>", "")
-            time.sleep(0.01)
-            yield buffer
-    else:
-        input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": input_ids,
-            "streamer": streamer,
-            "max_new_tokens": max_new_tokens,
-            "do_sample": True,
-            "top_p": top_p,
-            "top_k": top_k,
-            "temperature": temperature,
-            "num_beams": 1,
-            "repetition_penalty": repetition_penalty,
-        }
-        t = Thread(target=model.generate, kwargs=generation_kwargs)
-        t.start()
-        outputs = []
-        yield progress_bar_html("Processing Chat Response")
-        for new_text in streamer:
-            outputs.append(new_text)
-            yield "".join(outputs)
-        final_response = "".join(outputs)
-        yield final_response
-        if is_tts and voice:
-            output_file = asyncio.run(text_to_speech(final_response, voice))
-            yield gr.Audio(output_file, autoplay=True)
-# Gradio Chat Interface Setup and Launch
-demo = gr.ChatInterface(
-    fn=generate,
-    additional_inputs=[
-        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS),
-        gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6),
-        gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
-        gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50),
-        gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2),
-    ],
-    examples=[
-        [{"text": "@phi4 Transcribe the audio to text.", "files": ["examples/harvard.wav"]}],
-        [{"text": "@phi4 Summarize the content", "files": ["examples/write.jpg"]}],
-        [{"text": "Explain the Image", "files": ["examples/3.jpg"]}],
-        [{"text": "Extract as JSON table from the table", "files": ["examples/4.jpg"]}],
-        ["@image Chocolate dripping from a donut"],
-        ["@3d A birthday cupcake with cherry"],
-        ["@image A drawing of an man made out of hamburger, blue sky background, soft pastel colors"],
-        ["@tts2 What causes rainbows to form?"],
-        [{"text": "Summarize the letter", "files": ["examples/1.png"]}],
-        [{"text": "@yolo", "files": ["examples/yolo.jpeg"]}],
-        ["@rAgent Explain how a binary search algorithm works."],
-        ["@web Is Grok-3 Beats DeepSeek-R1 at Reasoning ?"],
-        ["@tts1 Explain Tower of Hanoi"],
-        ["Python Program for Array Rotation"],
-    ],
-    cache_examples=False,
-    type="messages",
-    description="# **Agent Dino `@phi4 'prompt..', @image, etc..`**",
-    fill_height=True,
-    textbox=gr.MultimodalTextbox(
-        label="Query Input",
-        file_types=["image", "audio"],
-        file_count="multiple",
-        placeholder="‎ @tts1, @tts2, @image, @3d, @phi4 [image, audio], @rAgent, @web, @yolo, default [plain text]"
-    ),
-    stop_btn="Stop Generation",
-    multimodal=True,
-)
-# Ensure the static folder exists
-if not os.path.exists("static"):
-    os.makedirs("static")
-from fastapi.staticfiles import StaticFiles
-demo.app.mount("/static", StaticFiles(directory="static"), name="static")
-if __name__ == "__main__":
-    demo.queue(max_size=20).launch(share=True)

 import gradio as gr
 import spaces
 import torch
+from diffusers import AutoencoderKL, TCDScheduler
+from diffusers.models.model_loading_utils import load_state_dict
+from gradio_imageslider import ImageSlider
+from huggingface_hub import hf_hub_download
+from controlnet_union import ControlNetModel_Union
+from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+from PIL import Image, ImageDraw
 import numpy as np
+config_file = hf_hub_download(
+    "xinsir/controlnet-union-sdxl-1.0",
+    filename="config_promax.json",
+)
+config = ControlNetModel_Union.load_config(config_file)
+controlnet_model = ControlNetModel_Union.from_config(config)
+model_file = hf_hub_download(
+    "xinsir/controlnet-union-sdxl-1.0",
+    filename="diffusion_pytorch_model_promax.safetensors",
 )
+state_dict = load_state_dict(model_file)
+model, _, _, _, _ = ControlNetModel_Union._load_pretrained_model(
+    controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0"
 )
+model.to(device="cuda", dtype=torch.float16)
+vae = AutoencoderKL.from_pretrained(
+    "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+).to("cuda")
+pipe = StableDiffusionXLFillPipeline.from_pretrained(
+    "SG161222/RealVisXL_V5.0_Lightning",
+    torch_dtype=torch.float16,
+    vae=vae,
+    controlnet=model,
+    variant="fp16",
+).to("cuda")
+pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+def can_expand(source_width, source_height, target_width, target_height, alignment):
+    """Checks if the image can be expanded based on the alignment."""
+    if alignment in ("Left", "Right") and source_width >= target_width:
+        return False
+    if alignment in ("Top", "Bottom") and source_height >= target_height:
+        return False
+    return True
def prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
    """Place the input image on a target-sized canvas and build the outpaint mask.

    Args:
        image: PIL image to outpaint.
        width: Target canvas width in pixels.
        height: Target canvas height in pixels.
        overlap_percentage: Share (in percent) of the placed image's width and
            height that the mask is allowed to reach into on overlapping sides.
        resize_option: One of "Full", "50%", "33%", "25%"; any other value
            selects ``custom_resize_percentage``.
        custom_resize_percentage: Percentage used when ``resize_option`` is not
            one of the fixed presets.
        alignment: "Middle", "Left", "Right", "Top" or "Bottom".
        overlap_left, overlap_right, overlap_top, overlap_bottom: Whether the
            mask should overlap the placed image on that side.

    Returns:
        ``(background, mask)``: an RGB canvas of the target size with the
        resized image pasted on white, and an "L" mask of the same size that is
        255 everywhere except for a 0-filled rectangle over the kept part of
        the image.

    Raises:
        gr.Error: If no image was provided.
        ValueError: If ``alignment`` is not one of the supported values.
    """
    if image is None:
        # Surface a readable message in the UI instead of an AttributeError.
        raise gr.Error("Please provide an input image.")

    alignments = ("Middle", "Left", "Right", "Top", "Bottom")
    if alignment not in alignments:
        # Previously an unknown value fell through every branch and crashed
        # later with an UnboundLocalError on the margins.
        raise ValueError(f"Unknown alignment {alignment!r}; expected one of {alignments}.")

    # Coerce to int so PIL always receives integer pixel sizes, even if the
    # caller hands over floats.
    target_width, target_height = int(width), int(height)
    target_size = (target_width, target_height)

    # First pass: scale the image so that it fits inside the target canvas.
    # Clamp to 1 px so an extreme aspect ratio cannot truncate a side to 0.
    scale_factor = min(target_width / image.width, target_height / image.height)
    fitted_size = (
        max(int(image.width * scale_factor), 1),
        max(int(image.height * scale_factor), 1),
    )
    source = image.resize(fitted_size, Image.LANCZOS)

    # Second pass: apply the user-selected shrink percentage.
    preset_percentages = {"Full": 100, "50%": 50, "33%": 33, "25%": 25}
    resize_percentage = preset_percentages.get(resize_option, custom_resize_percentage)
    resize_factor = resize_percentage / 100
    # Never let a side drop below 64 pixels.
    new_width = max(int(source.width * resize_factor), 64)
    new_height = max(int(source.height * resize_factor), 64)
    source = source.resize((new_width, new_height), Image.LANCZOS)

    # Overlap in pixels, at least 1 px on each axis.
    overlap_x = max(int(new_width * (overlap_percentage / 100)), 1)
    overlap_y = max(int(new_height * (overlap_percentage / 100)), 1)

    # Free space around the image; the alignment decides how it is distributed.
    free_x = target_width - new_width
    free_y = target_height - new_height
    margin_x = {"Left": 0, "Right": free_x}.get(alignment, free_x // 2)
    margin_y = {"Top": 0, "Bottom": free_y}.get(alignment, free_y // 2)
    # Keep the image fully inside the canvas.
    margin_x = max(0, min(margin_x, free_x))
    margin_y = max(0, min(margin_y, free_y))

    # White canvas with the resized image pasted at the computed offset.
    background = Image.new('RGB', target_size, (255, 255, 255))
    background.paste(source, (margin_x, margin_y))

    # Mask starts fully white (255); the kept region is painted black (0) below.
    mask = Image.new('L', target_size, 255)
    mask_draw = ImageDraw.Draw(mask)

    white_gaps_patch = 2

    def edge_inset(overlap_enabled, overlap_pixels, touches_canvas_edge):
        """Distance the kept rectangle is pulled inwards from one image edge."""
        if overlap_enabled:
            return overlap_pixels
        # Without overlap, the side the image is aligned to gets no inset;
        # every other side is inset by a small patch to hide white gaps.
        return 0 if touches_canvas_edge else white_gaps_patch

    left_overlap = margin_x + edge_inset(overlap_left, overlap_x, alignment == "Left")
    right_overlap = margin_x + new_width - edge_inset(overlap_right, overlap_x, alignment == "Right")
    top_overlap = margin_y + edge_inset(overlap_top, overlap_y, alignment == "Top")
    bottom_overlap = margin_y + new_height - edge_inset(overlap_bottom, overlap_y, alignment == "Bottom")

    mask_draw.rectangle([
        (left_overlap, top_overlap),
        (right_overlap, bottom_overlap)
    ], fill=0)
    return background, mask
def preview_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
    """Render the prepared canvas with the masked area tinted red.

    Builds the canvas and mask via ``prepare_image_and_mask`` and returns an
    RGBA image in which every pixel where the mask is white (255) is covered by
    a translucent red layer; pixels where the mask is black (0) are untouched.
    """
    background, mask = prepare_image_and_mask(
        image, width, height, overlap_percentage, resize_option,
        custom_resize_percentage, alignment,
        overlap_left, overlap_right, overlap_top, overlap_bottom,
    )
    canvas_size = background.size

    # Red at alpha 64 (25% opacity) and a fully transparent layer of equal size.
    red_tint = Image.new('RGBA', canvas_size, (255, 0, 0, 64))
    transparent = Image.new('RGBA', canvas_size, (0, 0, 0, 0))

    # White mask pixels take the red tint, black mask pixels stay transparent.
    highlight = Image.composite(red_tint, transparent, mask)

    # Blend the highlight over an RGBA version of the canvas.
    return Image.alpha_composite(background.convert('RGBA'), highlight)
@spaces.GPU(duration=24)
def infer(image, width, height, overlap_percentage, num_inference_steps, resize_option, custom_resize_percentage, prompt_input, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom):
    """Outpaint ``image`` to the target size, streaming intermediate results.

    The arguments other than ``num_inference_steps`` and ``prompt_input`` are
    forwarded unchanged to ``prepare_image_and_mask``.

    Yields:
        ``(cnet_image, generated)`` for every image produced by the pipeline,
        where ``cnet_image`` is the canvas with the masked area blacked out,
        followed by a final ``(background, composite)`` pair in which the last
        generated image has been pasted into the masked area of the canvas.

    Raises:
        gr.Error: If no input image was provided or the pipeline produced no
            image at all.
    """
    if image is None:
        # Fail with a readable UI message rather than an AttributeError.
        raise gr.Error("Please upload an input image before generating.")

    background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, resize_option, custom_resize_percentage, alignment, overlap_left, overlap_right, overlap_top, overlap_bottom)

    # A former `can_expand(background.width, background.height, width, height,
    # alignment)` check lived here. `background` always has the target size and
    # `alignment` is not read after this point, so the check had no effect.

    # Black out the region to be generated (where the mask is white).
    cnet_image = background.copy()
    cnet_image.paste(0, (0, 0), mask)

    final_prompt = f"{prompt_input} , high quality, 4k"
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt(final_prompt, "cuda", True)

    # Use a dedicated name so the `image` parameter is not shadowed; with the
    # old loop an empty pipeline would have silently pasted the input image.
    generated = None
    for generated in pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        image=cnet_image,
        # Coerce to int in case the step count arrives as a float.
        num_inference_steps=int(num_inference_steps),
    ):
        yield cnet_image, generated

    if generated is None:
        raise gr.Error("The pipeline did not return any image.")

    # Paste the final generation into the masked area only, keeping the
    # original pixels elsewhere.
    generated = generated.convert("RGBA")
    cnet_image.paste(generated, (0, 0), mask)
    yield background, cnet_image
def clear_result():
    """Return an update that empties the result ImageSlider."""
    cleared = gr.update(value=None)
    return cleared
def preload_presets(target_ratio, ui_width, ui_height):
    """Translate the selected aspect ratio into width/height slider values.

    Args:
        target_ratio: "9:16", "16:9", "1:1" or "Custom".
        ui_width: Current value of the width slider.
        ui_height: Current value of the height slider.

    Returns:
        ``(width, height, panel_update)``. Presets return their fixed size and
        a no-op update; "Custom" keeps the current slider values and opens the
        settings panel. Any other value keeps the current slider values and
        leaves the panel alone, so the handler always returns three outputs
        instead of falling through to ``None``.
    """
    preset_sizes = {
        "9:16": (720, 1280),
        "16:9": (1280, 720),
        "1:1": (1024, 1024),
    }
    if target_ratio in preset_sizes:
        changed_width, changed_height = preset_sizes[target_ratio]
        return changed_width, changed_height, gr.update()
    if target_ratio == "Custom":
        return ui_width, ui_height, gr.update(open=True)
    # Unknown ratio: leave everything as it is.
    return ui_width, ui_height, gr.update()
def select_the_right_preset(user_width, user_height):
    """Return the ratio label matching the given size, or "Custom" if none does."""
    known_sizes = (
        (720, 1280, "9:16"),
        (1280, 720, "16:9"),
        (1024, 1024, "1:1"),
    )
    for preset_width, preset_height, label in known_sizes:
        if user_width == preset_width and user_height == preset_height:
            return label
    return "Custom"
def toggle_custom_resize_slider(resize_option):
    """Make the custom resize slider visible only for the "Custom" option."""
    show_slider = resize_option == "Custom"
    return gr.update(visible=show_slider)
def update_history(new_image, history):
    """Prepend ``new_image`` to the history list and return that list.

    A missing history (``None``) starts a fresh list; an existing list is
    modified in place and returned.
    """
    entries = history if history is not None else []
    # Slice assignment at position 0 prepends in place.
    entries[:0] = [new_image]
    return entries
css = """
.gradio-container {
    width: 1200px !important;
}
"""
title = """<h1 align="center">Diffusers Image Outpaint Lightning</h1>
"""
with gr.Blocks(css=css) as demo:
    with gr.Column():
        gr.HTML(title)
        with gr.Row():
            # Left column: input image, prompt and all settings.
            with gr.Column():
                input_image = gr.Image(
                    type="pil",
                    label="Input Image"
                )
                with gr.Row():
                    with gr.Column(scale=2):
                        prompt_input = gr.Textbox(label="Prompt (Optional)")
                    with gr.Column(scale=1):
                        run_button = gr.Button("Generate")
                with gr.Row():
                    target_ratio = gr.Radio(
                        label="Expected Ratio",
                        choices=["9:16", "16:9", "1:1", "Custom"],
                        value="9:16",
                        scale=2
                    )
                    alignment_dropdown = gr.Dropdown(
                        choices=["Middle", "Left", "Right", "Top", "Bottom"],
                        value="Middle",
                        label="Alignment"
                    )
                with gr.Accordion(label="Advanced settings", open=False) as settings_panel:
                    with gr.Column():
                        with gr.Row():
                            width_slider = gr.Slider(
                                label="Target Width",
                                minimum=720,
                                maximum=1536,
                                step=8,
                                value=720,  # Matches the default "9:16" preset
                            )
                            height_slider = gr.Slider(
                                label="Target Height",
                                minimum=720,
                                maximum=1536,
                                step=8,
                                value=1280,  # Matches the default "9:16" preset
                            )
                        num_inference_steps = gr.Slider(label="Steps", minimum=4, maximum=12, step=1, value=8)
                        with gr.Group():
                            overlap_percentage = gr.Slider(
                                label="Mask overlap (%)",
                                minimum=1,
                                maximum=50,
                                value=10,
                                step=1
                            )
                            with gr.Row():
                                overlap_top = gr.Checkbox(label="Overlap Top", value=True)
                                overlap_right = gr.Checkbox(label="Overlap Right", value=True)
                            with gr.Row():
                                overlap_left = gr.Checkbox(label="Overlap Left", value=True)
                                overlap_bottom = gr.Checkbox(label="Overlap Bottom", value=True)
                        with gr.Row():
                            resize_option = gr.Radio(
                                label="Resize input image",
                                choices=["Full", "50%", "33%", "25%", "Custom"],
                                value="Full"
                            )
                            custom_resize_percentage = gr.Slider(
                                label="Custom resize (%)",
                                minimum=1,
                                maximum=100,
                                step=1,
                                value=50,
                                visible=False
                            )
                        with gr.Column():
                            preview_button = gr.Button("Preview alignment and mask")
                gr.Examples(
                    examples=[
                        ["./examples/example_1.webp", 1280, 720, "Middle"],
                        ["./examples/example_2.jpg", 1440, 810, "Left"],
                        ["./examples/example_3.jpg", 1024, 1024, "Top"],
                        ["./examples/example_3.jpg", 1024, 1024, "Bottom"],
                    ],
                    inputs=[input_image, width_slider, height_slider, alignment_dropdown],
                )
            # Right column: result slider, history and mask preview.
            with gr.Column():
                result = ImageSlider(
                    interactive=False,
                    label="Generated Image",
                )
                use_as_input_button = gr.Button("Use as Input Image", visible=False)
                history_gallery = gr.Gallery(label="History", columns=6, object_fit="contain", interactive=False)
                preview_image = gr.Image(label="Preview")

    def use_output_as_input(output_image):
        """Sets the generated output as the new input image."""
        if not output_image:
            # Nothing generated yet: leave the current input untouched.
            return gr.update()
        return gr.update(value=output_image[1])

    def add_result_to_history(result_pair, history):
        """Push the generated image (second slider entry) onto the history."""
        return update_history(result_pair[1], history)

    def show_use_as_input_button():
        """Reveal the "Use as Input Image" button."""
        return gr.update(visible=True)

    use_as_input_button.click(
        fn=use_output_as_input,
        inputs=[result],
        outputs=[input_image]
    )
    target_ratio.change(
        fn=preload_presets,
        inputs=[target_ratio, width_slider, height_slider],
        outputs=[width_slider, height_slider, settings_panel],
        queue=False
    )
    # Moving either size slider re-derives the matching ratio label.
    for size_slider in (width_slider, height_slider):
        size_slider.change(
            fn=select_the_right_preset,
            inputs=[width_slider, height_slider],
            outputs=[target_ratio],
            queue=False
        )
    resize_option.change(
        fn=toggle_custom_resize_slider,
        inputs=[resize_option],
        outputs=[custom_resize_percentage],
        queue=False
    )

    generation_inputs = [
        input_image, width_slider, height_slider, overlap_percentage, num_inference_steps,
        resize_option, custom_resize_percentage, prompt_input, alignment_dropdown,
        overlap_left, overlap_right, overlap_top, overlap_bottom,
    ]
    # The Generate button and pressing Enter in the prompt box run the same chain.
    for start_generation in (run_button.click, prompt_input.submit):
        start_generation(  # Clear the result
            fn=clear_result,
            inputs=None,
            outputs=result,
        ).then(  # Generate the new image
            fn=infer,
            inputs=generation_inputs,
            outputs=result,
        ).success(  # Update the history gallery, only if generation succeeded
            fn=add_result_to_history,
            inputs=[result, history_gallery],
            outputs=history_gallery,
        ).success(  # Show the "Use as Input Image" button
            fn=show_use_as_input_button,
            inputs=None,
            outputs=use_as_input_button,
        )

    preview_button.click(
        fn=preview_image_and_mask,
        inputs=[input_image, width_slider, height_slider, overlap_percentage, resize_option, custom_resize_percentage, alignment_dropdown,
                overlap_left, overlap_right, overlap_top, overlap_bottom],
        outputs=preview_image,
        queue=False
    )
demo.queue(max_size=12).launch(share=False, show_error=True)