Spaces:

prithivMLmods
/

core-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Feb 8

Commit

a01646a

verified ·

1 Parent(s): c9c7955

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -17

app.py CHANGED Viewed

@@ -1,14 +1,31 @@
 import os
 from threading import Thread
 import gradio as gr
 import spaces
 import torch
 import edge_tts
-import asyncio
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 from transformers.image_utils import load_image
-import time
 DESCRIPTION = """
 # QwQ Edge 💬
@@ -44,6 +61,7 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
@@ -75,6 +93,93 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -86,20 +191,41 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input and TTS.
-    If the query starts with an @tts command (e.g. "@tts1"), previous chat history is cleared.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # Process image files if provided
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -107,16 +233,25 @@ def generate(
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear any previous chat history to avoid concatenation issues
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
-    if images:
-        # Multimodal branch using the OCR model
         messages = [{
             "role": "user",
             "content": [
@@ -139,7 +274,9 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
-        # Text-only branch using the text model
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -168,10 +305,15 @@ def generate(
         final_response = "".join(outputs)
         yield final_response
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
@@ -188,6 +330,7 @@ demo = gr.ChatInterface(
         ["A train travels 60 kilometers per hour. If it travels for 5 hours, how far will it travel in total?"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",

 import os
+import random
+import uuid
+import json
+import time
+import asyncio
 from threading import Thread
 import gradio as gr
 import spaces
 import torch
+import numpy as np
+from PIL import Image
 import edge_tts
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+    Qwen2VLForConditionalGeneration,
+    AutoProcessor,
+)
 from transformers.image_utils import load_image
+from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+# ============================================
+#            CHAT & TTS SETUP
+# ============================================
 DESCRIPTION = """
 # QwQ Edge 💬
 )
 model.eval()
+# TTS voices
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
             cleaned.append(msg)
     return cleaned
+# ============================================
+#            IMAGE GENERATION SETUP
+# ============================================
+# Environment variables and parameters for Stable Diffusion XL
+# MODEL_VAL_PATH must name the SDXL model repo (or local path) to load.
+MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")
+if not MODEL_ID_SD:
+    # Fail fast with an actionable message rather than an opaque error
+    # raised from inside from_pretrained() when it is handed None.
+    raise RuntimeError(
+        "MODEL_VAL_PATH environment variable must point to an SDXL model repo or path."
+    )
+MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
+USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
+ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
+BATCH_SIZE = int(os.getenv("BATCH_SIZE", "1"))  # For potential batched image generation
+# Load the SDXL pipeline (float16 only when CUDA is available, float32 otherwise)
+sd_pipe = StableDiffusionXLPipeline.from_pretrained(
+    MODEL_ID_SD,
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    use_safetensors=True,
+    add_watermarker=False,
+).to(device)
+# Swap the default scheduler for Euler-Ancestral, reusing the loaded scheduler's config
+sd_pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(sd_pipe.scheduler.config)
+# Optional: compile the model for speedup.
+# The pipeline object has no compile() method; the UNet is the module that
+# runs on every denoising step, so that is what gets compiled.
+if USE_TORCH_COMPILE:
+    sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True)
+# Optional: offload parts of the model to CPU if needed
+if ENABLE_CPU_OFFLOAD:
+    sd_pipe.enable_model_cpu_offload()
+# Largest seed value handed out by randomize_seed_fn (max signed 32-bit integer)
+MAX_SEED = np.iinfo(np.int32).max
+def save_image(img: Image.Image) -> str:
+    """Write *img* to a freshly named PNG file and return that filename.
+
+    The name is a random UUID4 plus ".png" with no directory component, so
+    the file lands in the process's current working directory.
+    """
+    filename = f"{uuid.uuid4()}.png"
+    img.save(filename)
+    return filename
+def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+    """Return the seed to use for a generation request.
+
+    When *randomize_seed* is truthy a new value is drawn uniformly from
+    [0, MAX_SEED] (both ends inclusive); otherwise *seed* is returned as given.
+    """
+    if not randomize_seed:
+        return seed
+    return random.randint(0, MAX_SEED)
+@spaces.GPU(duration=60, enable_queue=True)
+def generate_image_fn(
+    prompt: str,
+    negative_prompt: str = "",
+    use_negative_prompt: bool = False,
+    seed: int = 1,
+    width: int = 1024,
+    height: int = 1024,
+    guidance_scale: float = 3,
+    num_inference_steps: int = 25,
+    randomize_seed: bool = False,
+    use_resolution_binning: bool = True,
+    num_images: int = 1,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """Render *num_images* images for *prompt* with the SDXL pipeline.
+
+    The prompt (and the negative prompt, when *use_negative_prompt* is set)
+    is repeated once per requested image and fed to ``sd_pipe`` in slices of
+    ``BATCH_SIZE``. Every resulting image is written to disk via
+    ``save_image``.
+
+    Returns:
+        A ``(image_paths, seed)`` tuple: the saved file paths in generation
+        order, and the seed actually used (re-drawn when *randomize_seed*
+        is set).
+    """
+    seed = int(randomize_seed_fn(seed, randomize_seed))
+    rng = torch.Generator(device=device).manual_seed(seed)
+
+    # One entry per requested image; sliced per batch below.
+    all_prompts = [prompt] * num_images
+    all_negatives = [negative_prompt] * num_images if use_negative_prompt else None
+
+    # Keyword arguments that are identical for every batch. The same
+    # generator object is shared, so its state advances across batches.
+    shared_kwargs = {
+        "width": width,
+        "height": height,
+        "guidance_scale": guidance_scale,
+        "num_inference_steps": num_inference_steps,
+        "generator": rng,
+        "output_type": "pil",
+    }
+    if use_resolution_binning:
+        # NOTE(review): forwarded as-is; confirm the SDXL pipeline honours this flag.
+        shared_kwargs["use_resolution_binning"] = True
+
+    rendered = []
+    for start in range(0, num_images, BATCH_SIZE):
+        stop = start + BATCH_SIZE
+        if all_negatives is None:
+            batch_negatives = None
+        else:
+            batch_negatives = all_negatives[start:stop]
+        result = sd_pipe(
+            prompt=all_prompts[start:stop],
+            negative_prompt=batch_negatives,
+            **shared_kwargs,
+        )
+        rendered.extend(result.images)
+
+    # Persist only after every batch has finished generating.
+    image_paths = [save_image(picture) for picture in rendered]
+    return image_paths, seed
+# ============================================
+#       MAIN GENERATION FUNCTION (CHAT)
+# ============================================
 @spaces.GPU
 def generate(
     input_dict: dict,
     repetition_penalty: float = 1.2,
 ):
     """
+    Generates chatbot responses with support for multimodal input, TTS, and now image generation.
+    If the query starts with:
+      - "@tts1" or "@tts2", it triggers text-to-speech.
+      - "@image", it triggers image generation using the SDXL pipeline.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    # ----------------------------
+    #  NEW: IMAGE GENERATION BRANCH
+    # ----------------------------
+    if text.strip().lower().startswith("@image"):
+        # Remove the "@image" tag and use the rest as prompt
+        prompt = text[len("@image"):].strip()
+        yield "Generating image..."
+        image_paths, used_seed = generate_image_fn(
+            prompt=prompt,
+            negative_prompt="",
+            use_negative_prompt=False,
+            seed=1,
+            width=1024,
+            height=1024,
+            guidance_scale=3,
+            num_inference_steps=25,
+            randomize_seed=True,
+            use_resolution_binning=True,
+            num_images=1,
+        )
+        # Yield the generated image so that the chat interface displays it.
+        yield gr.Image(image_paths[0])
+        return  # Exit early
+    # ----------------------------
+    #  TTS Branch (if query starts with @tts)
+    # ----------------------------
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        # Clear previous chat history for a fresh TTS request.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
+        # Remove any stray @tts tags and build the conversation history.
         text = text.replace(tts_prefix, "").strip()
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
+    # ----------------------------
+    #  Multimodal (image + text) branch
+    # ----------------------------
+    if files:
+        if len(files) > 1:
+            images = [load_image(image) for image in files]
+        elif len(files) == 1:
+            images = [load_image(files[0])]
+        else:
+            images = []
         messages = [{
             "role": "user",
             "content": [
             time.sleep(0.01)
             yield buffer
     else:
+        # ----------------------------
+        #  Text-only branch
+        # ----------------------------
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         final_response = "".join(outputs)
         yield final_response
+        # If TTS was requested, convert the final response to speech.
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
+# ============================================
+#             GRADIO DEMO SETUP
+# ============================================
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
         ["A train travels 60 kilometers per hour. If it travels for 5 hours, how far will it travel in total?"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
+        ["@image A futuristic city skyline at dusk"],
     ],
     cache_examples=False,
     type="messages",