Spaces:

prithivMLmods
/

core-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Feb 10

Commit

9183b07

verified ·

1 Parent(s): 66a0bd6

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -7

app.py CHANGED Viewed

@@ -23,6 +23,11 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 DESCRIPTION = """
 # QwQ Edge 💬
@@ -45,6 +50,7 @@ h1 {
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -116,8 +122,6 @@ if USE_TORCH_COMPILE:
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
-MAX_SEED = np.iinfo(np.int32).max
 def save_image(img: Image.Image) -> str:
     """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
@@ -178,6 +182,57 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -189,16 +244,41 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for multimodal input, TTS, and image generation.
     Special commands:
       - "@tts1" or "@tts2": triggers text-to-speech.
       - "@image": triggers image generation using the SDXL pipeline.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
     if text.strip().lower().startswith("@image"):
-        # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
         yield "Generating image..."
         image_paths, used_seed = generate_image_fn(
@@ -214,10 +294,12 @@ def generate(
             use_resolution_binning=True,
             num_images=1,
         )
-        # Yield the generated image so that the chat interface displays it.
         yield gr.Image(image_paths[0])
         return  # Exit early
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -263,7 +345,6 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -313,7 +394,7 @@ demo = gr.ChatInterface(
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
     ],
     cache_examples=False,
     type="messages",

 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
+# Additional imports for 3D model generation
+import tempfile
+import trimesh
+from diffusers import ShapEImg2ImgPipeline, ShapEPipeline
+from diffusers.utils import export_to_ply
 DESCRIPTION = """
 # QwQ Edge 💬
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+MAX_SEED = np.iinfo(np.int32).max
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 if ENABLE_CPU_OFFLOAD:
     sd_pipe.enable_model_cpu_offload()
 def save_image(img: Image.Image) -> str:
     """Save a PIL image with a unique filename and return the path."""
     unique_name = str(uuid.uuid4()) + ".png"
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
+# ============================================================
+# 3D Model Generation using ShapE (Text-to-3D / Image-to-3D)
+# ============================================================
+class Model3D:
+    """Text-to-3D and image-to-3D generation backed by the Shap-E pipelines.
+
+    Both pipelines are loaded in float16 and moved to CUDA when it is
+    available, otherwise to the CPU. ``run_text`` and ``run_image`` return the
+    path of a ``.glb`` file that is created with ``delete=False``, so removing
+    it is the caller's responsibility.
+    """
+
+    def __init__(self):
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # NOTE(review): float16 is requested even on the CPU fallback; confirm
+        # the pipelines actually run in half precision without a GPU.
+        self.pipe = ShapEPipeline.from_pretrained("openai/shap-e", torch_dtype=torch.float16)
+        self.pipe.to(self.device)
+        self.pipe_img = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img", torch_dtype=torch.float16)
+        self.pipe_img.to(self.device)
+
+    def to_glb(self, ply_path: str) -> str:
+        """Convert the mesh stored at *ply_path* to GLB and return the new path.
+
+        The mesh is rotated by -90 degrees about the X axis and then by 180
+        degrees about the Y axis before it is exported. The input file is left
+        in place.
+        """
+        mesh = trimesh.load(ply_path)
+        rot = trimesh.transformations.rotation_matrix(-np.pi / 2, [1, 0, 0])
+        mesh = mesh.apply_transform(rot)
+        rot = trimesh.transformations.rotation_matrix(np.pi, [0, 1, 0])
+        mesh = mesh.apply_transform(rot)
+        # Reserve a unique file name, then close the handle immediately so the
+        # descriptor is not leaked and the exporter can reopen the path itself.
+        with tempfile.NamedTemporaryFile(suffix=".glb", delete=False) as glb_file:
+            glb_path = glb_file.name
+        mesh.export(glb_path, file_type="glb")
+        return glb_path
+
+    def _mesh_to_glb(self, mesh) -> str:
+        """Write a pipeline mesh output to a scratch PLY, convert it, return the GLB path."""
+        with tempfile.NamedTemporaryFile(suffix=".ply", delete=False) as ply_file:
+            ply_path = ply_file.name
+        try:
+            export_to_ply(mesh, ply_path)
+            return self.to_glb(ply_path)
+        finally:
+            # The PLY is only an intermediate artifact; remove it so repeated
+            # requests do not fill the temp directory. Cleanup is best-effort
+            # and must not mask an error raised by the conversion itself.
+            try:
+                os.remove(ply_path)
+            except OSError:
+                pass
+
+    def run_text(self, prompt: str, seed: int = 0, guidance_scale: float = 15.0, num_steps: int = 64) -> str:
+        """Generate a mesh from *prompt* with the text pipeline and return a GLB path.
+
+        ``seed`` seeds the torch generator; ``guidance_scale`` and
+        ``num_steps`` are forwarded to the pipeline unchanged.
+        """
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        images = self.pipe(
+            prompt,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_steps,
+            output_type="mesh",
+        ).images
+        return self._mesh_to_glb(images[0])
+
+    def run_image(self, image: Image.Image, seed: int = 0, guidance_scale: float = 3.0, num_steps: int = 64) -> str:
+        """Generate a mesh from *image* with the image pipeline and return a GLB path.
+
+        ``seed`` seeds the torch generator; ``guidance_scale`` and
+        ``num_steps`` are forwarded to the pipeline unchanged.
+        """
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        images = self.pipe_img(
+            image,
+            generator=generator,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_steps,
+            output_type="mesh",
+        ).images
+        return self._mesh_to_glb(images[0])
+# Module-level singleton shared by all requests; constructing it loads both Shap-E pipelines.
+model_3d = Model3D()
 @spaces.GPU
 def generate(
     input_dict: dict,
     repetition_penalty: float = 1.2,
 ):
     """
+    Generates chatbot responses with support for multimodal input, TTS, image generation,
+    and 3D model generation.
     Special commands:
       - "@tts1" or "@tts2": triggers text-to-speech.
       - "@image": triggers image generation using the SDXL pipeline.
+      - "@3d": triggers 3D model generation using the ShapE pipeline.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    # ------------------------------
+    # 3D Model Generation Command
+    # ------------------------------
+    if text.strip().lower().startswith("@3d"):
+        # Remove the "@3d" tag and use the remaining text as the prompt.
+        text = text[len("@3d"):].strip()
+        yield "Generating 3D model..."
+        seed = random.randint(0, MAX_SEED)
+        if files:
+            # If an image is provided, use image-to-3D.
+            image = load_image(files[0])
+            glb_file = model_3d.run_image(image, seed=seed)
+        else:
+            # Otherwise, generate a 3D model from the text prompt.
+            glb_file = model_3d.run_text(text, seed=seed)
+        # Yield the generated GLB file as a downloadable file.
+        yield gr.File(glb_file)
+        return
+    # ------------------------------
+    # Image Generation Command
+    # ------------------------------
     if text.strip().lower().startswith("@image"):
+        # Remove the "@image" tag and use the rest as prompt.
         prompt = text[len("@image"):].strip()
         yield "Generating image..."
         image_paths, used_seed = generate_image_fn(
             use_resolution_binning=True,
             num_images=1,
         )
         yield gr.Image(image_paths[0])
         return  # Exit early
+    # ------------------------------
+    # TTS / Regular Text Generation
+    # ------------------------------
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
             time.sleep(0.01)
             yield buffer
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         ["@image Chocolate dripping from a donut against a yellow background, in the style of brocore, hyper-realistic"],
         ["Write a Python function to check if a number is prime."],
         ["@tts2 What causes rainbows to form?"],
+        ["@3d A futuristic spaceship in low-poly style"],
     ],
     cache_examples=False,
     type="messages",