Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -4,15 +4,15 @@ from transformers import AutoConfig, AutoModelForCausalLM
 from janus.models import MultiModalityCausalLM, VLChatProcessor
 from janus.utils.io import load_pil_images
 from PIL import Image
-
 import numpy as np
 import os
 import time
 from Upsample import RealESRGAN
 import spaces  # Import spaces for ZeroGPU compatibility
 
-
+# ---------------------------
 # Load model and processor
+# ---------------------------
 model_path = "deepseek-ai/Janus-Pro-7B"
 config = AutoConfig.from_pretrained(model_path)
 language_config = config.language_config
@@ -29,22 +29,25 @@ vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer
 cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
-# SR model
+# SR (Super Resolution) model
 sr_model = RealESRGAN(torch.device('cuda' if torch.cuda.is_available() else 'cpu'), scale=2)
 sr_model.load_weights(f'weights/RealESRGAN_x2.pth', download=False)
 
+# ---------------------------
+# Multimodal Understanding Function
+# ---------------------------
 @torch.inference_mode()
-@spaces.GPU(duration=120)
-# Multimodal Understanding function
+@spaces.GPU(duration=120)
 def multimodal_understanding(image, question, seed, top_p, temperature, progress=gr.Progress(track_tqdm=True)):
     # Clear CUDA cache before generating
     torch.cuda.empty_cache()
-
-    #
+
+    # Set seed for reproducibility
     torch.manual_seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)
 
+    # Prepare conversation – note the use of a placeholder for the image.
     conversation = [
         {
             "role": "<|User|>",
@@ -54,12 +57,12 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
         {"role": "<|Assistant|>", "content": ""},
     ]
 
-
+    # The chat processor expects PIL images.
+    pil_images = [Image.fromarray(np.array(image))] if not isinstance(image, Image.Image) else [image]
     prepare_inputs = vl_chat_processor(
         conversations=conversation, images=pil_images, force_batchify=True
     ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
 
-
     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
 
     outputs = vl_gpt.language_model.generate(
@@ -78,7 +81,9 @@ def multimodal_understanding(image, question, seed, top_p, temperature, progress
     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
     return answer
 
-
+# ---------------------------
+# Image Generation Functions
+# ---------------------------
 def generate(input_ids,
              width,
             height,
@@ -88,7 +93,6 @@ def generate(input_ids,
             image_token_num_per_image: int = 576,
             patch_size: int = 16,
             progress=gr.Progress(track_tqdm=True)):
-    # Clear CUDA cache before generating
     torch.cuda.empty_cache()
 
     tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
@@ -103,8 +107,8 @@
     for i in range(image_token_num_per_image):
         with torch.no_grad():
             outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
-
-
+                                                  use_cache=True,
+                                                  past_key_values=pkv)
             pkv = outputs.past_key_values
             hidden_states = outputs.last_hidden_state
             logits = vl_gpt.gen_head(hidden_states[:, -1, :])
@@ -118,35 +122,26 @@
 
             img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
             inputs_embeds = img_embeds.unsqueeze(dim=1)
-
 
-
     patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
-
-
+                                                  shape=[parallel_size, 8, width // patch_size, height // patch_size])
     return generated_tokens.to(dtype=torch.int), patches
 
 def unpack(dec, width, height, parallel_size=5):
     dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
     dec = np.clip((dec + 1) / 2 * 255, 0, 255)
-
     visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
     visual_img[:, :, :] = dec
-
     return visual_img
 
-
-
 @torch.inference_mode()
-@spaces.GPU(duration=120)
+@spaces.GPU(duration=120)
 def generate_image(prompt,
                    seed=None,
                    guidance=5,
                    t2i_temperature=1.0,
                    progress=gr.Progress(track_tqdm=True)):
-    # Clear CUDA cache and avoid tracking gradients
    torch.cuda.empty_cache()
-    # Set the seed for reproducible results
    if seed is not None:
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
@@ -154,13 +149,13 @@ def generate_image(prompt,
     width = 384
     height = 384
     parallel_size = 4
-
+
     with torch.no_grad():
         messages = [{'role': '<|User|>', 'content': prompt},
                     {'role': '<|Assistant|>', 'content': ''}]
         text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
-
-
+                                                                           sft_format=vl_chat_processor.sft_format,
+                                                                           system_prompt='')
         text = text + vl_chat_processor.image_start_tag
 
         input_ids = torch.LongTensor(tokenizer.encode(text))
@@ -174,13 +169,11 @@
                         width // 16 * 16,
                         height // 16 * 16,
                         parallel_size=parallel_size)
-
-        # return [Image.fromarray(images[i]).resize((768, 768), Image.LANCZOS) for i in range(parallel_size)]
+        # Upsample the generated images
         stime = time.time()
         ret_images = [image_upsample(Image.fromarray(images[i])) for i in range(parallel_size)]
         print(f'upsample time: {time.time() - stime}')
-        return ret_images
-
+        return ret_images  # returns a list
 
 @spaces.GPU(duration=60)
 def image_upsample(img: Image.Image) -> Image.Image:
@@ -188,87 +181,82 @@ def image_upsample(img: Image.Image) -> Image.Image:
         raise Exception("Image not uploaded")
 
     width, height = img.size
-
     if width >= 5000 or height >= 5000:
         raise Exception("The image is too large.")
 
     global sr_model
     result = sr_model.predict(img.convert('RGB'))
     return result
-
 
-#
+# A helper function to generate a single image (the first result) from a description.
+def generate_single_image(prompt, seed, guidance, t2i_temperature):
+    images = generate_image(prompt, seed, guidance, t2i_temperature)
+    # Return the first image (if available)
+    return images[0] if images else None
+
+# ---------------------------
+# Chat About Generated Image
+# ---------------------------
+# This function uses the generated image and a chat question.
+def chat_about_image(generated_image, chat_text, seed, top_p, temperature, chat_history):
+    if generated_image is None:
+        return chat_history, "Please generate an image first by entering a description above."
+    response = multimodal_understanding(generated_image, chat_text, seed, top_p, temperature)
+    chat_history.append((chat_text, response))
+    return chat_history, ""
+
+# ---------------------------
+# Gradio Interface
+# ---------------------------
 css = '''
 .gradio-container {max-width: 960px !important}
 '''
-with gr.Blocks(css=css) as demo:
-    gr.Markdown("# Janus Pro 7B")
-    with gr.Tab("Multimodal Understanding"):
-        gr.Markdown(value="## Multimodal Understanding")
-        image_input = gr.Image()
-        with gr.Column():
-            question_input = gr.Textbox(label="Question")
-
-            understanding_button = gr.Button("Chat")
-            understanding_output = gr.Textbox(label="Response")
-
-        with gr.Accordion("Advanced options", open=False):
-            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
-            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
-            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
-
-        examples_inpainting = gr.Examples(
-            label="Multimodal Understanding examples",
-            examples=[
-                [
-                    "explain this meme",
-                    "doge.png",
-                ],
-                [
-                    "Convert the formula into latex code.",
-                    "equation.png",
-                ],
-            ],
-            inputs=[question_input, image_input],
-        )
-
-    with gr.Tab("Text-to-Image Generation"):
-        gr.Markdown(value="## Text-to-Image Generation")
 
-
-
-
+with gr.Blocks(css=css, title="Janus Pro 7B – Image Generation and Chat") as demo:
+    gr.Markdown("# Janus Pro 7B: Image Generation and Conversation")
+    gr.Markdown("Enter an image description below to have the model generate an image. Once generated, you can chat about the image and ask questions.")
 
-
+    # States to store the generated image and the chat history.
+    state_image = gr.State(None)
+    state_history = gr.State([])
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### Step 1. Generate an Image from Description")
+            description_input = gr.Textbox(label="Image Description", placeholder="Describe the image you want...")
+            with gr.Accordion("Advanced Generation Options", open=False):
+                gen_seed_input = gr.Number(label="Seed", precision=0, value=42)
+                guidance_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
+                t2i_temperature_input = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="Temperature")
+            generate_button = gr.Button("Generate Image")
+            image_output = gr.Image(label="Generated Image", interactive=False)
+        with gr.Column():
+            gr.Markdown("### Step 2. Chat about the Image")
+            gr.Markdown("Ask questions or discuss the generated image below. (If no image has been generated yet, please do so in Step 1.)")
+            with gr.Accordion("Advanced Chat Options", open=False):
+                chat_seed_input = gr.Number(label="Seed", precision=0, value=42)
+                top_p_input = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
+                chat_temperature_input = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="Temperature")
+            chatbox = gr.Chatbot(label="Conversation")
+            chat_input = gr.Textbox(label="Your Message", placeholder="Enter your question or comment here...")
+            send_button = gr.Button("Send")
 
-
-
-
-
+    # When the user clicks the "Generate Image" button:
+    generate_button.click(
+        fn=generate_single_image,
+        inputs=[description_input, gen_seed_input, guidance_input, t2i_temperature_input],
+        outputs=image_output
+    ).then(
+        fn=lambda img: img,  # pass through the generated image
+        inputs=image_output,
+        outputs=state_image
     )
 
-
-
-
-
+    # When the user sends a chat message, update the conversation.
+    send_button.click(
+        fn=chat_about_image,
+        inputs=[state_image, chat_input, chat_seed_input, top_p_input, chat_temperature_input, state_history],
+        outputs=[chatbox, chat_input],
     )
 
 demo.launch(share=True)
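
The core UI pattern this commit introduces is a generate-then-chat flow: the generated image is copied into a gr.State through a chained .click().then(), and the chat handler reads that state alongside a gr.Chatbot history. The sketch below is a minimal, self-contained illustration of that wiring only, not part of the Space itself: the Janus calls are replaced by stub functions (fake_generate_image, fake_chat_about_image are hypothetical stand-ins), so it runs without the model or a GPU.

# Minimal sketch of the gr.State + .click().then() + gr.Chatbot wiring (assumed stubs).
import gradio as gr
from PIL import Image

def fake_generate_image(prompt, seed):
    # Stand-in for generate_single_image(): returns a solid placeholder image.
    return Image.new("RGB", (384, 384), (200, 200, 200))

def fake_chat_about_image(image, message, history):
    # Stand-in for chat_about_image(): guard on missing image, then append a turn.
    if image is None:
        return history, "Please generate an image first."
    history.append((message, f"(stub answer about the image for: {message})"))
    return history, ""

with gr.Blocks() as demo:
    state_image = gr.State(None)    # carries the generated image between steps
    state_history = gr.State([])    # carries the running chat history

    prompt = gr.Textbox(label="Image Description")
    seed = gr.Number(label="Seed", precision=0, value=42)
    generate_btn = gr.Button("Generate Image")
    image_out = gr.Image(label="Generated Image", interactive=False)

    chatbox = gr.Chatbot(label="Conversation")
    chat_msg = gr.Textbox(label="Your Message")
    send_btn = gr.Button("Send")

    # Same pattern as the commit: generate first, then copy the result into state.
    generate_btn.click(fn=fake_generate_image, inputs=[prompt, seed], outputs=image_out).then(
        fn=lambda img: img, inputs=image_out, outputs=state_image
    )
    # The chat handler reads the stored image and history from state.
    send_btn.click(
        fn=fake_chat_about_image,
        inputs=[state_image, chat_msg, state_history],
        outputs=[chatbox, chat_msg],
    )

if __name__ == "__main__":
    demo.launch()

As in the commit's chat_about_image, the history list held in gr.State is mutated in place and only the Chatbot is listed as an output, so the conversation persists across turns without updating the state component explicitly.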