
MISHANM/deepseek-ai_janus-Pro-7B-fp16

The MISHANM/deepseek-ai_janus-Pro-7B-fp16 model is a multimodal understanding and image generation model. It performs image-to-text understanding and generates high-quality images from textual prompts.

Model Details

  1. Language: English
  2. Tasks: Image to Text & Text to Image Generation

Model Example Output

Example model inference outputs:

[output image]

[output image]

How to Get Started with the Model

git clone https://github.com/deepseek-ai/Janus.git
cd Janus
pip install -e .
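
After installing the Janus package, a quick sanity check (a minimal sketch; it assumes a CUDA-capable GPU, which the fp16 checkpoint effectively requires) confirms that the dependencies import correctly:

import torch
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Verify that the Janus classes are importable and that a GPU is visible.
print("CUDA available:", torch.cuda.is_available())
print("Loaded:", MultiModalityCausalLM.__name__, VLChatProcessor.__name__)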

Use the code below to get started with the model.

Multimodal Understanding (Image to Text)

Using Gradio

import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import base64
from io import BytesIO

def pil_image_to_base64(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

# Initialize the processor and model
model_path = "MISHANM/deepseek-ai_janus-Pro-7B-fp16"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

def multimodal_understanding(image, question):
    # Convert PIL Image to base64 string
    image_base64 = pil_image_to_base64(image)

    # Prepare the conversation
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": [image_base64], 
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Load images and prepare inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Run image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # Run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer

# Gradio interface
interface = gr.Interface(
    fn=multimodal_understanding,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, placeholder="Enter your question here...")],
    outputs="text",
    title="Multimodal Understanding ",
    description="Upload an image and ask a question about it."
)

interface.launch(share=True)
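
The same function can also be called directly without launching the Gradio app. A minimal sketch (the file name example.png is an illustrative placeholder for any local image):

from PIL import Image

# Run a single image/question pair through the model programmatically.
img = Image.open("example.png")  # placeholder path to a local image
print(multimodal_understanding(img, "Describe this image in one sentence."))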

Text to Image Generation

import os
import gradio as gr
import PIL.Image
import torch
import numpy as np
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Initialize the processor and model
model_path = "MISHANM/deepseek-ai_janus-Pro-7B-fp16"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

@torch.inference_mode()
def generate_image(prompt_text, parallel_size=1):
    # Prepare the conversation
    conversation = [
        {
            "role": "<|User|>",
            "content": prompt_text,
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
        conversations=conversation,
        sft_format=vl_chat_processor.sft_format,
        system_prompt="",
    )
    prompt = sft_format + vl_chat_processor.image_start_tag

    input_ids = vl_chat_processor.tokenizer.encode(prompt)
    input_ids = torch.LongTensor(input_ids)

    # Duplicate the prompt into conditional/unconditional pairs for classifier-free guidance:
    # even rows keep the full prompt, odd rows are padded to act as the unconditional branch.
    tokens = torch.zeros((parallel_size*2, len(input_ids)), dtype=torch.int).cuda()
    for i in range(parallel_size*2):
        tokens[i, :] = input_ids
        if i % 2 != 0:
            tokens[i, 1:-1] = vl_chat_processor.pad_id

    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)

    # Each image is generated as 576 discrete tokens and decoded to a 384x384 image.
    image_token_num_per_image = 576
    img_size = 384
    patch_size = 16
    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cuda()

    # Autoregressively sample image tokens.
    for i in range(image_token_num_per_image):
        outputs = vl_gpt.language_model.model(
            inputs_embeds=inputs_embeds,
            use_cache=True,
            past_key_values=outputs.past_key_values if i != 0 else None,
        )
        hidden_states = outputs.last_hidden_state

        logits = vl_gpt.gen_head(hidden_states[:, -1, :])
        logit_cond = logits[0::2, :]
        logit_uncond = logits[1::2, :]

        # Classifier-free guidance: combine conditional and unconditional logits.
        cfg_weight = 5
        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
        probs = torch.softmax(logits, dim=-1)

        next_token = torch.multinomial(probs, num_samples=1)
        generated_tokens[:, i] = next_token.squeeze(dim=-1)

        # Feed the sampled token back in for both branches of the next step.
        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
        img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
        inputs_embeds = img_embeds.unsqueeze(dim=1)

    # Decode the generated tokens into pixels with the vision decoder.
    dec = vl_gpt.gen_vision_model.decode_code(
        generated_tokens.to(dtype=torch.int),
        shape=[parallel_size, 8, img_size // patch_size, img_size // patch_size],
    )
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)

    # Rescale from [-1, 1] to [0, 255].
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)

    visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec

    return PIL.Image.fromarray(visual_img[0])

# Create Gradio interface
interface = gr.Interface(
    fn=generate_image,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="image",
    title="Text-to-Image Generation",
    description="Enter a text prompt to generate an image."
)

interface.launch(share=True)
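
For scripted use, generate_image can likewise be called directly and the result saved to disk (a minimal sketch; the prompt and output file name are illustrative):

# Generate one image for a prompt and save it without the Gradio UI.
image = generate_image("A watercolor painting of a lighthouse at sunset")
image.save("janus_output.png")  # illustrative output path
print("Saved image with size:", image.size)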


Uses

Direct Use

The model is intended for image-to-text understanding and text-to-image generation from textual descriptions. It can be used in creative applications, content generation, and artistic exploration.

Out-of-Scope Use

The model is not suitable for generating images with explicit or harmful content. It may not perform well with highly abstract or nonsensical prompts.

Bias, Risks, and Limitations

The model may reflect biases present in the training data. It may generate stereotypical or biased images based on the input prompts.

Recommendations

Users should be aware of potential biases and limitations. It is recommended to review generated content for appropriateness and accuracy.

Citation Information

@misc{MISHANM/deepseek-ai_janus-Pro-7B-fp16,
  author    = {Mishan Maurya},
  title     = {Introducing Image to Text & Text to Image Generation model},
  year      = {2025},
  publisher = {Hugging Face},
  journal   = {Hugging Face repository}
}