
MISHANM/deepseek-ai_janus-Pro-7B-fp16

The MISHANM/deepseek-ai_janus-Pro-7B-fp16 model is a multimodal understanding and image generation model. It performs image-to-text understanding and generates high-quality images from textual prompts.

Model Details

  1. Language: English
  2. Tasks: Image to Text & Text to Image Generation

Model Example Output

Example model inference outputs:

[output image]

[output image]

How to Get Started with the Model

git clone https://github.com/deepseek-ai/Janus.git
cd Janus
pip install -e .
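
After installing the Janus package, a quick sanity check (a minimal sketch; it assumes a CUDA-capable GPU, which the fp16 checkpoint effectively requires) confirms that the dependencies import correctly:

import torch
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Verify that the Janus classes are importable and that a GPU is visible.
print("CUDA available:", torch.cuda.is_available())
print("Loaded:", MultiModalityCausalLM.__name__, VLChatProcessor.__name__)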

Use the code below to get started with the model.

Multimodal Understanding (Image to Text)

Using Gradio

import gradio as gr
import torch
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images
import base64
from io import BytesIO

def pil_image_to_base64(pil_image):
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"

# Initialize the processor and model
model_path = "MISHANM/deepseek-ai_janus-Pro-7B-fp16"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

def multimodal_understanding(image, question):
    # Convert PIL Image to base64 string
    image_base64 = pil_image_to_base64(image)

    # Prepare the conversation
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": [image_base64], 
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Load images and prepare inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Run image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # Run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer

# Gradio interface
interface = gr.Interface(
    fn=multimodal_understanding,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, placeholder="Enter your question here...")],
    outputs="text",
    title="Multimodal Understanding ",
    description="Upload an image and ask a question about it."
)

interface.launch(share=True)
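
The same function can also be called directly without launching the Gradio app. A minimal sketch (the file name example.png is an illustrative placeholder for any local image):

from PIL import Image

# Run a single image/question pair through the model programmatically.
img = Image.open("example.png")  # placeholder path to a local image
print(multimodal_understanding(img, "Describe this image in one sentence."))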

Text to Image Generation

import os
import gradio as gr
import PIL.Image
import torch
import numpy as np
from transformers import AutoModelForCausalLM
from janus.models import MultiModalityCausalLM, VLChatProcessor

# Initialize the processor and model
model_path = "MISHANM/deepseek-ai_janus-Pro-7B-fp16"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()

@torch.inference_mode()
def generate_image(prompt_text, parallel_size=1):
    # Prepare the conversation
    conversation = [
        {
            "role": "<|User|>",
            "content": prompt_text,
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
        conversations=conversation,
        sft_format=vl_chat_processor.sft_format,
        system_prompt="",
    )
    prompt = sft_format + vl_chat_processor.image_start_tag

    input_ids = vl_chat_processor.tokenizer.encode(prompt)
    input_ids = torch.LongTensor(input_ids)

    # Duplicate the prompt into conditional/unconditional pairs for classifier-free guidance:
    # even rows keep the full prompt, odd rows are padded to act as the unconditional branch.
    tokens = torch.zeros((parallel_size*2, len(input_ids)), dtype=torch.int).cuda()
    for i in range(parallel_size*2):
        tokens[i, :] = input_ids
        if i % 2 != 0:
            tokens[i, 1:-1] = vl_chat_processor.pad_id

    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)

    # Each image is generated as 576 discrete tokens and decoded to a 384x384 image.
    image_token_num_per_image = 576
    img_size = 384
    patch_size = 16
    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cuda()

    # Autoregressively sample image tokens.
    for i in range(image_token_num_per_image):
        outputs = vl_gpt.language_model.model(
            inputs_embeds=inputs_embeds,
            use_cache=True,
            past_key_values=outputs.past_key_values if i != 0 else None,
        )
        hidden_states = outputs.last_hidden_state

        logits = vl_gpt.gen_head(hidden_states[:, -1, :])
        logit_cond = logits[0::2, :]
        logit_uncond = logits[1::2, :]

        # Classifier-free guidance: combine conditional and unconditional logits.
        cfg_weight = 5
        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
        probs = torch.softmax(logits, dim=-1)

        next_token = torch.multinomial(probs, num_samples=1)
        generated_tokens[:, i] = next_token.squeeze(dim=-1)

        # Feed the sampled token back in for both branches of the next step.
        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
        img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
        inputs_embeds = img_embeds.unsqueeze(dim=1)

    # Decode the generated tokens into pixels with the vision decoder.
    dec = vl_gpt.gen_vision_model.decode_code(
        generated_tokens.to(dtype=torch.int),
        shape=[parallel_size, 8, img_size // patch_size, img_size // patch_size],
    )
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)

    # Rescale from [-1, 1] to [0, 255].
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)

    visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec

    return PIL.Image.fromarray(visual_img[0])

# Create Gradio interface
interface = gr.Interface(
    fn=generate_image,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="image",
    title="Text-to-Image Generation",
    description="Enter a text prompt to generate an image."
)

interface.launch(share=True)
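
For scripted use, generate_image can likewise be called directly and the result saved to disk (a minimal sketch; the prompt and output file name are illustrative):

# Generate one image for a prompt and save it without the Gradio UI.
image = generate_image("A watercolor painting of a lighthouse at sunset")
image.save("janus_output.png")  # illustrative output path
print("Saved image with size:", image.size)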


Uses

Direct Use

The model is intended for image-to-text understanding and text-to-image generation from textual descriptions. It can be used in creative applications, content generation, and artistic exploration.

Out-of-Scope Use

The model is not suitable for generating images with explicit or harmful content. It may not perform well with highly abstract or nonsensical prompts.

Bias, Risks, and Limitations

The model may reflect biases present in the training data. It may generate stereotypical or biased images based on the input prompts.

Recommendations

Users should be aware of potential biases and limitations. It is recommended to review generated content for appropriateness and accuracy.

Citation Information

@misc{MISHANM/deepseek-ai_janus-Pro-7B-fp16,
  author    = {Mishan Maurya},
  title     = {Introducing Image to Text & Text to Image Generation model},
  year      = {2025},
  publisher = {Hugging Face},
  journal   = {Hugging Face repository}
}