MISHANM/deepseek-ai_janus-Pro-7B-fp16
The MISHANM/deepseek-ai_janus-Pro-7B-fp16 model is a multimodal understanding and image generation model. It is designed both to describe images in text (image-to-text) and to generate high-quality images from textual prompts (text-to-image).
Model Details
- Language: English
- Tasks: Image to Text & Text to Image Generation
How to Get Started with the Model
```bash
git clone https://github.com/deepseek-ai/Janus.git
cd Janus
pip install -e .
```
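Both example scripts below load the model in bfloat16 and move it to a CUDA device, so a GPU with enough memory for a 7B-parameter model is assumed. A quick sanity check before running them:

```python
import torch

# The examples below call .cuda(), so confirm a GPU is visible first
assert torch.cuda.is_available(), "A CUDA-capable GPU is required for these examples"
print(torch.cuda.get_device_name(0))
```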
Use the code below to get started with the model.
Multimodal Understanding (Image to Text)
Using Gradio
```python
import base64
from io import BytesIO

import gradio as gr
import torch
from transformers import AutoModelForCausalLM

from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images


def pil_image_to_base64(pil_image):
    # Encode a PIL image as a base64 data URI so load_pil_images can read it
    buffered = BytesIO()
    pil_image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{img_str}"


# Initialize the processor and model
model_path = "MISHANM/deepseek-ai_janus-Pro-7B-fp16"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()


def multimodal_understanding(image, question):
    # Convert the uploaded PIL image to a base64 string
    image_base64 = pil_image_to_base64(image)

    # Prepare the conversation
    conversation = [
        {
            "role": "<|User|>",
            "content": f"<image_placeholder>\n{question}",
            "images": [image_base64],
        },
        {"role": "<|Assistant|>", "content": ""},
    ]

    # Load images and prepare inputs
    pil_images = load_pil_images(conversation)
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
    ).to(vl_gpt.device)

    # Run the image encoder to get the image embeddings
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    # Run the model to get the response
    outputs = vl_gpt.language_model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=prepare_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=512,
        do_sample=False,
        use_cache=True,
    )

    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
    return answer


# Gradio interface
interface = gr.Interface(
    fn=multimodal_understanding,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, placeholder="Enter your question here...")],
    outputs="text",
    title="Multimodal Understanding",
    description="Upload an image and ask a question about it.",
)
interface.launch(share=True)
```
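You can also call `multimodal_understanding` directly instead of going through the Gradio UI. A minimal sketch, assuming a local image file `example.jpg` (an illustrative name):

```python
from PIL import Image

# Hypothetical local image; replace with your own file
image = Image.open("example.jpg")
print(multimodal_understanding(image, "Describe this image in one sentence."))
```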
Text to Image Generation
```python
import gradio as gr
import numpy as np
import PIL.Image
import torch
from transformers import AutoModelForCausalLM

from janus.models import MultiModalityCausalLM, VLChatProcessor

# Initialize the processor and model
model_path = "MISHANM/deepseek-ai_janus-Pro-7B-fp16"
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True
)
vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()


@torch.inference_mode()
def generate_image(prompt_text, parallel_size=1):
    # Prepare the conversation
    conversation = [
        {"role": "<|User|>", "content": prompt_text},
        {"role": "<|Assistant|>", "content": ""},
    ]

    sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
        conversations=conversation,
        sft_format=vl_chat_processor.sft_format,
        system_prompt="",
    )
    prompt = sft_format + vl_chat_processor.image_start_tag

    input_ids = vl_chat_processor.tokenizer.encode(prompt)
    input_ids = torch.LongTensor(input_ids)

    # Duplicate the prompt: even rows hold the conditional prompt, odd rows
    # hold the padded unconditional prompt for classifier-free guidance
    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).cuda()
    for i in range(parallel_size * 2):
        tokens[i, :] = input_ids
        if i % 2 != 0:
            tokens[i, 1:-1] = vl_chat_processor.pad_id

    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)

    image_token_num_per_image = 576
    img_size = 384
    patch_size = 16
    cfg_weight = 5

    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).cuda()

    # Autoregressively sample image tokens with classifier-free guidance
    for i in range(image_token_num_per_image):
        outputs = vl_gpt.language_model.model(
            inputs_embeds=inputs_embeds,
            use_cache=True,
            past_key_values=outputs.past_key_values if i != 0 else None,
        )
        hidden_states = outputs.last_hidden_state
        logits = vl_gpt.gen_head(hidden_states[:, -1, :])
        logit_cond = logits[0::2, :]
        logit_uncond = logits[1::2, :]
        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)

        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated_tokens[:, i] = next_token.squeeze(dim=-1)

        # Feed the sampled token back in for both the conditional and
        # unconditional streams
        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
        img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
        inputs_embeds = img_embeds.unsqueeze(dim=1)

    # Decode the generated image tokens back to pixels
    dec = vl_gpt.gen_vision_model.decode_code(
        generated_tokens.to(dtype=torch.int),
        shape=[parallel_size, 8, img_size // patch_size, img_size // patch_size],
    )
    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)

    visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec
    return PIL.Image.fromarray(visual_img[0])


# Create Gradio interface
interface = gr.Interface(
    fn=generate_image,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="image",
    title="Text-to-Image Generation",
    description="Enter a text prompt to generate an image.",
)
interface.launch(share=True)
```
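Likewise, `generate_image` can be called without the interface. A short sketch (the prompt and output filename are illustrative) that saves the result to disk:

```python
# Illustrative prompt and output path
image = generate_image("A watercolor painting of a lighthouse at sunset")
image.save("generated_image.png")
```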
Uses
Direct Use
The model is intended for image-to-text understanding and text-to-image generation. It can be used in creative applications, content generation, and artistic exploration.
Out-of-Scope Use
The model is not suitable for generating images with explicit or harmful content. It may not perform well with highly abstract or nonsensical prompts.
Bias, Risks, and Limitations
The model may reflect biases present in the training data. It may generate stereotypical or biased images based on the input prompts.
Recommendations
Users should be aware of potential biases and limitations. It is recommended to review generated content for appropriateness and accuracy.
Citation Information
```bibtex
@misc{mishanm2025janusprofp16,
  author    = {Mishan Maurya},
  title     = {Introducing Image to Text \& Text to Image Generation model},
  year      = {2025},
  publisher = {Hugging Face},
  journal   = {Hugging Face repository},
}
```
Base Model
- deepseek-ai/Janus-Pro-7B