Spaces:

Ketengan-Diffusion-Lab
/

Dolphin-Inference

Build error

File size: 2,218 Bytes

789acc7
 
fd950ef
 
789acc7
fd950ef
 
 
 
 
 
 
4f9f0e6
 
fd950ef
 
 
4f9f0e6
fd950ef
 
 
4f9f0e6
 
 
 
fd950ef
 
4f9f0e6
 
fd950ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f9f0e6
f4d3338
4f9f0e6
 
f4d3338
4f9f0e6
f4d3338
 
 
 
fd950ef
 
789acc7
 
5ee7893
fd950ef
 
 
 
 
 
 
 
789acc7
fd950ef

import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

# Silence library chatter: transformers logging/progress bars and all Python
# warnings.
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')

# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Use half precision only on the GPU. On the CPU fallback keep float32:
# float16 on CPU is poorly supported (older PyTorch releases lack half
# kernels for some ops), so forcing it there can fail at generation time.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

model_name = 'cognitivecomputations/dolphin-vision-7b'

# Load the model and place it entirely on `device`. device_map='auto' is
# deliberately not used, so the weights are not spread across devices.
# trust_remote_code=True lets the repository's custom model code run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
    trust_remote_code=True,
).to(device)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

def inference(prompt, image):
    """Generate the model's reply to `prompt` about `image`.

    Args:
        prompt: Text entered by the user; it is placed after an ``<image>``
            marker in a single user chat message.
        image: The uploaded image (a PIL image, since the UI uses
            ``type="pil"``), or ``None`` when nothing was uploaded.

    Returns:
        The decoded reply, with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without this guard the failure happens inside image preprocessing
        # and reaches the user as an opaque error.
        raise gr.Error("Please upload an image before submitting.")

    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Split only at the first '<image>' marker (the one inserted above,
    # assuming the chat template adds none of its own before the user turn).
    # A plain split() would produce extra chunks if the user's prompt itself
    # contains '<image>', and everything after it would be silently dropped.
    before_image, _, after_image = text.partition('<image>')

    # -200 is spliced in where the image belongs; presumably the model's
    # remote code treats it as the image placeholder id -- confirm against
    # the model card.
    input_ids = torch.tensor(
        tokenizer(before_image).input_ids + [-200] + tokenizer(after_image).input_ids,
        dtype=torch.long,
    ).unsqueeze(0)

    # Match the image tensor to the model's dtype and device.
    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype, device=device)

    # Generate under CUDA autocast (mixed precision on the GPU).
    with torch.cuda.amp.autocast():
        output_ids = model.generate(
            input_ids.to(device),
            images=image_tensor,
            max_new_tokens=2048,
            use_cache=True
        )[0]

    # Skip the first input_ids.shape[1] positions (the prompt length) so only
    # the newly generated tokens are decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()

# UI: one row with two columns -- inputs (prompt, image, submit button) in the
# first column, the model's reply in the second.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe this image in detail",
            )
            image_input = gr.Image(
                label="Image",
                type="pil",
            )
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")

    # Clicking the button sends the prompt and image to `inference` and shows
    # the returned text in the output box.
    submit_button.click(
        inference,
        inputs=[prompt_input, image_input],
        outputs=output_text,
    )

# Start the Gradio server.
demo.launch()