import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
# Silence transformers warnings and progress bars
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_name = 'cognitivecomputations/dolphin-vision-72b'
# create model and load it to the specified device
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",  # place (and if needed shard) weights across available GPUs
    trust_remote_code=True
)
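# Note: 72B parameters in float16 amount to roughly 145 GB of weights alone,
# so device_map="auto" will shard the model across multiple GPUs (and may
# offload to CPU RAM) when no single GPU is large enough.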
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
def inference(prompt, image):
    # Build a chat-formatted prompt containing an <image> placeholder
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Split the prompt around the <image> tag and splice in -200, the
    # LLaVA-style image-token index this model's remote code expects
    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(
        text_chunks[0] + [-200] + text_chunks[1],
        dtype=torch.long
    ).unsqueeze(0).to(device)

    # Preprocess the PIL image with the model's bundled vision preprocessor
    image_tensor = model.process_images([image], model.config).to(device)

    # Debug prints to confirm inputs and weights agree on placement
    print(f"Device of model: {next(model.parameters()).device}")
    print(f"Device of input_ids: {input_ids.device}")
    print(f"Device of image_tensor: {image_tensor.device}")

    # Generate under mixed precision (a warning-only no-op on CPU)
    with torch.amp.autocast('cuda'):
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=1024,
            use_cache=True
        )[0]

    # Decode only the newly generated tokens, skipping the prompt
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
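# Quick sanity check outside the UI (hypothetical image path, for illustration
# only; the Space itself invokes inference() through the Gradio app below):
# print(inference("Describe this image in detail", Image.open("example.jpg")))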
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
            image_input = gr.Image(label="Image", type="pil")
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")

    submit_button.click(fn=inference, inputs=[prompt_input, image_input], outputs=output_text)
demo.launch(share=True)  # note: share=True is ignored when running on Hugging Face Spaces