import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

# disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')

# Set device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Model to load from the Hugging Face Hub (or a local path)
model_name = 'failspy/kappa-3-phi-abliterated'

# Create the model; device_map="auto" places it on the available GPU(s) or CPU
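# Note: torch_dtype=torch.float16 assumes a GPU; on a CPU-only machine,
# torch.float32 (or bfloat16) is generally the safer choice.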
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

def inference(prompt, image, temperature, beam_size):
    # Phi-3 uses a chat template
    messages = [
        {"role": "user", "content": f"Can you describe this image?\n{prompt}"}
    ]
    
    # Apply the chat template and add the generation prompt; return_dict=True
    # returns a BatchEncoding so inputs.input_ids / inputs.attention_mask exist below
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(device)

    # Process the image into pixel values; prepare_image is assumed to be
    # provided by the model's custom code loaded via trust_remote_code
    pixel_values = model.prepare_image(image).to(device)

    # Add debug prints
    print(f"Device of model: {next(model.parameters()).device}")
    print(f"Device of inputs: {inputs.input_ids.device}")
    print(f"Device of pixel_values: {pixel_values.device}")

    # generate; autocast runs the forward pass in reduced precision on GPU
    with torch.autocast(device_type=device.type):
        output_ids = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            pixel_values=pixel_values,
            max_new_tokens=1024,
            temperature=temperature,
            do_sample=True,  # temperature only takes effect when sampling
            num_beams=beam_size,
            use_cache=True
        )[0]

    # Decode only the newly generated tokens, skipping the prompt portion
    return tokenizer.decode(output_ids[inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
            image_input = gr.Image(label="Image", type="pil")
            temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
            beam_size_input = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Beam Size")
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")

    submit_button.click(
        fn=inference, 
        inputs=[prompt_input, image_input, temperature_input, beam_size_input], 
        outputs=output_text
    )

# share=True also publishes a temporary public gradio.live URL; set it to False
# to keep the demo reachable only on the local machine
demo.launch(share=True)