import gradio as gr
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
from accelerate import Accelerator
# Silence transformers logging, progress bars, and Python warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
# Initialize Accelerator
accelerator = Accelerator()
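# Note: Accelerator is used here mainly for its device handle and for
# unwrap_model() during generation; the multi-GPU weight placement itself
# is handled by device_map="auto" when the model is loaded below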
model_name = 'cognitivecomputations/dolphin-vision-72b'
# Determine the number of GPUs available
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
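# Rough sizing note: in float16 the 72B model's weights alone take about
# 144 GB (72e9 params x 2 bytes), so several large GPUs are required;
# device_map="auto" shards the weights across all visible devices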
# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
# Hand the model to Accelerate (device placement was already done by device_map="auto")
model = accelerator.prepare(model)
def inference(prompt, image, temperature, beam_size):
    messages = [
        {"role": "user", "content": f'<image>\n{prompt}'}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize the text on either side of the <image> placeholder, then splice
    # in -200, the image-token id the model's remote code swaps for vision features
    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
    image_tensor = model.process_images([image], model.config)

    # Move tensors to the appropriate device
    input_ids = input_ids.to(accelerator.device)
    image_tensor = image_tensor.to(accelerator.device)

    # Generate; do_sample=True is required for the temperature slider to take effect
    with torch.amp.autocast("cuda"):
        output_ids = accelerator.unwrap_model(model).generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=1024,
            do_sample=True,
            temperature=temperature,
            num_beams=beam_size,
            use_cache=True
        )[0]

    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
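# A minimal sketch of calling inference() directly, assuming a local test image
# at "example.jpg" (the path and prompt are illustrative, not part of the app):
#
#     sample = Image.open("example.jpg").convert("RGB")
#     print(inference("Describe this image in detail", sample, 0.7, 4))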
# Create Gradio interface
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", placeholder="Describe this image in detail")
            image_input = gr.Image(label="Image", type="pil")
            temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
            beam_size_input = gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Beam Size")
            submit_button = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output")

    submit_button.click(
        fn=inference,
        inputs=[prompt_input, image_input, temperature_input, beam_size_input],
        outputs=output_text
    )
# Launch the app
demo.launch()
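# Optionally, when running outside a Space, requests can be queued and a
# public link exposed (both are standard Gradio options):
# demo.queue().launch(share=True)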