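"""Gradio demo for microsoft/Phi-3.5-vision-instruct.

Loads the model once at startup and serves a minimal image -> caption UI.
"""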
import os
import subprocess

# flash-attn is installed at runtime (a common Spaces workaround); the env var
# skips compiling CUDA kernels during install.
subprocess.run('pip install flash-attn --no-build-isolation',
               env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Model and processor registries, keyed by model id (extensible to more checkpoints).
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True,
        torch_dtype="auto", _attn_implementation="flash_attention_2",
    ).cuda().eval()
}

processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True
    )
}


# Phi-3.5-vision chat-format delimiters used to assemble the prompt.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

default_question = "highly detailed caption"

@spaces.GPU
def run_example(image, text_input=default_question, model_id="microsoft/Phi-3.5-vision-instruct"):
    if image is None:
        raise gr.Error("Please upload an image first.")

    model = models[model_id]
    processor = processors[model_id]

    # Single-image prompt in the Phi-3.5-vision chat format.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Strip the prompt tokens so only the newly generated answer is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
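
# Direct (non-UI) call sketch — hypothetical input path; Gradio hands run_example
# the uploaded picture as a numpy array:
#   import numpy as np
#   caption = run_example(np.array(Image.open("photo.jpg")), "highly detailed caption")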

css = """
  #output_text {
    height: 500px;
    overflow: auto;
    border: 1px solid #333;
  }
  /* These two are also hidden via visible=False; the CSS is a fallback. */
  #model_selector, #text_input {
    display: none !important;
  }
  #main_container {
    border: 2px solid black;
    padding: 20px;
    border-radius: 10px;
  }
"""

with gr.Blocks(css=css) as demo:
    # elem_id values tie the components to the CSS selectors above.
    with gr.Row(elem_id="main_container"):
        with gr.Column():
            input_img = gr.Image(label="Input Picture", interactive=True)
            # Hidden controls: a single model and a fixed question for now.
            model_selector = gr.Dropdown(
                choices=list(models.keys()),
                label="Model",
                value="microsoft/Phi-3.5-vision-instruct",
                visible=False,
                elem_id="model_selector",
            )
            text_input = gr.Textbox(label="Question", value=default_question, visible=False, elem_id="text_input")
            submit_btn = gr.Button(value="Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output Text", elem_id="output_text")

    submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

# The queue serializes GPU requests; the API surface is disabled.
demo.queue(api_open=False)
demo.launch(debug=True, show_api=False)