# Gradio demo for microsoft/Phi-3.5-vision-instruct: turns an uploaded image into a
# detailed text-to-image prompt using the fixed question defined below.
import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess

# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips building
# the CUDA kernels during pip install (a common pattern in Hugging Face Spaces demos).
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

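# Load the model once at import time and keep it on the GPU in eval mode.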
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
}

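# Matching processor (tokenizer + image preprocessing) for the model above.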
processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
}

DESCRIPTION = "[Phi-3.5-vision Demo](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)"

# Note: this kwargs dict is never passed anywhere; the model dtype is already set via torch_dtype="auto" above.
kwargs = {}
kwargs['torch_dtype'] = torch.bfloat16

# Phi-3.5-vision chat-template markers used to assemble the prompt string.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

# Default instruction: ask the model to describe the image as a detailed, reusable generation prompt.
default_question = "You are an image to prompt converter. Your work is to observe each and every detail of the image and craft a detailed prompt under 100 words in this format: [image content/subject, description of action, state, and mood], [art form, style], [artist/photographer reference if needed], [additional settings such as camera and lens settings, lighting, colors, effects, texture, background, rendering]."

@spaces.GPU
def run_example(image, text_input=default_question, model_id="microsoft/Phi-3.5-vision-instruct"):
    """Run one image + question through the selected model and return the decoded text."""
    model = models[model_id]
    processor = processors[model_id]

    # Build the chat-formatted prompt with a single image placeholder.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    # Gradio delivers the image as a numpy array; convert it to an RGB PIL image.
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated answer is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response

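# Custom CSS: the #-prefixed rules target components given a matching elem_id.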
css = """
 #output_text {
    height: 500px;
    overflow: auto;
    border: 1px solid #333;
  }
  #model_selector, #text_input {
    display: none !important;
  }
  #main_container {
    border: 2px solid black;
    padding: 20px;
    border-radius: 10px;
  }
"""

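# UI: image input plus hidden model/question fields on the left, output textbox on the right.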
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture", interactive=True)
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct", visible=False)
                text_input = gr.Textbox(label="Question", value=default_question, visible=False)
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text", elem_id="output_text")  # elem_id lets the #output_text CSS rule apply

        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

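# Queue requests and launch the demo with the public API endpoint disabled.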
demo.queue(api_open=False)
demo.launch(debug=True, show_api=False)