import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess
import os

# Install flash-attn at runtime; FLASH_ATTENTION_SKIP_CUDA_BUILD skips compiling the CUDA
# kernels from source, and merging os.environ keeps pip and the CUDA paths visible.
subprocess.run('pip install flash-attn --no-build-isolation', env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'}, shell=True)

# Load the model once at startup, in eval mode on the GPU, with FlashAttention-2 enabled.
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        torch_dtype="auto",
        _attn_implementation="flash_attention_2",
    ).cuda().eval()
}

processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
}

DESCRIPTION = " "

# NOTE: these kwargs are currently unused; the model above is loaded with torch_dtype="auto".
kwargs = {}
kwargs['torch_dtype'] = torch.bfloat16

# Phi-3.5 chat-format markers used to assemble the prompt.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

default_question = "You are an image to prompt converter. Your work is to observe each and every detail of the image and craft a detailed prompt under 100 words in this format: [image content/subject, description of action, state, and mood], [art form, style], [artist/photographer reference if needed], [additional settings such as camera and lens settings, lighting, colors, effects, texture, background, rendering]."

@spaces.GPU
def run_example(image, text_input=default_question, model_id="microsoft/Phi-3.5-vision-instruct"):
    """Describe an input image with the selected vision model and return the generated text."""
    model = models[model_id]
    processor = processors[model_id]

    # Build the single-image chat prompt and convert the numpy input to an RGB PIL image.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
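    # Generate up to 1000 new tokens, stopping at the tokenizer's end-of-sequence token.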
    generate_ids = model.generate(**inputs, 
                                max_new_tokens=1000,
                                eos_token_id=processor.tokenizer.eos_token_id,
                                )
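    # Strip the prompt tokens so only the newly generated text is decoded.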
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids, 
                                    skip_special_tokens=True, 
                                    clean_up_tokenization_spaces=False)[0]
    return response

# CSS: fixed-height, outlined image panes; the model selector and question box are hidden.
css = """
  #output {
    margin-top: 15px;
    border: 2px solid #333; /* Darker outline */
    border-radius: 8px;
    height: 180px; /* Fixed height */
    object-fit: contain; /* Ensure image fits within the fixed height */ 
  }

   #input_img {
    margin-top: 15px;
    border: 2px solid #333; /* Darker outline */
    border-radius: 8px;
    height: 180px; /* Fixed height */
    object-fit: contain; /* Ensure image fits within the fixed height */
  }
  #model_selector, #text_input { 
    display: none !important; 
  }
"""

# Gradio UI: image input with a hidden model selector and question box; output is plain text.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture", elem_id="input_img")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct", visible=False)
                text_input = gr.Textbox(label="Question", value=default_question, visible=False)
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # The hidden dropdown and textbox supply the fixed model id and the default question.
        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

# Queue requests and launch without exposing the API endpoints.
demo.queue(api_open=False)
demo.launch(debug=True, show_api=False)