import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess

# Install flash-attn at startup; skipping the CUDA build step avoids compiling kernels on the Space.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

# Load the vision-language model once at startup and keep it on the GPU in eval mode.
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, torch_dtype="auto", _attn_implementation="flash_attention_2").cuda().eval()
}

# The matching processor handles both the chat-template tokens and the image preprocessing.
processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
}

# Default instruction: convert the uploaded image into a detailed text-to-image prompt.
default_question = "You are an image to prompt converter. Your work is to observe each and every detail of the image and craft a detailed prompt under 100 words in this format: [image content/subject, description of action, state, and mood], [art form, style], [artist/photographer reference if needed], [additional settings such as camera and lens settings, lighting, colors, effects, texture, background, rendering]."

# Chat-template markers expected by the Phi-3.5-vision instruct format.
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

@spaces.GPU
def run_example(image, text_input=default_question, model_id="microsoft/Phi-3.5-vision-instruct"):
    """Describe the uploaded image as a detailed text-to-image prompt."""
    model = models[model_id]
    processor = processors[model_id]

    # Build the chat-formatted prompt with a single image placeholder.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Slice off the prompt tokens so only the newly generated text is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response

css = """
  #container {
    border: 2px solid #333;
    padding: 20px;
    max-width: 400px;
    margin: auto;
  }
  #input_img, #output_text {
    border: 1px solid #444;
    border-radius: 5px;
  }
  #input_img {
    height: 200px;
    overflow: hidden;
  }
  #output_text {
    height: 150px;
    overflow-y: auto;
  }
"""

with gr.Blocks(css=css) as demo:
    # gr.Box was removed in Gradio 4; gr.Group provides the same bordered container.
    with gr.Group(elem_id="container"):
        input_img = gr.Image(label="Input Picture", elem_id="input_img")
        text_input = gr.Textbox(value=default_question, visible=False)
        submit_btn = gr.Button(value="Generate")
        # show_copy_button adds a client-side copy control; the previous pyperclip-based
        # button only wrote to the server's clipboard, so it never reached the user.
        output_text = gr.Textbox(label="Output Text", elem_id="output_text", show_copy_button=True)

        submit_btn.click(run_example, [input_img, text_input], [output_text])

demo.queue(api_open=False)
demo.launch(debug=True, show_api=False)