import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image

# Model and Processor Initialization
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", 
        trust_remote_code=True, 
        torch_dtype="auto", 
        _attn_implementation="flash_attention_2"
    ).cuda().eval()
}
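
# NOTE: _attn_implementation="flash_attention_2" assumes a GPU supported by
# the flash-attn package. A minimal fallback sketch (an assumption, not part
# of this app) would load the model with the default "eager" attention:
#
#   AutoModelForCausalLM.from_pretrained(
#       "microsoft/Phi-3.5-vision-instruct",
#       trust_remote_code=True,
#       torch_dtype="auto",
#       _attn_implementation="eager",  # no flash-attn dependency
#   ).cuda().eval()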

processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True
    )
}
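
# The processor also accepts a num_crops argument (the model card suggests
# num_crops=4 for multi-frame and 16 for single-frame inputs); the default
# is kept here for simplicity.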

# Fixed instruction sent to the model with every uploaded image
default_question = (
    "You are an image-to-prompt converter. Your task is to observe every detail of the image and "
    "craft a detailed prompt under 100 words in this format: [image content/subject, description of action, state, "
    "and mood], [art form, style], [artist/photographer reference if needed], [additional settings such as camera "
    "and lens settings, lighting, colors, effects, texture, background, rendering]."
)

# Generate a detailed text-to-image prompt from the uploaded image
def generate_caption(image):
    model = models["microsoft/Phi-3.5-vision-instruct"]
    processor = processors["microsoft/Phi-3.5-vision-instruct"]

    prompt = f"<|user|>\n<|image_1|>\n{default_question}<|end|>\n<|assistant|>\n"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
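    # Drop the prompt tokens so only the newly generated text is decoded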
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return response
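
# A quick standalone check (hypothetical; assumes a local "sample.jpg" exists):
#
#   print(generate_caption(Image.open("sample.jpg")))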

# Enhanced CSS for streamlined UI
css = """
  #container {
    background-color: #f9f9f9;
    padding: 20px;
    border-radius: 15px;
    border: 2px solid #333;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
    max-width: 450px;
    margin: auto;
  }
  #input_image {
    margin-top: 15px;
    border: 2px solid #333;
    border-radius: 8px;
    height: 180px;
    object-fit: contain;
  }
  #output_caption {
    margin-top: 15px;
    border: 2px solid #333;
    border-radius: 8px;
    height: 180px;
    overflow-y: auto;
  }
  #run_button {
    background-color: #fff;
    color: black;
    border-radius: 10px;
    padding: 10px;
    cursor: pointer;
    transition: background-color 0.3s ease;
    margin-top: 15px;
  }
  #run_button:hover {
    background-color: #333;
  }
"""

# Gradio Interface with Adjustments
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="container"):
        input_image = gr.Image(type="pil", elem_id="input_image", label="Upload Image")
        run_button = gr.Button(value="Generate Prompt", elem_id="run_button")
        output_caption = gr.Textbox(label="Generated Prompt", show_copy_button=True, elem_id="output_caption", lines=6)

    run_button.click(
        fn=generate_caption,
        inputs=[input_image],
        outputs=output_caption,
    )

demo.launch(share=False)
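
# If the app needs to serve concurrent users, Gradio's request queue can be
# enabled before launching (a sketch, not part of the original app):
#
#   demo.queue().launch(share=False)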