File size: 2,969 Bytes
8f558df
352c3f8
8f558df
21fcfe6
8f558df
352c3f8
15923f1
 
352c3f8
8f558df
15923f1
21fcfe6
352c3f8
21fcfe6
 
 
 
 
15923f1
 
 
1f684a1
15923f1
352c3f8
 
 
 
15923f1
 
 
 
27d875e
8f558df
21fcfe6
 
15923f1
8f558df
15923f1
8f558df
 
15923f1
8f558df
15923f1
 
 
 
8f19cf6
15923f1
 
190ad42
15923f1
 
 
 
8f558df
 
 
15923f1
8f558df
1f684a1
15923f1
 
 
 
 
 
 
 
21fcfe6
15923f1
 
8f558df
15923f1
352c3f8
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import subprocess
import sys

import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Install flash-attn at startup with pip build isolation disabled and
# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE set for the pip process.
# The variable is merged into a copy of the current environment: a bare dict
# passed as `env` would replace the environment entirely (dropping PATH, HOME, ...).
# pip is invoked through the running interpreter with an argument list, so the
# package lands in this interpreter's environment and no shell is involved.
# The return code is intentionally not checked (best effort, as before).
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)

# Hub id of the single checkpoint this script loads; it is also the key used
# in both registries below.
_PHI_VISION_ID = "microsoft/Phi-3.5-vision-instruct"

# Load the causal LM once at import time, requesting the flash-attention-2
# implementation, then move it to CUDA and switch it to eval mode.
_phi_vision_model = AutoModelForCausalLM.from_pretrained(
    _PHI_VISION_ID,
    trust_remote_code=True,
    torch_dtype="auto",
    _attn_implementation="flash_attention_2",
)

# Registries keyed by model id: one for the models, one for their processors.
models = {_PHI_VISION_ID: _phi_vision_model.cuda().eval()}
processors = {
    _PHI_VISION_ID: AutoProcessor.from_pretrained(
        _PHI_VISION_ID,
        trust_remote_code=True,
    ),
}

# Markdown rendered at the top of the page (currently empty).
DESCRIPTION = ""

# Instruction sent to the model together with the image when the caller does
# not supply its own question.
default_question = (
    "You are an image to prompt converter. "
    "Your work is to observe each and every detail of the image "
    "and craft a detailed prompt under 100 words."
)

# Gradio function for generating output from image input
@spaces.GPU
def run_example(image, text_input=default_question, model_id="microsoft/Phi-3.5-vision-instruct"):
    """Generate a text response describing ``image`` with the selected model.

    Args:
        image: Image data accepted by ``PIL.Image.fromarray`` (presumably the
            NumPy array Gradio passes from ``gr.Image`` -- confirm against the
            component's ``type``), or ``None`` when nothing was uploaded.
        text_input: Instruction placed after the image placeholder in the prompt.
        model_id: Key into the module-level ``models`` / ``processors`` dicts.

    Returns:
        The decoded text of the newly generated tokens (prompt tokens removed).

    Raises:
        gr.Error: If ``image`` is ``None``.
        KeyError: If ``model_id`` is not a key of ``models`` / ``processors``.
    """
    # Without this guard, Image.fromarray(None) fails with an opaque error;
    # gr.Error gives the user a readable message instead.
    if image is None:
        raise gr.Error("Please upload an image before generating a prompt.")

    model = models[model_id]
    processor = processors[model_id]

    # Chat-template markers wrapped around the user turn.
    user_prompt = '<|user|>\n'
    assistant_prompt = '<|assistant|>\n'
    prompt_suffix = "<|end|>\n"

    # One user turn containing the image placeholder and the instruction,
    # followed by the opening of the assistant turn.
    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    prompt_length = inputs['input_ids'].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response

# Custom CSS for styling; the selectors target the `elem_id`s assigned below.
css = """
  #output_text {
    height: 500px;
    overflow: auto;
    border: 1px solid #333;
  }
  #model_selector, #text_input {
    display: none !important;
  }
  #main_container {
    border: 2px solid black;
    padding: 20px;
    border-radius: 10px;
  }
"""

# Gradio interface: image input and button on the left, generated text on the right.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    # `elem_id` (not `id`) is the keyword that sets the HTML id the CSS above matches.
    with gr.Row(elem_id="main_container"):
        with gr.Column():
            input_img = gr.Image(label="Input Image", interactive=True)
            # The model choice and the question are fixed; both controls stay
            # hidden and only feed their default values to run_example.
            model_selector = gr.Dropdown(
                choices=list(models),
                label="Model",
                value="microsoft/Phi-3.5-vision-instruct",
                visible=False,
                elem_id="model_selector",
            )
            text_input = gr.Textbox(
                label="Question",
                value=default_question,
                visible=False,
                elem_id="text_input",
            )
            submit_btn = gr.Button(value="Generate Prompt")

        output_text = gr.Textbox(label="Output", elem_id="output_text", interactive=False)

    # Link button action to function
    submit_btn.click(run_example, [input_img, text_input, model_selector], output_text)

# Launch Gradio interface
demo.queue(api_open=False)
demo.launch(debug=True, show_api=False)