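# Gradio demo: product image captioning with a Florence-2 fine-tune
# (J-LAB/Florence_2_B_FluxiAI_Product_Caption), served on GPU via @spaces.GPU.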
import subprocess

# Install flash-attn at runtime without compiling its CUDA kernels; the remote
# Florence-2 modeling code imports it when the model is loaded.
subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Load the fine-tuned Florence-2 model and its processor; trust_remote_code is
# required because Florence-2 ships custom modeling code.
model_id = 'J-LAB/Florence_2_B_FluxiAI_Product_Caption'
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

DESCRIPTION = "# [Florence-2 Product Describe by Fluxi IA](https://huggingface.co/microsoft/Florence-2-large)"

@spaces.GPU
def run_example(task_prompt, image):
    """Run a Florence-2 task prompt on a PIL image and return the parsed result."""
    inputs = processor(text=task_prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Keep special tokens: post_process_generation needs them to locate the task output.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )
    return parsed_answer

def process_image(image):
    image = Image.fromarray(image)  # Convert NumPy array to PIL Image
    task_prompt = '<PC>'  # custom task token for product captioning in this fine-tune
    results = run_example(task_prompt, image)
    
    # Remove the key and get the text value
    if results and task_prompt in results:
        output_text = results[task_prompt]
    else:
        output_text = ""

    # Convert newline characters to HTML line breaks
    output_text = output_text.replace("\n\n", "<br><br>").replace("\n", "<br>")

    return output_text

css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
    padding: 10px;
    background-color: #f9f9f9;
  }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 Image Captioning"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.HTML(label="Output Text", elem_id="output")

        submit_btn.click(process_image, [input_img], [output_text])

demo.launch(debug=True)