Update app.py
app.py CHANGED
@@ -15,34 +15,13 @@ import uuid
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-
-def load_model():
-    model = AutoModelForCausalLM.from_pretrained(
-        "microsoft/Florence-2-large-ft",
-        trust_remote_code=True
-    ).to(device).eval()
-    processor = AutoProcessor.from_pretrained(
-        "microsoft/Florence-2-large-ft",
-        trust_remote_code=True
-    )
-    return model, processor, device
-
-model = None
-processor = None
-device = None
+model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large-ft', trust_remote_code=True).to("cuda").eval()
+processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large-ft', trust_remote_code=True)
 
 @spaces.GPU
-def initialize_model():
-    global model, processor, device
-    model, processor, device = load_model()
-
 def run_example(task_prompt, image, text_input=None):
-    global model, processor, device
-    if model is None or processor is None:
-        initialize_model()
-
     prompt = task_prompt if text_input is None else task_prompt + text_input
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
     with torch.inference_mode():
         generated_ids = model.generate(**inputs, max_new_tokens=1024, early_stopping=False, do_sample=False, num_beams=3)
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
@@ -60,20 +39,17 @@ def fig_to_pil(fig):
 def plot_bbox_img(image, data):
     fig, ax = plt.subplots(figsize=(10, 10))
     ax.imshow(image)
-
     if 'bboxes' in data and 'labels' in data:
         bboxes, labels = data['bboxes'], data['labels']
     elif 'bboxes' in data and 'bboxes_labels' in data:
         bboxes, labels = data['bboxes'], data['bboxes_labels']
     else:
         return fig_to_pil(fig)
-
     for bbox, label in zip(bboxes, labels):
         x1, y1, x2, y2 = bbox
         rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=2, edgecolor='indigo', facecolor='none')
         ax.add_patch(rect)
         plt.text(x1, y1, label, color='white', fontsize=10, bbox=dict(facecolor='indigo', alpha=0.8))
-
     ax.axis('off')
     return fig_to_pil(fig)
 
@@ -199,6 +175,7 @@ def draw_vid_polygons(image, prediction, get_color):
         draw.text(polygon[0], label, fill="white")
     return np.array(img_draw)
 
+@spaces.GPU
 def process_image(image, task, text):
     task_mapping = {
         "Caption": ("<CAPTION>", lambda result: (result['<CAPTION>'], image)),
@@ -242,10 +219,6 @@ def process_video_p(input_video, task, text_input):
         return None, "Error: Video processing failed. Check logs above for info.", str(frame_results)
     return result, result, str(frame_results)
 
-@spaces.GPU
-def process_image_with_gpu(image, task, text):
-    return process_image(image, task, text)
-
 with gr.Blocks() as demo:
     gr.HTML("<h1><center>Microsoft Florence-2-large-ft</center></h1>")
 
@@ -317,16 +290,7 @@ with gr.Blocks() as demo:
 
     video_task_dropdown.change(fn=update_video_text_input, inputs=video_task_dropdown, outputs=video_text_input)
 
-    submit_btn.click(
-        fn=process_image_with_gpu,
-        inputs=[input_img, task_dropdown, text_input],
-        outputs=[output_text, output_image]
-    )
-
-    video_submit_btn.click(
-        fn=process_video_p,
-        inputs=[input_video, video_task_dropdown, video_text_input],
-        outputs=[output_video, output_video, frame_results_output]
-    )
+    submit_btn.click(fn=process_image, inputs=[input_img, task_dropdown, text_input], outputs=[output_text, output_image])
+    video_submit_btn.click(fn=process_video_p, inputs=[input_video, video_task_dropdown, video_text_input], outputs=[output_video, output_video, frame_results_output])
 
 demo.launch()
|