SixOpen committed
Commit 35fd487 · verified · Parent: 23686dc

Update app.py

Files changed (1)
  1. app.py  +6 -42
app.py CHANGED
@@ -15,34 +15,13 @@ import uuid
 import subprocess
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
-def load_model():
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = AutoModelForCausalLM.from_pretrained(
-        "microsoft/Florence-2-large-ft",
-        trust_remote_code=True
-    ).to(device).eval()
-    processor = AutoProcessor.from_pretrained(
-        "microsoft/Florence-2-large-ft",
-        trust_remote_code=True
-    )
-    return model, processor, device
-
-model = None
-processor = None
-device = None
+model = AutoModelForCausalLM.from_pretrained('microsoft/Florence-2-large-ft', trust_remote_code=True).to("cuda").eval()
+processor = AutoProcessor.from_pretrained('microsoft/Florence-2-large-ft', trust_remote_code=True)
 
 @spaces.GPU
-def initialize_model():
-    global model, processor, device
-    model, processor, device = load_model()
-
 def run_example(task_prompt, image, text_input=None):
-    global model, processor, device
-    if model is None or processor is None:
-        initialize_model()
-
     prompt = task_prompt if text_input is None else task_prompt + text_input
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
     with torch.inference_mode():
         generated_ids = model.generate(**inputs, max_new_tokens=1024, early_stopping=False, do_sample=False, num_beams=3)
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
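The hunk stops at batch_decode; the rest of run_example is outside the diff. For orientation, the standard Florence-2 flow would continue by parsing the decoded string with the processor's post-processing API. A minimal continuation sketch, assuming image is a PIL image (this code is not part of the commit):

# Continuation sketch, not from this commit: Florence-2's processor turns the
# raw decoded string into a task-keyed dict.
parsed_answer = processor.post_process_generation(
    generated_text,
    task=task_prompt,
    image_size=(image.width, image.height),
)
return parsed_answer  # e.g. {"<CAPTION>": "A photo of ..."} for task "<CAPTION>"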
@@ -60,20 +39,17 @@ def fig_to_pil(fig):
 def plot_bbox_img(image, data):
     fig, ax = plt.subplots(figsize=(10, 10))
     ax.imshow(image)
-
     if 'bboxes' in data and 'labels' in data:
         bboxes, labels = data['bboxes'], data['labels']
     elif 'bboxes' in data and 'bboxes_labels' in data:
         bboxes, labels = data['bboxes'], data['bboxes_labels']
     else:
         return fig_to_pil(fig)
-
     for bbox, label in zip(bboxes, labels):
         x1, y1, x2, y2 = bbox
         rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=2, edgecolor='indigo', facecolor='none')
         ax.add_patch(rect)
         plt.text(x1, y1, label, color='white', fontsize=10, bbox=dict(facecolor='indigo', alpha=0.8))
-
     ax.axis('off')
     return fig_to_pil(fig)
 
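For reference, plot_bbox_img expects Florence-2-style detection output with the keys it checks above. A minimal hypothetical input, with values invented for illustration:

# Hypothetical input for plot_bbox_img, matching the keys checked above:
data = {
    "bboxes": [[34.0, 52.0, 212.0, 300.0]],  # [x1, y1, x2, y2] in pixel coordinates
    "labels": ["dog"],
}
annotated = plot_bbox_img(image, data)  # returns a PIL image via fig_to_pil(fig)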
 
@@ -199,6 +175,7 @@ def draw_vid_polygons(image, prediction, get_color):
         draw.text(polygon[0], label, fill="white")
     return np.array(img_draw)
 
+@spaces.GPU
 def process_image(image, task, text):
     task_mapping = {
         "Caption": ("<CAPTION>", lambda result: (result['<CAPTION>'], image)),
@@ -242,10 +219,6 @@ def process_video_p(input_video, task, text_input):
         return None, "Error: Video processing failed. Check logs above for info.", str(frame_results)
     return result, result, str(frame_results)
 
-@spaces.GPU
-def process_image_with_gpu(image, task, text):
-    return process_image(image, task, text)
-
 with gr.Blocks() as demo:
     gr.HTML("<h1><center>Microsoft Florence-2-large-ft</center></h1>")
 
@@ -317,16 +290,7 @@ with gr.Blocks() as demo:
 
     video_task_dropdown.change(fn=update_video_text_input, inputs=video_task_dropdown, outputs=video_text_input)
 
-    submit_btn.click(
-        fn=process_image_with_gpu,
-        inputs=[input_img, task_dropdown, text_input],
-        outputs=[output_text, output_image]
-    )
-
-    video_submit_btn.click(
-        fn=process_video_p,
-        inputs=[input_video, video_task_dropdown, video_text_input],
-        outputs=[output_video, output_video, frame_results_output]
-    )
+    submit_btn.click(fn=process_image, inputs=[input_img, task_dropdown, text_input], outputs=[output_text, output_image])
+    video_submit_btn.click(fn=process_video_p, inputs=[input_video, video_task_dropdown, video_text_input], outputs=[output_video, output_video, frame_results_output])
 
 demo.launch()
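Note the positional contract in the video wiring: process_video_p returns three values (result, result, str(frame_results) in the earlier hunk), and Gradio maps them one-to-one onto the outputs list, with output_video listed twice. A return-shape stub, names from the diff and values invented:

# Illustrative stub only: the first two return values both target output_video
# (it appears twice in outputs), the third goes to frame_results_output as text.
def process_video_stub(input_video, task, text_input):
    result = "processed.mp4"                      # hypothetical output video path
    frame_results = [{"frame": 0, "bboxes": []}]  # hypothetical per-frame results
    return result, result, str(frame_results)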
 