drlon committed on
Commit
8b47a07
·
1 Parent(s): 3e42347

update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -89
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from typing import Optional
2
  import spaces
3
  import gradio as gr
@@ -73,7 +74,7 @@ This demo is powered by [Gradio](https://gradio.app/) and uses OmniParserv2 to g
73
  DEVICE = torch.device('cuda')
74
 
75
  @spaces.GPU
76
- @torch.inference_mode()
77
  def get_som_response(instruction, image_som):
78
  prompt = magma_som_prompt.format(instruction)
79
  if magam_model.config.mm_use_image_start_end:
@@ -110,7 +111,7 @@ def get_som_response(instruction, image_som):
110
  return response
111
 
112
  @spaces.GPU
113
- @torch.inference_mode()
114
  def get_qa_response(instruction, image):
115
  prompt = magma_qa_prompt.format(instruction)
116
  if magam_model.config.mm_use_image_start_end:
@@ -147,7 +148,7 @@ def get_qa_response(instruction, image):
147
  return response
148
 
149
  @spaces.GPU
150
- @torch.inference_mode()
151
  # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
152
  def process(
153
  image_input,
@@ -158,98 +159,103 @@ def process(
158
  instruction,
159
  ) -> Optional[Image.Image]:
160
 
161
- # image_save_path = 'imgs/saved_image_demo.png'
162
- # image_input.save(image_save_path)
163
- # image = Image.open(image_save_path)
164
- box_overlay_ratio = image_input.size[0] / 3200
165
- draw_bbox_config = {
166
- 'text_scale': 0.8 * box_overlay_ratio,
167
- 'text_thickness': max(int(2 * box_overlay_ratio), 1),
168
- 'text_padding': max(int(3 * box_overlay_ratio), 1),
169
- 'thickness': max(int(3 * box_overlay_ratio), 1),
170
- }
171
-
172
- ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_input, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=use_paddleocr)
173
- text, ocr_bbox = ocr_bbox_rslt
174
- dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_input, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold, imgsz=imgsz,)
175
- parsed_content_list = '\n'.join([f'icon {i}: ' + str(v) for i,v in enumerate(parsed_content_list)])
176
-
177
- if len(instruction) == 0:
178
- print('finish processing')
179
- image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
180
- return image, str(parsed_content_list)
181
-
182
- elif instruction.startswith('Q:'):
183
- response = get_qa_response(instruction, image_input)
184
- return image_input, response
185
-
186
- # parsed_content_list = str(parsed_content_list)
187
- # convert xywh to yxhw
188
- label_coordinates_yxhw = {}
189
- for key, val in label_coordinates.items():
190
- if val[2] < 0 or val[3] < 0:
191
- continue
192
- label_coordinates_yxhw[key] = [val[1], val[0], val[3], val[2]]
193
- image_som = plot_boxes_with_marks(image_input.copy(), [val for key, val in label_coordinates_yxhw.items()], som_generator, edgecolor=(255,0,0), fn_save=None, normalized_to_pixel=False)
194
-
195
- # convert xywh to xyxy
196
- for key, val in label_coordinates.items():
197
- label_coordinates[key] = [val[0], val[1], val[0] + val[2], val[1] + val[3]]
198
-
199
- # normalize label_coordinates
200
- for key, val in label_coordinates.items():
201
- label_coordinates[key] = [val[0] / image_input.size[0], val[1] / image_input.size[1], val[2] / image_input.size[0], val[3] / image_input.size[1]]
202
-
203
- magma_response = get_som_response(instruction, image_som)
204
- print("magma repsonse: ", magma_response)
205
-
206
- # map magma_response into the mark id
207
- mark_id = extract_mark_id(magma_response)
208
- if mark_id is not None:
209
- if str(mark_id) in label_coordinates:
210
- bbox_for_mark = label_coordinates[str(mark_id)]
 
 
 
211
  else:
212
  bbox_for_mark = None
213
- else:
214
- bbox_for_mark = None
215
-
216
- if bbox_for_mark:
217
- # draw bbox_for_mark on the image
218
- image_som = plot_boxes_with_marks(
219
- image_input,
220
- [label_coordinates_yxhw[str(mark_id)]],
221
- som_generator,
222
- edgecolor=(255,127,111),
223
- alpha=30,
224
- fn_save=None,
225
- normalized_to_pixel=False,
226
- add_mark=False
227
- )
228
- else:
229
- try:
230
- if 'box' in magma_response:
231
- pred_bbox = extract_bbox(magma_response)
232
- click_point = [(pred_bbox[0][0] + pred_bbox[1][0]) / 2, (pred_bbox[0][1] + pred_bbox[1][1]) / 2]
233
- click_point = [item / 1000 for item in click_point]
234
- else:
235
- click_point = pred_2_point(magma_response)
236
- # de-normalize click_point (width, height)
237
- click_point = [click_point[0] * image_input.size[0], click_point[1] * image_input.size[1]]
238
-
239
- image_som = plot_circles_with_marks(
240
  image_input,
241
- [click_point],
242
- som_generator,
243
  edgecolor=(255,127,111),
244
- linewidth=3,
245
- fn_save=None,
246
  normalized_to_pixel=False,
247
  add_mark=False
248
  )
249
- except:
250
- image_som = image_input
251
-
252
- return image_som, str(parsed_content_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  with gr.Blocks() as demo:
255
  gr.Markdown(MARKDOWN)
@@ -291,4 +297,4 @@ with gr.Blocks() as demo:
291
 
292
  demo.launch(debug=True, show_error=True, share=True)
293
  # demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
294
- # demo.queue().launch(share=False)
 
1
+ import traceback
2
  from typing import Optional
3
  import spaces
4
  import gradio as gr
 
74
  DEVICE = torch.device('cuda')
75
 
76
  @spaces.GPU
77
+ # @torch.inference_mode()
78
  def get_som_response(instruction, image_som):
79
  prompt = magma_som_prompt.format(instruction)
80
  if magam_model.config.mm_use_image_start_end:
 
111
  return response
112
 
113
  @spaces.GPU
114
+ # @torch.inference_mode()
115
  def get_qa_response(instruction, image):
116
  prompt = magma_qa_prompt.format(instruction)
117
  if magam_model.config.mm_use_image_start_end:
 
148
  return response
149
 
150
  @spaces.GPU
151
+ # @torch.inference_mode()
152
  # @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
153
  def process(
154
  image_input,
 
159
  instruction,
160
  ) -> Optional[Image.Image]:
161
 
162
+ try:
163
+ # image_save_path = 'imgs/saved_image_demo.png'
164
+ # image_input.save(image_save_path)
165
+ # image = Image.open(image_save_path)
166
+ box_overlay_ratio = image_input.size[0] / 3200
167
+ draw_bbox_config = {
168
+ 'text_scale': 0.8 * box_overlay_ratio,
169
+ 'text_thickness': max(int(2 * box_overlay_ratio), 1),
170
+ 'text_padding': max(int(3 * box_overlay_ratio), 1),
171
+ 'thickness': max(int(3 * box_overlay_ratio), 1),
172
+ }
173
+
174
+ ocr_bbox_rslt, is_goal_filtered = check_ocr_box(image_input, display_img = False, output_bb_format='xyxy', goal_filtering=None, easyocr_args={'paragraph': False, 'text_threshold':0.9}, use_paddleocr=use_paddleocr)
175
+ text, ocr_bbox = ocr_bbox_rslt
176
+ dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(image_input, yolo_model, BOX_TRESHOLD = box_threshold, output_coord_in_ratio=False, ocr_bbox=ocr_bbox,draw_bbox_config=draw_bbox_config, caption_model_processor=caption_model_processor, ocr_text=text,iou_threshold=iou_threshold, imgsz=imgsz,)
177
+ parsed_content_list = '\n'.join([f'icon {i}: ' + str(v) for i,v in enumerate(parsed_content_list)])
178
+
179
+ if len(instruction) == 0:
180
+ print('finish processing')
181
+ image = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
182
+ return image, str(parsed_content_list)
183
+
184
+ elif instruction.startswith('Q:'):
185
+ response = get_qa_response(instruction, image_input)
186
+ return image_input, response
187
+
188
+ # parsed_content_list = str(parsed_content_list)
189
+ # convert xywh to yxhw
190
+ label_coordinates_yxhw = {}
191
+ for key, val in label_coordinates.items():
192
+ if val[2] < 0 or val[3] < 0:
193
+ continue
194
+ label_coordinates_yxhw[key] = [val[1], val[0], val[3], val[2]]
195
+ image_som = plot_boxes_with_marks(image_input.copy(), [val for key, val in label_coordinates_yxhw.items()], som_generator, edgecolor=(255,0,0), fn_save=None, normalized_to_pixel=False)
196
+
197
+ # convert xywh to xyxy
198
+ for key, val in label_coordinates.items():
199
+ label_coordinates[key] = [val[0], val[1], val[0] + val[2], val[1] + val[3]]
200
+
201
+ # normalize label_coordinates
202
+ for key, val in label_coordinates.items():
203
+ label_coordinates[key] = [val[0] / image_input.size[0], val[1] / image_input.size[1], val[2] / image_input.size[0], val[3] / image_input.size[1]]
204
+
205
+ magma_response = get_som_response(instruction, image_som)
206
+ print("magma repsonse: ", magma_response)
207
+
208
+ # map magma_response into the mark id
209
+ mark_id = extract_mark_id(magma_response)
210
+ if mark_id is not None:
211
+ if str(mark_id) in label_coordinates:
212
+ bbox_for_mark = label_coordinates[str(mark_id)]
213
+ else:
214
+ bbox_for_mark = None
215
  else:
216
  bbox_for_mark = None
217
+
218
+ if bbox_for_mark:
219
+ # draw bbox_for_mark on the image
220
+ image_som = plot_boxes_with_marks(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  image_input,
222
+ [label_coordinates_yxhw[str(mark_id)]],
223
+ som_generator,
224
  edgecolor=(255,127,111),
225
+ alpha=30,
226
+ fn_save=None,
227
  normalized_to_pixel=False,
228
  add_mark=False
229
  )
230
+ else:
231
+ try:
232
+ if 'box' in magma_response:
233
+ pred_bbox = extract_bbox(magma_response)
234
+ click_point = [(pred_bbox[0][0] + pred_bbox[1][0]) / 2, (pred_bbox[0][1] + pred_bbox[1][1]) / 2]
235
+ click_point = [item / 1000 for item in click_point]
236
+ else:
237
+ click_point = pred_2_point(magma_response)
238
+ # de-normalize click_point (width, height)
239
+ click_point = [click_point[0] * image_input.size[0], click_point[1] * image_input.size[1]]
240
+
241
+ image_som = plot_circles_with_marks(
242
+ image_input,
243
+ [click_point],
244
+ som_generator,
245
+ edgecolor=(255,127,111),
246
+ linewidth=3,
247
+ fn_save=None,
248
+ normalized_to_pixel=False,
249
+ add_mark=False
250
+ )
251
+ except:
252
+ image_som = image_input
253
+
254
+ return image_som, str(parsed_content_list)
255
+ except Exception as e:
256
+ print('error in process')
257
+ traceback.print_exc()
258
+ return image_input, 'error in process'
259
 
260
  with gr.Blocks() as demo:
261
  gr.Markdown(MARKDOWN)
 
297
 
298
  demo.launch(debug=True, show_error=True, share=True)
299
  # demo.launch(share=True, server_port=7861, server_name='0.0.0.0')
300
+ # demo.queue().launch(share=False)