wenhu committed
Commit d8b1b85 · verified · 1 Parent(s): 3e0e0e0

Update app_test.py

Files changed (1): app_test.py (+515, -6)
app_test.py CHANGED
@@ -63,10 +63,399 @@ external_log_dir = "./logs"
  LOGDIR = external_log_dir
  VOTEDIR = "./votes"

+ def get_conv_log_filename():
+     t = datetime.datetime.now()
+     name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-user_conv.json")
+     return name
+
+ def get_conv_vote_filename():
+     t = datetime.datetime.now()
+     name = os.path.join(VOTEDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-user_vote.json")
+     if not os.path.isfile(name):
+         os.makedirs(os.path.dirname(name), exist_ok=True)
+     return name
+
+ def vote_last_response(state, vote_type, model_selector):
+     with open(get_conv_vote_filename(), "a") as fout:
+         data = {
+             "type": vote_type,
+             "model": model_selector,
+             "state": state,
+         }
+         fout.write(json.dumps(data) + "\n")
+     api.upload_file(
+         path_or_fileobj=get_conv_vote_filename(),
+         path_in_repo=get_conv_vote_filename().replace("./votes/", ""),
+         repo_id=repo_name,
+         repo_type="dataset")
+
+
+ def upvote_last_response(state):
+     vote_last_response(state, "upvote", "MAmmoTH-VL2")
+     gr.Info("Thank you for voting!")
+     return state
+
+ def downvote_last_response(state):
+     vote_last_response(state, "downvote", "MAmmoTH-VL2")
+     gr.Info("Thank you for voting!")
+     return state
+
+ class InferenceDemo(object):
+     def __init__(
+         self, args, model_path, tokenizer, model, image_processor, context_len
+     ) -> None:
+         disable_torch_init()
+
+         self.tokenizer, self.model, self.image_processor, self.context_len = (
+             tokenizer,
+             model,
+             image_processor,
+             context_len,
+         )
+
+         if "llama-2" in model_name.lower():
+             conv_mode = "llava_llama_2"
+         elif "v1" in model_name.lower():
+             conv_mode = "llava_v1"
+         elif "mpt" in model_name.lower():
+             conv_mode = "mpt"
+         elif "qwen" in model_name.lower():
+             conv_mode = "qwen_1_5"
+         elif "pangea" in model_name.lower():
+             conv_mode = "qwen_1_5"
+         elif "mammoth-vl" in model_name.lower():
+             conv_mode = "qwen_2_5"
+         else:
+             conv_mode = "llava_v0"
+
+         if args.conv_mode is not None and conv_mode != args.conv_mode:
+             print(
+                 "[WARNING] the auto-inferred conversation mode is {}, while `--conv-mode` is {}; using {}".format(
+                     conv_mode, args.conv_mode, args.conv_mode
+                 )
+             )
+         else:
+             args.conv_mode = conv_mode
+         # Keep conv_mode consistent with the template actually used below.
+         self.conv_mode = args.conv_mode
+         self.conversation = conv_templates[args.conv_mode].copy()
+         self.num_frames = args.num_frames
+
+ class ChatSessionManager:
+     def __init__(self):
+         self.chatbot_instance = None
+
+     def initialize_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
+         self.chatbot_instance = InferenceDemo(args, model_path, tokenizer, model, image_processor, context_len)
+         print(f"Initialized Chatbot instance with ID: {id(self.chatbot_instance)}")
+
+     def reset_chatbot(self):
+         self.chatbot_instance = None
+
+     def get_chatbot(self, args, model_path, tokenizer, model, image_processor, context_len):
+         if self.chatbot_instance is None:
+             self.initialize_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
+         return self.chatbot_instance
+
+
+ def is_valid_video_filename(name):
+     video_extensions = ["avi", "mp4", "mov", "mkv", "flv", "wmv", "mjpeg"]
+     ext = name.split(".")[-1].lower()
+     return ext in video_extensions
+
+ def is_valid_image_filename(name):
+     image_extensions = ["jpg", "jpeg", "png", "bmp", "gif", "tiff", "webp", "heic", "heif", "jfif", "svg", "eps", "raw"]
+     ext = name.split(".")[-1].lower()
+     return ext in image_extensions
+
+ def sample_frames_v1(video_file, num_frames):
+     video = cv2.VideoCapture(video_file)
+     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+     # Guard against num_frames > total_frames, which would make the
+     # interval 0 and crash the modulo below.
+     interval = max(total_frames // num_frames, 1)
+     frames = []
+     for i in range(total_frames):
+         ret, frame = video.read()
+         # Check the read succeeded before converting; a failed read returns None.
+         if not ret:
+             continue
+         if i % interval == 0:
+             frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
+     video.release()
+     return frames
+
+ def sample_frames_v2(video_path, frame_count=32):
+     video_frames = []
+     vr = VideoReader(video_path, ctx=cpu(0))
+     total_frames = len(vr)
+     frame_interval = max(total_frames // frame_count, 1)
+
+     for i in range(0, total_frames, frame_interval):
+         frame = vr[i].asnumpy()
+         frame_image = Image.fromarray(frame)  # Convert to PIL.Image
+         video_frames.append(frame_image)
+         if len(video_frames) >= frame_count:
+             break
+
+     # Ensure at least one frame is returned if total frames are less than required
+     if len(video_frames) < frame_count and total_frames > 0:
+         for i in range(total_frames):
+             frame = vr[i].asnumpy()
+             frame_image = Image.fromarray(frame)  # Convert to PIL.Image
+             video_frames.append(frame_image)
+             if len(video_frames) >= frame_count:
+                 break
+
+     return video_frames
+
+ def sample_frames(video_path, num_frames=8):
+     cap = cv2.VideoCapture(video_path)
+     frames = []
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+
+     for i in indices:
+         cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         ret, frame = cap.read()
+         if ret:
+             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+             frames.append(Image.fromarray(frame))
+
+     cap.release()
+     return frames
+
+
+ def load_image(image_file):
+     if image_file.startswith("http") or image_file.startswith("https"):
+         response = requests.get(image_file)
+         if response.status_code == 200:
+             image = Image.open(BytesIO(response.content)).convert("RGB")
+         else:
+             # Fail loudly; returning here would leave `image` unbound.
+             raise ValueError(f"Failed to load the image from {image_file}")
+     else:
+         print("Load image from local file", image_file)
+         image = Image.open(image_file).convert("RGB")
+
+     return image
+
+
+ def clear_response(history):
+     # Walk backwards until the last turn that holds a text message.
+     for index_conv in range(1, len(history)):
+         conv = history[-index_conv]
+         if conv[0] is not None:
+             break
+     question = history[-index_conv][0]
+     history = history[:-index_conv]
+     return history, question
+
+ chat_manager = ChatSessionManager()
+
+
+ def clear_history(history):
+     chatbot_instance = chat_manager.get_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
+     chatbot_instance.conversation = conv_templates[chatbot_instance.conv_mode].copy()
+     return None
+
+
+ def add_message(history, message):
+     global chat_image_num
+     print("#### len(history)", len(history))
+     if not history:
+         history = []
+         our_chatbot = chat_manager.get_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
+         chat_image_num = 0
+     for x in message["files"]:
+         # The video example is listed with a .jpg thumbnail; map it back to the real .mp4.
+         if "realcase_video.jpg" in x:
+             x = x.replace("realcase_video.jpg", "realcase_video.mp4")
+         history.append(((x,), None))
+     if message["text"] is not None:
+         history.append((message["text"], None))
+     # print(f"### Chatbot instance ID: {id(our_chatbot)}")
+     return history, gr.MultimodalTextbox(value=None, interactive=False)
+

  @spaces.GPU
- def bot():
-     print(f"### Chatbot instance ID")
+ def bot(history, temperature, top_p, max_output_tokens):
+     our_chatbot = chat_manager.get_chatbot(args, model_path, tokenizer, model, image_processor, context_len)
+     print(f"### Chatbot instance ID: {id(our_chatbot)}")
+     text = history[-1][0]
+     images_this_term = []
+     text_this_term = ""
+
+     is_video = False
+     num_new_images = 0
+     # previous_image = False
+     for i, message in enumerate(history[:-1]):
+         if type(message[0]) is tuple:
+             images_this_term.append(message[0][0])
+             if is_valid_video_filename(message[0][0]):
+                 num_new_images += 1
+                 is_video = True
+             elif is_valid_image_filename(message[0][0]):
+                 print("#### Load image from local file", message[0][0])
+                 num_new_images += 1
+             else:
+                 raise ValueError("Invalid file format")
+         else:
+             num_new_images = 0
+
+     image_list = []
+     for f in images_this_term:
+         if is_valid_video_filename(f):
+             image_list += sample_frames(f, our_chatbot.num_frames)
+         elif is_valid_image_filename(f):
+             image_list.append(load_image(f))
+         else:
+             raise ValueError("Invalid image file")
+
+     all_image_hash = []
+     all_image_path = []
+     for file_path in images_this_term:
+         with open(file_path, "rb") as file:
+             file_data = file.read()
+             file_hash = hashlib.md5(file_data).hexdigest()
+             all_image_hash.append(file_hash)
+
+         t = datetime.datetime.now()
+         output_dir = os.path.join(
+             LOGDIR,
+             "serve_files",
+             f"{t.year}-{t.month:02d}-{t.day:02d}"
+         )
+         os.makedirs(output_dir, exist_ok=True)
+
+         if is_valid_image_filename(file_path):
+             # Process and save images
+             image = Image.open(file_path).convert("RGB")
+             filename = os.path.join(output_dir, f"{file_hash}.jpg")
+             all_image_path.append(filename)
+             if not os.path.isfile(filename):
+                 print("Image saved to", filename)
+                 image.save(filename)
+         elif is_valid_video_filename(file_path):
+             # Simplified video saving
+             filename = os.path.join(output_dir, f"{file_hash}.mp4")
+             all_image_path.append(filename)
+             if not os.path.isfile(filename):
+                 print("Video saved to", filename)
+                 os.makedirs(os.path.dirname(filename), exist_ok=True)
+                 # Directly copy the video file
+                 with open(file_path, "rb") as src, open(filename, "wb") as dst:
+                     dst.write(src.read())
+
+     image_tensor = []
+     if is_video:
+         image_tensor = our_chatbot.image_processor.preprocess(image_list, return_tensors="pt")["pixel_values"].half().to(our_chatbot.model.device)
+     elif num_new_images > 0:
+         image_tensor = [
+             our_chatbot.image_processor.preprocess(f, return_tensors="pt")["pixel_values"][0]
+             .half()
+             .to(our_chatbot.model.device)
+             for f in image_list
+         ]
+         image_tensor = torch.stack(image_tensor)
+
+     image_token = DEFAULT_IMAGE_TOKEN * num_new_images + "\n"
+
+     inp = text
+     inp = image_token + inp
+     our_chatbot.conversation.append_message(our_chatbot.conversation.roles[0], inp)
+     # image = None
+     our_chatbot.conversation.append_message(our_chatbot.conversation.roles[1], None)
+     prompt = our_chatbot.conversation.get_prompt()
+
+     input_ids = tokenizer_image_token(
+         prompt, our_chatbot.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
+     ).unsqueeze(0).to(our_chatbot.model.device)
+     # print("### input_id", input_ids)
+     stop_str = (
+         our_chatbot.conversation.sep
+         if our_chatbot.conversation.sep_style != SeparatorStyle.TWO
+         else our_chatbot.conversation.sep2
+     )
+     keywords = [stop_str]
+     stopping_criteria = KeywordsStoppingCriteria(
+         keywords, our_chatbot.tokenizer, input_ids
+     )
+
+     streamer = TextIteratorStreamer(
+         our_chatbot.tokenizer, skip_prompt=True, skip_special_tokens=True
+     )
+
+     if is_video:
+         input_image_tensor = [image_tensor]
+     elif num_new_images > 0:
+         input_image_tensor = image_tensor
+     else:
+         input_image_tensor = None
+
+     generate_kwargs = dict(
+         inputs=input_ids,
+         streamer=streamer,
+         images=input_image_tensor,
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+         max_new_tokens=max_output_tokens,
+         use_cache=False,
+         stopping_criteria=[stopping_criteria],
+         modalities=["video"] if is_video else ["image"]
+     )
+
+     # generate() blocks, so it runs on a background thread while the
+     # streamer feeds partial output back to the UI below.
+     t = Thread(target=our_chatbot.model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for stream_token in streamer:
+         outputs.append(stream_token)
+
+         history[-1] = [text, "".join(outputs)]
+         yield history
+     our_chatbot.conversation.messages[-1][-1] = "".join(outputs)
+
+     with open(get_conv_log_filename(), "a") as fout:
+         data = {
+             "type": "chat",
+             "model": "MAmmoTH-VL2",
+             "state": history,
+             "images": all_image_hash,
+             "images_path": all_image_path
+         }
+         print("#### conv log", data)
+         fout.write(json.dumps(data) + "\n")
+     for upload_img in all_image_path:
+         api.upload_file(
+             path_or_fileobj=upload_img,
+             path_in_repo=upload_img.replace("./logs/", ""),
+             repo_id=repo_name,
+             repo_type="dataset",
+             # revision=revision,
+             # ignore_patterns=["data*"]
+         )
+     # upload json
+     api.upload_file(
+         path_or_fileobj=get_conv_log_filename(),
+         path_in_repo=get_conv_log_filename().replace("./logs/", ""),
+         repo_id=repo_name,
+         repo_type="dataset")
+


  with gr.Blocks(
@@ -112,7 +501,127 @@ with gr.Blocks(
      regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
      clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

-     bot()
+     chat_input = gr.MultimodalTextbox(
+         interactive=True,
+         file_types=["image", "video"],
+         placeholder="Enter message or upload file...",
+         show_label=False,
+         submit_btn="🚀"
+     )
+
+     gr.Examples(
+         examples_per_page=20,
+         examples=[
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/172197131626056_P7966202.png",
+                     ],
+                     "text": "Why is this image funny?",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_doc.png",
+                     ],
+                     "text": "Read the text in the image",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_weather.jpg",
+                     ],
+                     "text": "List the weather for Monday to Friday",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_knowledge.jpg",
+                     ],
+                     "text": "Answer the following question based on the provided image: What country do these planes belong to?",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_math.jpg",
+                     ],
+                     "text": "Find the measure of angle 3. Please provide a step by step solution.",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_interact.jpg",
+                     ],
+                     "text": "Please perfectly describe this cartoon illustration in as much detail as possible",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_perfer.jpg",
+                     ],
+                     "text": "This is an image of a room. It could either be a real image captured in the room or a rendered image from a 3D scene reconstruction technique that is trained using real images of the room. A rendered image usually contains some visible artifacts (e.g., blurred regions due to under-reconstructed areas) that do not faithfully represent the actual scene. You need to decide if it's a real image or a rendered image by giving each image a photorealism score between 1 and 5.",
+                 }
+             ],
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_multi1.png",
+                         f"{cur_dir}/examples/realcase_multi2.png",
+                         f"{cur_dir}/examples/realcase_multi3.png",
+                         f"{cur_dir}/examples/realcase_multi4.png",
+                         f"{cur_dir}/examples/realcase_multi5.png",
+                     ],
+                     "text": "Based on the five species in the images, draw a food chain. Explain the role of each species in the food chain.",
+                 }
+             ],
+         ],
+         inputs=[chat_input],
+         label="Real World Image Cases",
+     )
+     gr.Examples(
+         examples=[
+             [
+                 {
+                     "files": [
+                         f"{cur_dir}/examples/realcase_video.mp4",
+                     ],
+                     "text": "Please describe the video in detail.",
+                 },
+             ]
+         ],
+         inputs=[chat_input],
+         label="Real World Video Case"
+     )
+
+     gr.Markdown(tos_markdown)
+     gr.Markdown(learn_more_markdown)
+     gr.Markdown(bibtext)
+
+     chat_input.submit(
+         add_message, [chatbot, chat_input], [chatbot, chat_input]
+     ).then(
+         bot, [chatbot, temperature, top_p, max_output_tokens], chatbot, api_name="bot_response"
+     ).then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
+
+     # chatbot.like(print_like_dislike, None, None)
+     clear_btn.click(
+         fn=clear_history, inputs=[chatbot], outputs=[chatbot], api_name="clear_all"
+     )
+
+     upvote_btn.click(
+         fn=upvote_last_response, inputs=chatbot, outputs=chatbot, api_name="upvote_last_response"
+     )
+
+     downvote_btn.click(
+         fn=downvote_last_response, inputs=chatbot, outputs=chatbot, api_name="downvote_last_response"
+     )
+

  demo.queue()

@@ -136,8 +645,8 @@ if __name__ == "__main__":

      model_path = args.model_path
      filt_invalid = "cut"
-     #model_name = get_model_name_from_path(args.model_path)
-     #tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)
-     #model=model.to(torch.device('cuda'))
+     model_name = get_model_name_from_path(args.model_path)
+     tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit)
+     model = model.to(torch.device('cuda'))
      chat_image_num = 0
      demo.launch()
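
Note on the streaming change: the rewritten bot() uses the standard transformers thread-plus-streamer pattern, where model.generate() runs on a worker thread while the Gradio generator drains a TextIteratorStreamer and yields partial history to the UI. Below is a minimal, self-contained sketch of that pattern; the "gpt2" checkpoint and the prompt are illustrative assumptions for the sketch, not taken from this Space.

# Minimal sketch of the thread-plus-streamer pattern used by bot() above.
# Assumption: any causal LM checkpoint works here; "gpt2" is only a stand-in.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until completion, so it runs on a worker thread while the
# caller consumes decoded text chunks from the streamer as they arrive.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=30))
thread.start()

partial = []
for chunk in streamer:
    partial.append(chunk)
    print("".join(partial))  # a Gradio generator would `yield` the joined text instead
thread.join()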