Spaces:

SNUMPR
/

vlm-rlaif-demo

Paused

App Files

dcahn12 committed on Aug 5, 2024

Commit

a2ad303

1 Parent(s): d902cee

Edit server file

Browse files

Files changed (3) hide show

gradio_utils.py +0 -18
gradio_web_server copy.py +0 -227
gradio_web_server.py +0 -42

gradio_utils.py CHANGED Viewed

@@ -11,9 +11,6 @@ from llava.model.builder import load_pretrained_model
 from llava.utils import disable_torch_init
 import shutil
-#   <a href="https://github.com/SNUMPR/vlm-rlaif.git" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
-#     <img src="https://z1.ax1x.com/2023/11/07/pil4sqH.png" alt="VLM-RLAIF" style="max-width: 120px; height: auto;">
-#   </a>
 cur_dir = os.path.dirname(os.path.abspath(__file__))
 title_markdown = ("""
@@ -34,7 +31,6 @@ title_markdown = ("""
     </div>
 </div>
 """)
-        # <a href='https://github.com/PKU-YuanGroup/Video-LLaVA/stargazers'><img src='https://img.shields.io/github/stars/PKU-YuanGroup/Video-LLaVA.svg?style=social'></a> # arXiv 버튼 옆에 추가?
 block_css = """
 #buttons button {
@@ -58,15 +54,9 @@ The service is a research preview intended for non-commercial use only, subject
 class Chat:
     def __init__(self, model_path, conv_mode, model_base=None, load_8bit=False, load_4bit=False, device='cuda', cache_dir=None):
-        # model_base = '/dataset/yura/vlm-rlaif/pretrained/final_models/Video_LLaVA_SFT'
-        # model_base='/dataset/yura/vlm-rlaif/pretrained/llava-v1.5-7b-lora_w_lora_16_sftv2_short1632_and_then_long_rank32_alpha32_lr1e4_allmodels/SFT_merged'
-        # model_path = '/dataset/yura/vlm-rlaif/pretrained/LLaVA_Video-RL-Fact-RLHF-7b_SFTv2_RM_13b_v1_40k-v1.5-336-lora-padding/checkpoint-180/adapter_model/lora_policy'
         disable_torch_init()
         model_name = get_model_name_from_path(model_path)
-        # self.tokenizer, self.model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name,
-        #                                                                            load_8bit, load_4bit,
-        #                                                                            device=device, cache_dir=cache_dir)
         is_rlhf_checkpoint = 'rlhf' in model_path.lower()
         print("MODEL_PATH", model_path)
         print("RLHF Checkpoint: ", is_rlhf_checkpoint)
@@ -79,16 +69,11 @@ class Chat:
                 shutil.copy(os.path.join(model_base, "config.json"), os.path.join(model_path, "config.json")) # Copy SFT model's config -> to RLHF folder
                 print("Listed", os.listdir(model_path))
                 print("Copying done")
-            # return(model_name)
-        # return
-        # self.tokenizer, self.model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, load_8bit, load_4bit, device=device)
         self.tokenizer, self.model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, False, False, device=device)
         self.image_processor = image_processor
-        # self.image_processor = processor['image']
-        # self.video_processor = processor['video']
         self.conv_mode = conv_mode
         self.conv = conv_templates[conv_mode].copy()
         self.device = self.model.device
@@ -114,9 +99,6 @@ class Chat:
         latest_state = self._get_latest_prompt(state)
         prompt = latest_state.get_prompt()
-        # print('\n\n\n')
-        # print(prompt)
         input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
         temperature = 0.2

 from llava.utils import disable_torch_init
 import shutil
 cur_dir = os.path.dirname(os.path.abspath(__file__))
 title_markdown = ("""
     </div>
 </div>
 """)
 block_css = """
 #buttons button {
 class Chat:
     def __init__(self, model_path, conv_mode, model_base=None, load_8bit=False, load_4bit=False, device='cuda', cache_dir=None):
         disable_torch_init()
         model_name = get_model_name_from_path(model_path)
         is_rlhf_checkpoint = 'rlhf' in model_path.lower()
         print("MODEL_PATH", model_path)
         print("RLHF Checkpoint: ", is_rlhf_checkpoint)
                 shutil.copy(os.path.join(model_base, "config.json"), os.path.join(model_path, "config.json")) # Copy SFT model's config -> to RLHF folder
                 print("Listed", os.listdir(model_path))
                 print("Copying done")
         self.tokenizer, self.model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, False, False, device=device)
         self.image_processor = image_processor
         self.conv_mode = conv_mode
         self.conv = conv_templates[conv_mode].copy()
         self.device = self.model.device
         latest_state = self._get_latest_prompt(state)
         prompt = latest_state.get_prompt()
         input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
         temperature = 0.2

gradio_web_server copy.py DELETED Viewed

@@ -1,227 +0,0 @@
-import shutil
-import subprocess
-import torch
-import gradio as gr
-from fastapi import FastAPI
-import os
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-from PIL import Image
-import tempfile
-from decord import VideoReader, cpu
-from transformers import TextStreamer
-import argparse
-import sys
-sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "Evaluation"))
-from llava.constants import DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
-from llava.conversation import conv_templates, SeparatorStyle, Conversation
-from llava.mm_utils import process_images
-from Evaluation.infer_utils import load_video_into_frames
-from serve.utils import load_image, image_ext, video_ext
-from serve.gradio_utils import Chat, tos_markdown, learn_more_markdown, title_markdown, block_css
-def save_image_to_local(image):
-    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
-    image = Image.open(image)
-    image.save(filename)
-    # print(filename)
-    return filename
-def save_video_to_local(video_path):
-    filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.mp4')
-    shutil.copyfile(video_path, filename)
-    return filename
-def generate(image1, video, textbox_in, first_run, state, state_, images_tensor, num_frames=50):
-    # ======= manually clear the conversation
-    # state = conv_templates[conv_mode].copy()
-    # state_ = conv_templates[conv_mode].copy()
-    # # =======
-    flag = 1
-    if not textbox_in:
-        if len(state_.messages) > 0:
-            textbox_in = state_.messages[-1][1]
-            state_.messages.pop(-1)
-            flag = 0
-        else:
-            return "Please enter instruction"
-    print("Video", video) # 잘 들어감
-    print("Images_tensor", images_tensor) # None
-    print("Textbox_IN", textbox_in) # 잘 들어감
-    print("State", state) # None
-    print("State_", state_) # None
-    # print(len(state_.messages))
-    video = video if video else "none"
-    if type(state) is not Conversation:
-        state = conv_templates[conv_mode].copy()
-        state_ = conv_templates[conv_mode].copy()
-        images_tensor = []
-    first_run = False if len(state.messages) > 0 else True
-    text_en_in = textbox_in.replace("picture", "image")
-    image_processor = handler.image_processor
-    assert os.path.exists(video)
-    if os.path.splitext(video)[-1].lower() in video_ext: # video extension
-        video_decode_backend = 'opencv'
-    elif os.path.splitext(os.listdir(video)[0]).lower() in image_ext: # frames folder
-        video_decode_backend = 'frames'
-    else:
-        raise ValueError(f'Support video of {video_ext} and frames of {image_ext}, but found {os.path.splitext(video)[-1].lower()}')
-    frames = load_video_into_frames(video, video_decode_backend=video_decode_backend, num_frames=num_frames)
-    tensor = process_images(frames, image_processor, argparse.Namespace(image_aspect_ratio='pad'))
-    # tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-    # print(tensor.shape)
-    tensor = tensor.to(handler.model.device, dtype=dtype)
-    # images_tensor.append(tensor)
-    images_tensor = tensor
-    if handler.model.config.mm_use_im_start_end:
-        text_en_in = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + text_en_in
-    else:
-        text_en_in = DEFAULT_IMAGE_TOKEN + '\n' + text_en_in
-    text_en_out, state_ = handler.generate(images_tensor, text_en_in, first_run=first_run, state=state_)
-    state_.messages[-1] = (state_.roles[1], text_en_out)
-    text_en_out = text_en_out.split('#')[0]
-    textbox_out = text_en_out
-    show_images = ""
-    if os.path.exists(video):
-        filename = save_video_to_local(video)
-        show_images += f'<video controls playsinline width="500" style="display: inline-block;"  src="./file={filename}"></video>'
-    if flag:
-        state.append_message(state.roles[0], textbox_in + "\n" + show_images)
-    state.append_message(state.roles[1], textbox_out)
-    return (state, state_, state.to_gradio_chatbot(), False, gr.update(value=None, interactive=True), images_tensor, gr.update(value=image1 if os.path.exists(video) else None, interactive=True), gr.update(value=video if os.path.exists(video) else None, interactive=True))
-def regenerate(state, state_):
-    state.messages.pop(-1)
-    state_.messages.pop(-1)
-    if len(state.messages) > 0:
-        return state, state_, state.to_gradio_chatbot(), False
-    return (state, state_, state.to_gradio_chatbot(), True)
-def clear_history(state, state_):
-    state = conv_templates[conv_mode].copy()
-    state_ = conv_templates[conv_mode].copy()
-    return (gr.update(value=None, interactive=True),
-            gr.update(value=None, interactive=True), \
-            gr.update(value=None, interactive=True), \
-            True, state, state_, state.to_gradio_chatbot(), [])
-# ==== CHANGE HERE ====
-# conv_mode = "llava_v1"
-# model_path = 'LanguageBind/Video-LLaVA-7B'
-# FIXME!!!
-conv_mode = "llava_v0"
-model_path = 'SNUMPR/vlm_rlaif_video_llava_7b'
-# model_path = '/dataset/yura/vlm-rlaif/pretrained/final_models/Video_LLaVA_VLM_RLAIF_merged'
-cache_dir = './cache_dir'
-device = 'cuda'
-# device = 'cpu'
-load_8bit = True
-load_4bit = False
-dtype = torch.float16
-# =============
-handler = Chat(model_path, conv_mode=conv_mode, load_8bit=load_8bit, load_4bit=load_8bit, device=device, cache_dir=cache_dir)
-# handler.model.to(dtype=dtype)
-if not os.path.exists("temp"):
-    os.makedirs("temp")
-app = FastAPI()
-textbox = gr.Textbox(
-    show_label=False, placeholder="Enter text and press ENTER", container=False
-)
-with gr.Blocks(title='VLM-RLAIF', theme=gr.themes.Default(), css=block_css) as demo:
-    gr.Markdown(title_markdown)
-    state = gr.State()
-    state_ = gr.State()
-    first_run = gr.State()
-    images_tensor = gr.State()
-    image1 = gr.Image(label="Input Image", type="filepath")
-    with gr.Row():
-        with gr.Column(scale=3):
-            video = gr.Video(label="Input Video")
-            cur_dir = os.path.dirname(os.path.abspath(__file__))
-            gr.Examples(
-                examples=[
-                    [
-                        f"{cur_dir}/examples/sample_demo_1.mp4",
-                        "Why is this video funny?",
-                    ],
-                    [
-                        f"{cur_dir}/examples/sample_demo_3.mp4",
-                        "Can you identify any safety hazards in this video?"
-                    ],
-                    [
-                        f"{cur_dir}/examples/sample_demo_9.mp4",
-                        "Describe the video.",
-                    ],
-                    [
-                        f"{cur_dir}/examples/sample_demo_22.mp4",
-                        "Describe the activity in the video.",
-                    ],
-                ],
-                inputs=[video, textbox],
-            )
-        with gr.Column(scale=7):
-            chatbot = gr.Chatbot(label="VLM_RLAIF", bubble_full_width=True).style(height=750)
-            with gr.Row():
-                with gr.Column(scale=8):
-                    textbox.render()
-                with gr.Column(scale=1, min_width=50):
-                    submit_btn = gr.Button(
-                        value="Send", variant="primary", interactive=True
-                    )
-            with gr.Row(elem_id="buttons") as button_row:
-                upvote_btn = gr.Button(value="👍  Upvote", interactive=True)
-                downvote_btn = gr.Button(value="👎  Downvote", interactive=True)
-                flag_btn = gr.Button(value="⚠️  Flag", interactive=True)
-                # stop_btn = gr.Button(value="⏹️  Stop Generation", interactive=False)
-                regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=True)
-                # clear_btn = gr.Button(value="🗑️  Clear history", interactive=True)
-    gr.Markdown(tos_markdown)
-    gr.Markdown(learn_more_markdown)
-    submit_btn.click(generate, [image1, video, textbox, first_run, state, state_, images_tensor],
-                     [state, state_, chatbot, first_run, textbox, images_tensor, image1, video])
-    # submit_btn.click(generate, [video, textbox, first_run, state, state_, images_tensor],
-                    #  [state, state_, chatbot, first_run, textbox, images_tensor, video])
-    regenerate_btn.click(regenerate, [state, state_], [state, state_, chatbot, first_run]).then(
-        generate, [image1, video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, image1, video])
-        # generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])
-    # clear_btn.click(clear_history, [state, state_],
-    #                 [image1, video, textbox, first_run, state, state_, chatbot, images_tensor])
-                    # [video, textbox, first_run, state, state_, chatbot, images_tensor])
-# app = gr.mount_gradio_app(app, demo, path="/")
-# demo.launch(share=True)
-demo.launch()
-# uvicorn videollava.serve.gradio_web_server:app
-# python -m  videollava.serve.gradio_web_server

gradio_web_server.py CHANGED Viewed

@@ -26,7 +26,6 @@ def save_image_to_local(image):
     filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
     image = Image.open(image)
     image.save(filename)
-    # print(filename)
     return filename
@@ -37,10 +36,6 @@ def save_video_to_local(video_path):
 def generate(video, textbox_in, first_run, state, state_, images_tensor, num_frames=50):
-    # ======= manually clear the conversation
-    # state = conv_templates[conv_mode].copy()
-    # state_ = conv_templates[conv_mode].copy()
-    # # =======
     flag = 1
     if not textbox_in:
         if len(state_.messages) > 0:
@@ -49,18 +44,6 @@ def generate(video, textbox_in, first_run, state, state_, images_tensor, num_fra
             flag = 0
         else:
             return "Please enter instruction"
-    # else:
-    #     if state is not None and state_ is not None:
-    #         # reset conversations
-    #         state.messages = []
-    #         state_.messages = []
-    print("Video", video) # 잘 들어감
-    print("Images_tensor", images_tensor) # None
-    print("Textbox_IN", textbox_in) # 잘 들어감
-    print("State", state) # None
-    print("State_", state_) # None
-    # print(len(state_.messages))
     video = video if video else "none"
@@ -84,10 +67,7 @@ def generate(video, textbox_in, first_run, state, state_, images_tensor, num_fra
     frames = load_video_into_frames(video, video_decode_backend=video_decode_backend, num_frames=num_frames)
     tensor = process_images(frames, image_processor, argparse.Namespace(image_aspect_ratio='pad'))
-    # tensor = video_processor(video, return_tensors='pt')['pixel_values'][0]
-    # print(tensor.shape)
     tensor = tensor.to(handler.model.device, dtype=dtype)
-    # images_tensor.append(tensor)
     images_tensor = tensor
     if handler.model.config.mm_use_im_start_end:
@@ -130,23 +110,16 @@ def clear_history(state, state_):
 # ==== CHANGE HERE ====
-# conv_mode = "llava_v1"
-# model_path = 'LanguageBind/Video-LLaVA-7B'
-# FIXME!!!
 conv_mode = "llava_v0"
 model_path = 'SNUMPR/vlm_rlaif_video_llava_7b'
-# model_path = '/dataset/yura/vlm-rlaif/pretrained/final_models/Video_LLaVA_VLM_RLAIF_merged'
 cache_dir = './cache_dir'
 device = 'cuda'
-# device = 'cpu'
 load_8bit = True
 load_4bit = False
 dtype = torch.float16
 # =============
 handler = Chat(model_path, conv_mode=conv_mode, load_8bit=load_8bit, load_4bit=load_8bit, device=device, cache_dir=cache_dir)
-# handler.model.to(dtype=dtype)
 if not os.path.exists("temp"):
     os.makedirs("temp")
@@ -163,7 +136,6 @@ with gr.Blocks(title='VLM-RLAIF', theme=gr.themes.Default(), css=block_css) as d
     first_run = gr.State()
     images_tensor = gr.State()
-    # image1 = gr.Image(label="Input Image", type="filepath")
     with gr.Row():
         with gr.Column(scale=3):
             video = gr.Video(label="Input Video")
@@ -204,28 +176,14 @@ with gr.Blocks(title='VLM-RLAIF', theme=gr.themes.Default(), css=block_css) as d
                 upvote_btn = gr.Button(value="👍  Upvote", interactive=True)
                 downvote_btn = gr.Button(value="👎  Downvote", interactive=True)
                 flag_btn = gr.Button(value="⚠️  Flag", interactive=True)
-                # stop_btn = gr.Button(value="⏹️  Stop Generation", interactive=False)
                 regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=True)
-                # clear_btn = gr.Button(value="🗑️  Clear history", interactive=True)
     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
     submit_btn.click(generate, [video, textbox, first_run, state, state_, images_tensor],
                      [state, state_, chatbot, first_run, textbox, images_tensor, video])
-    # submit_btn.click(generate, [video, textbox, first_run, state, state_, images_tensor],
-                    #  [state, state_, chatbot, first_run, textbox, images_tensor, video])
     regenerate_btn.click(regenerate, [state, state_], [state, state_, chatbot, first_run]).then(
         generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])
-        # generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])
-    # clear_btn.click(clear_history, [state, state_],
-    #                 [image1, video, textbox, first_run, state, state_, chatbot, images_tensor])
-                    # [video, textbox, first_run, state, state_, chatbot, images_tensor])
-# app = gr.mount_gradio_app(app, demo, path="/")
 demo.launch()
-# uvicorn videollava.serve.gradio_web_server:app
-# python -m  videollava.serve.gradio_web_server

     filename = os.path.join('temp', next(tempfile._get_candidate_names()) + '.jpg')
     image = Image.open(image)
     image.save(filename)
     return filename
 def generate(video, textbox_in, first_run, state, state_, images_tensor, num_frames=50):
     flag = 1
     if not textbox_in:
         if len(state_.messages) > 0:
             flag = 0
         else:
             return "Please enter instruction"
     video = video if video else "none"
     frames = load_video_into_frames(video, video_decode_backend=video_decode_backend, num_frames=num_frames)
     tensor = process_images(frames, image_processor, argparse.Namespace(image_aspect_ratio='pad'))
     tensor = tensor.to(handler.model.device, dtype=dtype)
     images_tensor = tensor
     if handler.model.config.mm_use_im_start_end:
 # ==== CHANGE HERE ====
 conv_mode = "llava_v0"
 model_path = 'SNUMPR/vlm_rlaif_video_llava_7b'
 cache_dir = './cache_dir'
 device = 'cuda'
 load_8bit = True
 load_4bit = False
 dtype = torch.float16
 # =============
 handler = Chat(model_path, conv_mode=conv_mode, load_8bit=load_8bit, load_4bit=load_8bit, device=device, cache_dir=cache_dir)
 if not os.path.exists("temp"):
     os.makedirs("temp")
     first_run = gr.State()
     images_tensor = gr.State()
     with gr.Row():
         with gr.Column(scale=3):
             video = gr.Video(label="Input Video")
                 upvote_btn = gr.Button(value="👍  Upvote", interactive=True)
                 downvote_btn = gr.Button(value="👎  Downvote", interactive=True)
                 flag_btn = gr.Button(value="⚠️  Flag", interactive=True)
                 regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=True)
     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
     submit_btn.click(generate, [video, textbox, first_run, state, state_, images_tensor],
                      [state, state_, chatbot, first_run, textbox, images_tensor, video])
     regenerate_btn.click(regenerate, [state, state_], [state, state_, chatbot, first_run]).then(
         generate, [video, textbox, first_run, state, state_, images_tensor], [state, state_, chatbot, first_run, textbox, images_tensor, video])
 demo.launch()