huangjy-pku committed
Commit 7978a78 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.glb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ logs/
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: LEO
+ emoji: 🦁
+ colorFrom: purple
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.10.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,181 @@
+ import os
+
+ import gradio as gr
+
+ from utils import *
+
+
+ with gr.Blocks(title='LEO Demo') as demo:
+     gr.HTML(value="<h1 align='center'>An Embodied Generalist Agent in 3D World</h1>")
+     gr.HTML(value="<div align='center' style='margin-top:-1em; margin-bottom:-1em;'><img src='/file=assets/leo.svg' width='4%'></div>")
+     # gr.HTML(value="<img src='/file=assets/teaser.png' alt='Teaser' width='760px' style='display: block; margin: auto;'>")
+     gr.HTML(value="<p align='center' style='font-size: 1.2em; color: #485fc7;'><a href='https://arxiv.org/abs/2311.12871' target='_blank'>arXiv</a> | <a href='https://embodied-generalist.github.io/' target='_blank'>Project Page</a> | <a href='https://github.com/embodied-generalist/embodied-generalist' target='_blank'>Code</a></p>")
+     gr.HTML(value="<p align='center' style='font-size: 1.15em;'><i>LEO: an embodied generalist agent capable of perceiving, grounding, reasoning, planning, and acting in 3D world.</i></p>")
+
+     with gr.Row():
+         with gr.Column(scale=5):
+             dropdown_scene = gr.Dropdown(
+                 choices=MESH_NAMES,
+                 value=MESH_NAMES[0],
+                 interactive=True,
+                 label='Select a 3D scene',
+             )
+             model_3d = gr.Model3D(
+                 value=os.path.join(MESH_DIR, f'{MESH_NAMES[0]}.glb'),
+                 clear_color=[0.0, 0.0, 0.0, 0.0],
+                 label='3D Scene',
+                 camera_position=(90, 30, 10),
+                 height=659,
+             )
+             gr.HTML(
+                 """<center><strong>
+                 👆 SCROLL and DRAG on the 3D Scene
+                 to zoom in/out and rotate. Press CTRL and DRAG to pan.
+                 </strong></center>
+                 """
+             )
+         with gr.Column(scale=5):
+             dropdown_conversation_mode = gr.Dropdown(
+                 choices=['Single-round mode', 'Multi-round mode'],
+                 value='Single-round mode',
+                 interactive=True,
+                 label='Select conversation mode',
+             )
+             chatbot = gr.Chatbot(label='Chat with LEO')
+             with gr.Row():
+                 with gr.Column(scale=8):
+                     user_chat_input = gr.Textbox(
+                         placeholder="Enter text here to chat with LEO",
+                         show_label=False,
+                         autofocus=True,
+                     )
+                 with gr.Column(scale=2, min_width=0):
+                     send_button = gr.Button('Send', variant='primary', scale=2)
+             with gr.Row():
+                 upvote_button = gr.Button(value='👍 Upvote', interactive=False)
+                 downvote_button = gr.Button(value='👎 Downvote', interactive=False)
+                 flag_button = gr.Button(value='⚠️ Flag', interactive=False)
+                 clear_button = gr.Button(value='🗑️ Clear', interactive=False)
+             with gr.Row():
+                 with gr.Accordion(label="Examples for user instruction:", open=True):
+                     gr.Examples(
+                         examples=[
+                             ["How many armchairs are there in this room?"],
+                             ["Is there a radio in the room?"],
+                             ["Where is the wardrobe located?TODO"],
+                             ["What is the shape of the shelf in front of the picture?TODO"],
+                             ["Plan for the task: Tidy up and arrange the nursery room.TODO"],
+                         ],
+                         inputs=user_chat_input,
+                     )
+
+             # generation_config
+             with gr.Accordion('Parameters', open=False):
+                 repetition_penalty = gr.Slider(
+                     minimum=0.0,
+                     maximum=10.0,
+                     value=3.0,
+                     step=1.0,
+                     interactive=True,
+                     label='Repetition penalty',
+                 )
+                 length_penalty = gr.Slider(
+                     minimum=0.0,
+                     maximum=10.0,
+                     value=1.0,
+                     step=1.0,
+                     interactive=True,
+                     label="Length penalty",
+                 )
+             gr.Markdown("### Terms of Service")
+             gr.HTML(
+                 """By using this service, users are required to agree to the following terms:
+                 the service is a research preview intended for non-commercial use only
+                 and may collect user dialogue data for future research."""
+             )
+             gr.Markdown("### Acknowledgment")
+             gr.HTML(
+                 """Template adapted from <a href="https://llava.hliu.cc/">LLaVA</a> and
+                 <a href="http://sled-whistler.eecs.umich.edu:7777/">LLM-Grounder</a>."""
+             )
+
+     # Event handling
+     button_list = [upvote_button, downvote_button, flag_button, clear_button]
+
+     dropdown_scene.change(
+         fn=change_scene,
+         inputs=[dropdown_scene],
+         outputs=[model_3d, chatbot],
+         queue=False,
+     )
+
+     dropdown_conversation_mode.change(
+         fn=clear_history,
+         inputs=[],
+         outputs=[chatbot, user_chat_input] + button_list,
+         queue=False,
+     )
+
+     user_chat_input.submit(
+         fn=receive_instruction,
+         inputs=[chatbot, user_chat_input],
+         outputs=[chatbot, user_chat_input, send_button] + button_list,
+         queue=False,
+     ).then(
+         fn=generate_response,
+         inputs=[
+             chatbot,
+             dropdown_scene,
+             dropdown_conversation_mode,
+             repetition_penalty,
+             length_penalty,
+         ],
+         outputs=[chatbot, send_button] + button_list,
+         scroll_to_output=True,
+     )
+
+     send_button.click(
+         fn=receive_instruction,
+         inputs=[chatbot, user_chat_input],
+         outputs=[chatbot, user_chat_input, send_button] + button_list,
+         queue=False,
+     ).then(
+         fn=generate_response,
+         inputs=[
+             chatbot,
+             dropdown_scene,
+             dropdown_conversation_mode,
+             repetition_penalty,
+             length_penalty,
+         ],
+         outputs=[chatbot, send_button] + button_list,
+         scroll_to_output=True,
+     )
+
+     upvote_button.click(
+         upvote_response,
+         [chatbot, dropdown_scene, dropdown_conversation_mode],
+         [user_chat_input, upvote_button, downvote_button, flag_button],
+         queue=False,
+     )
+     downvote_button.click(
+         downvote_response,
+         [chatbot, dropdown_scene, dropdown_conversation_mode],
+         [user_chat_input, upvote_button, downvote_button, flag_button],
+         queue=False,
+     )
+     flag_button.click(
+         flag_response,
+         [chatbot, dropdown_scene, dropdown_conversation_mode],
+         [user_chat_input, upvote_button, downvote_button, flag_button],
+         queue=False,
+     )
+     clear_button.click(
+         fn=clear_history,
+         inputs=[],
+         outputs=[chatbot, user_chat_input] + button_list,
+         queue=False,
+     )
+
+
+ demo.queue().launch(share=True, allowed_paths=['assets'])
assets/leo.svg ADDED
assets/obj_features/3RScan-0cac759b-8d6f-2d13-8e3b-2e3bc1ee1158.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5642bb84ba04d10c5aa199dbcd5ea1ab01df0d2517719a2a2e943381f11bd25b
+ size 1002083
assets/obj_features/3RScan-0cac760d-8d6f-2d13-8ea2-109ce4da9ac9.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eae2324173b34331b6dad37c89a75db275d1d23fbb1f1d7478573085cdf1d733
+ size 1002083
assets/obj_features/3RScan-752cc597-920c-26f5-8c1b-a8a5c90a21d7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50a9e124ea270cbe23b59fbddb527d5cf61005c657bd3f5f41535998ba84d9b6
+ size 1002083
assets/scene_meshes/3RScan-0cac759b-8d6f-2d13-8e3b-2e3bc1ee1158.glb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d197483b3be1f6f1395faa3a8b413ee23335fd8f081456b63db96f5928291b1
+ size 9632176
assets/scene_meshes/3RScan-0cac760d-8d6f-2d13-8ea2-109ce4da9ac9.glb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:419988aa4781ec7d0a06e9087c8a918a20c389c50b210daa6b3c47be981b28ac
+ size 9445868
assets/scene_meshes/3RScan-752cc597-920c-26f5-8c1b-a8a5c90a21d7.glb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0db74afa2648056c839840ba8a11d832012b6f70114668835c2da82d5ae07ec2
+ size 11326324
model/cfg.yaml ADDED
@@ -0,0 +1,21 @@
+ use_ckpt: hf
+ hf_ckpt_path: [huangjy-pku/embodied-generalist, weights/leo_noact_hf.pth]
+ local_ckpt_path: /mnt/huangjiangyong/leo/hf_assets/weights/leo_noact_lora.pth
+ model:
+   name: LeoAgentLLM
+   # vision modules omitted
+   llm:
+     name: Vicuna7B
+     use_ckpt: hf
+     hf_cfg_path: huangjy-pku/vicuna-7b
+     local_cfg_path: /mnt/huangjiangyong/vicuna-7b
+     truncation_side: right
+     prompt: ""
+     max_out_len: 256
+     max_context_len: 256  # for prompt_after_obj
+     lora:
+       flag: True
+       rank: 16
+       alpha: 16
+       target_modules: [q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj]
+       dropout: 0.0
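For orientation, this is how the config above is consumed at startup (mirroring `load_agent` in `utils.py` below): the YAML is parsed into an `OmegaConf` object, `use_ckpt: hf` selects the Hugging Face Hub checkpoint, and the two-element `hf_ckpt_path` list is the (repo_id, filename) pair passed to `hf_hub_download`. A minimal sketch, assuming the file layout of this repo:

```python
import yaml
from huggingface_hub import hf_hub_download
from omegaconf import OmegaConf

# Parse model/cfg.yaml into an attribute-style config (same as utils.load_agent).
with open('model/cfg.yaml') as f:
    cfg = OmegaConf.create(yaml.safe_load(f))

# `use_ckpt: hf` means the agent weights come from the Hub;
# hf_ckpt_path is [repo_id, filename_in_repo].
if cfg.use_ckpt == 'hf':
    ckpt_path = hf_hub_download(cfg.hf_ckpt_path[0], cfg.hf_ckpt_path[1])
else:
    ckpt_path = cfg.local_ckpt_path  # local fallback path

print(ckpt_path)                # resolved checkpoint location
print(cfg.model.llm.lora.rank)  # nested fields are attribute-accessible, e.g. 16
```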
model/leo_agent.py ADDED
@@ -0,0 +1,210 @@
+ import torch
+ import torch.nn as nn
+ from huggingface_hub import snapshot_download
+ from peft import get_peft_model, LoraConfig
+ from transformers import LlamaForCausalLM, LlamaTokenizer
+
+
+ def disabled_train(self, mode=True):
+     """Overwrite model.train with this function to make sure train/eval mode
+     does not change anymore."""
+     return self
+
+
+ class LeoAgentLLM(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         if hasattr(cfg, 'model'):
+             cfg = cfg.model
+
+         # LLM
+         if cfg.llm.use_ckpt == 'hf':
+             llm_cfg_path = snapshot_download(cfg.llm.hf_cfg_path)
+         else:
+             llm_cfg_path = cfg.llm.local_cfg_path
+         self.llm_tokenizer = LlamaTokenizer.from_pretrained(llm_cfg_path, use_fast=False,
+                                                             truncation_side=cfg.llm.truncation_side)
+         self.llm_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+         self.llm_tokenizer.add_special_tokens({'bos_token': '<s>'})
+         self.llm_tokenizer.add_special_tokens({'eos_token': '</s>'})
+         self.llm_tokenizer.add_special_tokens({'unk_token': '</s>'})
+         self.llm_model = LlamaForCausalLM.from_pretrained(llm_cfg_path, torch_dtype=torch.float16)
+         self.llm_model.resize_token_embeddings(len(self.llm_tokenizer))
+
+         for param in self.llm_model.parameters():
+             param.requires_grad = False
+         self.llm_model.eval()
+         self.llm_model.train = disabled_train
+
+         # LoRA-based LLM fine-tuning
+         if cfg.llm.lora.flag:
+             lora_config = LoraConfig(
+                 r=cfg.llm.lora.rank,
+                 lora_alpha=cfg.llm.lora.alpha,
+                 target_modules=cfg.llm.lora.target_modules,
+                 lora_dropout=cfg.llm.lora.dropout,
+                 bias='none',
+                 modules_to_save=[],
+             )
+             self.llm_model = get_peft_model(self.llm_model, peft_config=lora_config)
+
+         self.max_context_len = cfg.llm.max_context_len
+
+     @property
+     def device(self):
+         return list(self.parameters())[0].device
+
+     def build_right_justified_sequence(self, data_dict):
+         """
+         Concat six sequences: `prompt_before_obj`, `prompt_middle_1`, `img_tokens`, `prompt_middle_2`, `obj_tokens`, `prompt_after_obj`.
+         Return right justified sequence for causal LM: <pad>, <role/situation>, <img>, <objs>, <instruction>.
+         """
+         bs = len(data_dict['prompt_before_obj'])
+
+         self.llm_tokenizer.padding_side = 'left'
+         text_input_tokens_pre = self.llm_tokenizer(
+             data_dict['prompt_before_obj'],
+             return_tensors='pt',
+             padding='longest'
+         ).to(self.device)  # [PAD, BOS, tokens], (B, T1)
+
+         text_input_tokens_mid1 = self.llm_tokenizer(
+             data_dict['prompt_middle_1'],
+             return_tensors='pt',
+             padding='longest'
+         ).to(self.device)
+
+         img_tokens = data_dict['img_tokens'].to(self.device)
+         img_masks = data_dict['img_masks'].to(self.device)
+         img_masks = img_masks.reshape(-1, 1).repeat(1, img_tokens.size(1))
+
+         text_input_tokens_mid2 = self.llm_tokenizer(
+             data_dict['prompt_middle_2'],
+             return_tensors='pt',
+             padding='longest'
+         ).to(self.device)
+
+         obj_tokens = data_dict['obj_tokens'].to(self.device)
+         obj_masks = data_dict['obj_masks'].to(self.device)
+
+         self.llm_tokenizer.padding_side = 'right'  # no need to be 'left', as padding tokens will be shifted
+         self.llm_tokenizer.truncation_side = 'left'  # truncate history
+         text_input_tokens_post = self.llm_tokenizer(
+             data_dict['prompt_after_obj'],
+             return_tensors='pt',
+             padding='longest',
+             truncation=True,
+             max_length=self.max_context_len,
+         ).to(self.device)  # [BOS, tokens, PAD], (B, T3)
+
+         # hardcode, remove bos, make "tokenize subseq and concat" equivalent to "tokenize the whole seq"
+         assert text_input_tokens_mid1.attention_mask.all() and text_input_tokens_mid2.attention_mask.all(), \
+             "prompt_middle should be the same and thus no padding"
+
+         text_input_tokens_mid1.input_ids = text_input_tokens_mid1.input_ids[:, 1:]
+         text_input_tokens_mid1.attention_mask = text_input_tokens_mid1.attention_mask[:, 1:]
+         for i in range(bs):
+             if not img_masks[i].any():
+                 # no image input, also mask the text prompt for image tokens
+                 text_input_tokens_mid1.attention_mask[i].fill_(0)
+
+         text_input_tokens_mid2.input_ids[:, 0] = 869  # 1 (bos) -> 869 (▁.)
+         text_input_tokens_post.input_ids[:, 0] = 869  # 1 (bos) -> 869 (▁.)
+
+         inputs_embeds_pre = self.llm_model.get_input_embeddings()(text_input_tokens_pre.input_ids)
+         inputs_embeds_mid1 = self.llm_model.get_input_embeddings()(text_input_tokens_mid1.input_ids)
+         inputs_embeds_mid2 = self.llm_model.get_input_embeddings()(text_input_tokens_mid2.input_ids)
+         inputs_embeds_post = self.llm_model.get_input_embeddings()(text_input_tokens_post.input_ids)
+
+         # since img_tokens, prompt_mid, obj_tokens are fixed length without padding, we concat them first
+         inputs_embeds_mid = torch.cat([inputs_embeds_mid1, img_tokens, inputs_embeds_mid2, obj_tokens], dim=1)
+         attn_mask_mid = torch.cat([
+             text_input_tokens_mid1.attention_mask, img_masks,
+             text_input_tokens_mid2.attention_mask, obj_masks
+         ], dim=1)
+
+         post_pad_length = torch.logical_not(text_input_tokens_post.attention_mask).sum(-1)
+
+         bs, l1, hidden_dim = inputs_embeds_pre.shape
+         _, l2, _ = inputs_embeds_mid.shape
+         _, l3, _ = inputs_embeds_post.shape
+
+         inputs_embeds = torch.zeros(
+             bs, l1+l2+l3, hidden_dim
+         ).type(inputs_embeds_pre.dtype).to(self.device)
+
+         attention_mask = torch.zeros(
+             bs, l1+l2+l3
+         ).type(obj_masks.dtype).to(self.device)
+
+         # assign by chunks
+         for i in range(bs):
+             post_pad_len = post_pad_length[i]
+
+             if post_pad_len > 0:
+                 inputs_embeds[i, :post_pad_len] = inputs_embeds_post[i, -post_pad_len:]
+                 attention_mask[i, :post_pad_len] = 0
+                 inputs_embeds[i, post_pad_len+l1+l2:] = inputs_embeds_post[i, :-post_pad_len]
+                 attention_mask[i, post_pad_len+l1+l2:] = 1
+             else:
+                 # no padding
+                 inputs_embeds[i, -l3:] = inputs_embeds_post[i]
+                 attention_mask[i, -l3:] = 1
+
+             inputs_embeds[i, post_pad_len: post_pad_len+l1] = inputs_embeds_pre[i]
+             attention_mask[i, post_pad_len: post_pad_len+l1] = text_input_tokens_pre.attention_mask[i]
+
+             inputs_embeds[i, post_pad_len+l1: post_pad_len+l1+l2] = inputs_embeds_mid[i]
+             attention_mask[i, post_pad_len+l1: post_pad_len+l1+l2] = attn_mask_mid[i]
+
+         return inputs_embeds, attention_mask
+
+     @torch.no_grad()
+     def generate(
+         self,
+         data_dict,
+         use_nucleus_sampling=False,
+         num_beams=5,
+         max_length=256,
+         min_length=1,
+         repetition_penalty=3.0,
+         length_penalty=1,
+         num_captions=1,
+         temperature=1,
+     ):
+         assert 'img_tokens' in data_dict and 'obj_tokens' in data_dict, "Visual features should have been processed offline."
+
+         inputs_embeds, attention_mask = self.build_right_justified_sequence(data_dict=data_dict)
+         bs = inputs_embeds.shape[0]
+
+         # give bos token as condition
+         bos_tokens = self.llm_tokenizer(
+             [self.llm_tokenizer.bos_token] * bs,
+             return_tensors='pt',
+         ).to(self.device)
+         bos_tokens_ids = bos_tokens.input_ids[:, 0:1]  # (B, 1)
+         bos_tokens_attn = bos_tokens.attention_mask[:, 0:1]  # (B, 1)
+
+         # prepare a `bos_token`
+         bos_embeds = self.llm_model.get_input_embeddings()(bos_tokens_ids)  # (B, 1, D)
+         inputs_embeds = torch.cat([inputs_embeds, bos_embeds], dim=1)  # (B, T1+O+T2+1, D)
+         attention_mask = torch.cat([attention_mask, bos_tokens_attn], dim=1)  # (B, T1+O+T2+1)
+
+         outputs = self.llm_model.generate(
+             inputs_embeds=inputs_embeds,
+             attention_mask=attention_mask,
+             do_sample=use_nucleus_sampling,
+             temperature=temperature,
+             num_beams=num_beams,
+             max_length=max_length,
+             min_length=min_length,
+             repetition_penalty=repetition_penalty,
+             length_penalty=length_penalty,
+             num_return_sequences=num_captions,
+         )
+
+         outputs[outputs == 0] = 2  # convert output id 0 (unk_token) to 2 (eos_token)
+
+         output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+         output_text = [text.strip() for text in output_text]
+         return output_text
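For clarity, here is a sketch of the minimal `data_dict` that `LeoAgentLLM.generate` expects, based on the assertions above and on how `generate_response` in `utils.py` builds it. The 4096-dim feature size matches the Vicuna-7B hidden size used in the demo; the exact object-token shape `(1, N, 4096)` and the number of object slots are assumptions about the precomputed `.pth` features.

```python
import torch

# Hypothetical illustration of the batch-size-1 input consumed by LeoAgentLLM.generate.
# Text fields are lists of strings (one per sample); visual fields are padded tensors.
num_objs = 60  # assumed number of object proposals in the precomputed scene features
data_dict = {
    'prompt_before_obj': ["You are an AI visual assistant situated in a 3D scene. ..."],
    'prompt_middle_1': ["Ego-view image:"],
    'prompt_middle_2': ["Objects (including you) in the scene:"],
    'prompt_after_obj': ["USER: How many armchairs are there in this room? ASSISTANT:"],
    'img_tokens': torch.zeros(1, 1, 4096).float(),         # placeholder ego-view feature
    'img_masks': torch.zeros(1, 1).bool(),                  # all-zero mask => no image input
    'obj_tokens': torch.zeros(1, num_objs, 4096).float(),   # object features (assets/obj_features/*.pth)
    'obj_masks': torch.ones(1, num_objs).bool(),            # which object slots are valid
}

# agent = LeoAgentLLM(cfg); agent.generate(data_dict) would then return a list of strings.
```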
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ --extra-index-url https://download.pytorch.org/whl/cu116
+ omegaconf==2.3.0
+ peft==0.5.0
+ pyyaml==6.0.1
+ sentencepiece
+ torch==1.13.0+cu116
+ transformers==4.28.1
utils.py ADDED
@@ -0,0 +1,185 @@
+
+ import datetime
+ import json
+ import os
+ import time
+
+ import gradio as gr
+ import torch
+ import yaml
+ from huggingface_hub import hf_hub_download
+ from omegaconf import OmegaConf
+
+ from model.leo_agent import LeoAgentLLM
+
+ LOG_DIR = 'logs'
+ os.makedirs(LOG_DIR, exist_ok=True)  # logs/ is gitignored, so create it on startup
+ MESH_DIR = 'assets/scene_meshes'
+ MESH_NAMES = [os.path.splitext(fname)[0] for fname in os.listdir(MESH_DIR)]
+ ENABLE_BUTTON = gr.update(interactive=True)
+ DISABLE_BUTTON = gr.update(interactive=False)
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ ROLE_PROMPT = "You are an AI visual assistant situated in a 3D scene. "\
+               "You can perceive (1) an ego-view image (accessible when necessary) and (2) the objects (including yourself) in the scene (always accessible). "\
+               "You should properly respond to the USER's instruction according to the given visual information. "
+ EGOVIEW_PROMPT = "Ego-view image:"
+ OBJECTS_PROMPT = "Objects (including you) in the scene:"
+ TASK_PROMPT = "USER: {instruction} ASSISTANT:"
+ OBJ_FEATS_DIR = 'assets/obj_features'
+
+
+ def load_agent():
+     # build model
+     with open('model/cfg.yaml') as f:
+         cfg = yaml.safe_load(f)
+     cfg = OmegaConf.create(cfg)
+     agent = LeoAgentLLM(cfg)
+
+     # load checkpoint
+     if cfg.use_ckpt == 'hf':
+         ckpt_path = hf_hub_download(cfg.hf_ckpt_path[0], cfg.hf_ckpt_path[1])
+     else:
+         ckpt_path = cfg.local_ckpt_path
+     ckpt = torch.load(ckpt_path, map_location='cpu')
+     agent.load_state_dict(ckpt, strict=False)
+
+     agent.eval()
+     agent.to(DEVICE)
+     return agent
+
+ agent = load_agent()
+
+
+ def get_log_fname():
+     t = datetime.datetime.now()
+     fname = os.path.join(LOG_DIR, f'{t.year}-{t.month:02d}-{t.day:02d}.json')
+     return fname
+
+
+ def change_scene(dropdown_scene: str):
+     # reset 3D scene and chatbot history
+     return os.path.join(MESH_DIR, f'{dropdown_scene}.glb'), None
+
+
+ def receive_instruction(chatbot: gr.Chatbot, user_chat_input: gr.Textbox):
+     # display user input, after submitting user message, before inference
+     chatbot.append((user_chat_input, None))
+     return (chatbot, gr.update(value=""),) + (DISABLE_BUTTON,) * 5
+
+
+ def generate_response(
+     chatbot: gr.Chatbot,
+     dropdown_scene: gr.Dropdown,
+     dropdown_conversation_mode: gr.Dropdown,
+     repetition_penalty: float, length_penalty: float
+ ):
+     # response starts
+     chatbot[-1] = (chatbot[-1][0], "▌")
+     yield (chatbot,) + (DISABLE_BUTTON,) * 5
+
+     # create data_dict, batch_size = 1
+     data_dict = {
+         'prompt_before_obj': [ROLE_PROMPT],
+         'prompt_middle_1': [EGOVIEW_PROMPT],
+         'prompt_middle_2': [OBJECTS_PROMPT],
+         'img_tokens': torch.zeros(1, 1, 4096).float(),
+         'img_masks': torch.zeros(1, 1).bool(),
+         'anchor_locs': torch.zeros(1, 3).float(),
+     }
+
+     # initialize prompt
+     prompt = ""
+     if 'Multi-round' in dropdown_conversation_mode:
+         # multi-round dialogue, with memory
+         for (q, a) in chatbot[:-1]:
+             prompt += f"USER: {q.strip()} ASSISTANT: {a.strip()}</s>"
+
+     prompt += f"USER: {chatbot[-1][0]} ASSISTANT:"
+     data_dict['prompt_after_obj'] = [prompt]
+
+     # anchor orientation
+     anchor_orient = torch.zeros(1, 4).float()
+     anchor_orient[:, -1] = 1
+     data_dict['anchor_orientation'] = anchor_orient
+
+     # load preprocessed scene features
+     data_dict.update(torch.load(os.path.join(OBJ_FEATS_DIR, f'{dropdown_scene}.pth'), map_location='cpu'))
+
+     # inference
+     for k, v in data_dict.items():
+         if isinstance(v, torch.Tensor):
+             data_dict[k] = v.to(DEVICE)
+
+     output = agent.generate(
+         data_dict,
+         repetition_penalty=float(repetition_penalty),
+         length_penalty=float(length_penalty),
+     )
+     output = output[0]
+
+     # display response
+     for out_len in range(1, len(output)-1):
+         chatbot[-1] = (chatbot[-1][0], output[:out_len] + '▌')
+         yield (chatbot,) + (DISABLE_BUTTON,) * 5
+         time.sleep(0.01)
+
+     chatbot[-1] = (chatbot[-1][0], output)
+     vote_response(chatbot, 'log', dropdown_scene, dropdown_conversation_mode)
+     yield (chatbot,) + (ENABLE_BUTTON,) * 5
+
+
+ def vote_response(
+     chatbot: gr.Chatbot, vote_type: str,
+     dropdown_scene: gr.Dropdown,
+     dropdown_conversation_mode: gr.Dropdown
+ ):
+     t = datetime.datetime.now()
+     this_log = {
+         'time': f'{t.hour:02d}:{t.minute:02d}:{t.second:02d}',
+         'type': vote_type,
+         'scene': dropdown_scene,
+         'mode': dropdown_conversation_mode,
+         'dialogue': chatbot,
+     }
+     fname = get_log_fname()
+     if os.path.exists(fname):
+         with open(fname) as f:
+             logs = json.load(f)
+         logs.append(this_log)
+     else:
+         logs = [this_log]
+     with open(fname, 'w') as f:
+         json.dump(logs, f, indent=2)
+
+
+ def upvote_response(
+     chatbot: gr.Chatbot,
+     dropdown_scene: gr.Dropdown,
+     dropdown_conversation_mode: gr.Dropdown
+ ):
+     vote_response(chatbot, 'upvote', dropdown_scene, dropdown_conversation_mode)
+     return ("",) + (DISABLE_BUTTON,) * 3
+
+
+ def downvote_response(
+     chatbot: gr.Chatbot,
+     dropdown_scene: gr.Dropdown,
+     dropdown_conversation_mode: gr.Dropdown
+ ):
+     vote_response(chatbot, 'downvote', dropdown_scene, dropdown_conversation_mode)
+     return ("",) + (DISABLE_BUTTON,) * 3
+
+
+ def flag_response(
+     chatbot: gr.Chatbot,
+     dropdown_scene: gr.Dropdown,
+     dropdown_conversation_mode: gr.Dropdown
+ ):
+     vote_response(chatbot, 'flag', dropdown_scene, dropdown_conversation_mode)
+     return ("",) + (DISABLE_BUTTON,) * 3
+
+
+ def clear_history():
+     # reset chatbot history
+     return (None, "",) + (DISABLE_BUTTON,) * 4
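`generate_response` assembles the instruction prompt (`prompt_after_obj`) as alternating USER/ASSISTANT turns, closing each finished assistant turn with `</s>` in multi-round mode, while the role, ego-view, and object prompts are prepended separately inside the model. A small worked example of the resulting string, using a hypothetical chat history:

```python
# Hypothetical chat history as stored in the gradio Chatbot: (user, assistant) pairs,
# with the last pair still awaiting a response.
chatbot = [
    ("Is there a radio in the room?", "No, there is no radio in the room."),
    ("How many armchairs are there in this room?", None),
]

# Same assembly as generate_response (multi-round mode).
prompt = ""
for q, a in chatbot[:-1]:
    prompt += f"USER: {q.strip()} ASSISTANT: {a.strip()}</s>"
prompt += f"USER: {chatbot[-1][0]} ASSISTANT:"

print(prompt)
# USER: Is there a radio in the room? ASSISTANT: No, there is no radio in the room.</s>USER: How many armchairs are there in this room? ASSISTANT:
```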