diff --git a/minigemini/__init__.py b/minigemini/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea7a5d6557d31c51744711be27817833538d2344
--- /dev/null
+++ b/minigemini/__init__.py
@@ -0,0 +1,3 @@
+import timm
+import open_clip
+from .model import MiniGeminiLlamaForCausalLM
diff --git a/minigemini/constants.py b/minigemini/constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..33342a4be53f3047656e60f01e031f6a71118b16
--- /dev/null
+++ b/minigemini/constants.py
@@ -0,0 +1,27 @@
+CONTROLLER_HEART_BEAT_EXPIRATION = 30  # NOTE(review): presumably seconds; not referenced in this chunk
+WORKER_HEART_BEAT_INTERVAL = 15  # NOTE(review): presumably seconds; not referenced in this chunk
+
+LOGDIR = "."  # current working directory
+
+# Model Constants: negative sentinel indices first, then the literal special-token strings
+IGNORE_INDEX = -100
+IMAGE_TOKEN_INDEX = -200
+PREDICT_TOKEN_INDEX = -300
+DEFAULT_IMAGE_TOKEN = "<image>"  # same literal that conversation.py searches for in messages
+DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
+DEFAULT_IM_START_TOKEN = "<im_start>"
+DEFAULT_IM_END_TOKEN = "<im_end>"
+IMAGE_PLACEHOLDER = "<image-placeholder>"
+DEFAULT_PREDICT_TOKEN = "<predict>"
+
+DESCRIPT_PROMPT = [  # ten alternative phrasings of a "describe this image" instruction
+    "Describe this image thoroughly.",
+    "Provide a detailed description in this picture.",
+    "Detail every aspect of what's in this picture.",
+    "Explain this image with precision and detail.",
+    "Give a comprehensive description of this visual.",
+    "Elaborate on the specifics within this image.",
+    "Offer a detailed account of this picture's contents.",
+    "Describe in detail what this image portrays.",
+    "Break down this image into detailed descriptions.",
+    "Provide a thorough description of the elements in this image."]
\ No newline at end of file
diff --git a/minigemini/conversation.py b/minigemini/conversation.py
new file mode 100644
index 0000000000000000000000000000000000000000..94fc610aba34d94eb02f046e4e5abe6b801921f8
--- /dev/null
+++ b/minigemini/conversation.py
@@ -0,0 +1,460 @@
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+import base64
+from io import BytesIO
+from PIL import Image
+
+
+class SeparatorStyle(Enum):
+    """Prompt layouts; each member selects one formatting branch in Conversation.get_prompt."""
+    SINGLE = auto()  # "role: message" turns, each followed by `sep`
+    TWO = auto()  # "role: message" turns, terminated alternately by `sep` and `sep2`
+    MPT = auto()  # role string immediately followed by the message, then `sep`
+    PLAIN = auto()  # message text only (no role labels), terminated alternately by `sep` and `sep2`
+    LLAMA_2 = auto()  # "[INST] ... [/INST]" user turns with the system text wrapped in <<SYS>> tags
+    GEMMA = auto()  # "<start_of_turn>role ... <end_of_turn>" turns
+
+
+@dataclasses.dataclass
+class Conversation:
+    """Conversation state (system text, roles, messages) plus prompt rendering for each SeparatorStyle."""
+    system: str  # emitted first by every get_prompt branch except LLAMA_2, which wraps it in <<SYS>> tags
+    roles: List[str]  # role labels; get_prompt (LLAMA_2) requires the first message to come from roles[0]
+    messages: List[List[str]]  # [role, content] pairs; content is a str, a falsy value, or a tuple whose item 0 is the text
+    offset: int  # number of leading messages skipped by get_images / to_gradio_chatbot
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE  # selects the get_prompt branch
+    sep: str = "###"
+    sep2: str = None  # second separator for the alternating styles; NOTE(review): annotated str but defaults to None
+    version: str = "Unknown"  # get_prompt only checks whether this contains 'mmtag'
+
+    skip_next: bool = False  # not read or written by any method of this class
+
+    def get_prompt(self):  # render system text and messages into one prompt string according to sep_style
+        messages = self.messages
+        if len(messages) > 0 and type(messages[0][1]) is tuple:  # first message carries an image tuple
+            messages = self.messages.copy()  # shallow copy, so the edits below leave self.messages untouched
+            init_role, init_msg = messages[0].copy()
+            init_msg = init_msg[0].replace("<image>", "").strip()  # keep only the text and drop inline <image> markers
+            if 'mmtag' in self.version:
+                messages[0] = (init_role, init_msg)
+                messages.insert(0, (self.roles[0], "<Image><image></Image>"))  # image goes into its own leading turn
+                messages.insert(1, (self.roles[1], "Received."))
+            else:
+                messages[0] = (init_role, "<image>\n" + init_msg)  # single <image> marker on its own line before the text
+
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]  # tuple content: item 0 is the text
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"  # falsy message: emit only the role cue and leave the turn open
+        elif self.sep_style == SeparatorStyle.TWO:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]
+                    ret += role + ": " + message + seps[i % 2]  # even positions end with sep, odd ones with sep2
+                else:
+                    ret += role + ":"
+        elif self.sep_style == SeparatorStyle.MPT:
+            ret = self.system + self.sep
+            for role, message in messages:
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]
+                    ret += role + message + self.sep  # no ": " here; the role string is used verbatim
+                else:
+                    ret += role
+        elif self.sep_style == SeparatorStyle.LLAMA_2:
+            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg  # empty system text stays empty
+            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+            ret = ""
+
+            for i, (role, message) in enumerate(messages):
+                if i == 0:
+                    assert message, "first message should not be none"
+                    assert role == self.roles[0], "first message should come from user"
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]
+                    if i == 0: message = wrap_sys(self.system) + message  # system block is folded into the first user turn
+                    if i % 2 == 0:
+                        message = wrap_inst(message)
+                        ret += self.sep + message
+                    else:
+                        ret += " " + message + " " + self.sep2
+                else:
+                    ret += ""  # no-op: falsy messages add nothing in this style
+            ret = ret.lstrip(self.sep)  # NOTE(review): lstrip treats sep as a set of characters, not as a prefix
+        elif self.sep_style == SeparatorStyle.GEMMA:
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]
+                    ret += "<start_of_turn>" + role + "\n" + message + "<end_of_turn>\n" + seps[i % 2]
+                else:
+                    ret += "<start_of_turn>" + role + "\n"  # open turn with no end marker
+        elif self.sep_style == SeparatorStyle.PLAIN:
+            seps = [self.sep, self.sep2]
+            ret = self.system
+            for i, (role, message) in enumerate(messages):
+                if message:
+                    if type(message) is tuple:
+                        message = message[0]
+                    ret += message + seps[i % 2]  # role labels are never emitted in this style
+                else:
+                    ret += ""  # no-op
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+
+        return ret
+
+    def append_message(self, role, message):  # appends a new [role, message] list; requires self.messages to be a list
+        self.messages.append([role, message])
+
+    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):  # pad/resize, cap the long side, return PIL image or base64 str
+        if image_process_mode == "Pad":
+            def expand2square(pil_img, background_color=(122, 116, 104)):  # centre the image on a square canvas of its longer side
+                width, height = pil_img.size
+                if width == height:
+                    return pil_img
+                elif width > height:
+                    result = Image.new(pil_img.mode, (width, width), background_color)
+                    result.paste(pil_img, (0, (width - height) // 2))
+                    return result
+                else:
+                    result = Image.new(pil_img.mode, (height, height), background_color)
+                    result.paste(pil_img, ((height - width) // 2, 0))
+                    return result
+            image = expand2square(image)
+        elif image_process_mode in ["Default", "Crop"]:
+            pass  # both modes leave the image unchanged at this stage
+        elif image_process_mode == "Resize":
+            image = image.resize((336, 336))
+        else:
+            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+        if max(image.size) > max_len:  # downscale only when the longest side exceeds max_len
+            max_hw, min_hw = max(image.size), min(image.size)
+            aspect_ratio = max_hw / min_hw
+            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))  # short side capped by min_len, its own size, and max_len / ratio
+            longest_edge = int(shortest_edge * aspect_ratio)
+            W, H = image.size
+            if H > W:  # portrait: the long edge is the height
+                H, W = longest_edge, shortest_edge
+            else:
+                H, W = shortest_edge, longest_edge
+            image = image.resize((W, H))
+        if return_pil:
+            return image
+        else:
+            buffered = BytesIO()
+            image.save(buffered, format=image_format)
+            img_b64_str = base64.b64encode(buffered.getvalue()).decode()  # base64 text of the encoded image bytes
+            return img_b64_str
+
+    def get_images(self, return_pil=False):  # processed images of the tuple messages at even positions after offset
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:  # even positions after offset; presumably the user turns
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg  # expects a (text, image, mode) triple
+                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
+                    images.append(image)
+        return images
+
+    def to_gradio_chatbot(self):  # build [even_turn, odd_turn] rows with images inlined as <img> data URIs
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if type(msg) is tuple:
+                    msg, image, image_process_mode = msg
+                    img_b64_str = self.process_image(
+                        image, "Default", return_pil=False,  # always "Default"; the stored image_process_mode is not used here
+                        image_format='JPEG')
+                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = img_str + msg.replace('<image>', '').strip()
+                    ret.append([msg, None])  # second slot is filled by the following odd-position message
+                else:
+                    ret.append([msg, None])
+            else:
+                if type(msg) is tuple and len(msg) == 2:  # reply given as (text, base64 image string)
+                    msg, img_b64_str = msg
+                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = msg.strip() + img_str
+                ret[-1][-1] = msg  # complete the row opened by the preceding even-position message
+        return ret
+
+    def copy(self):  # new Conversation with a fresh messages list; skip_next is not forwarded
+        return Conversation(
+            system=self.system,
+            roles=self.roles,  # same object, not copied
+            messages=[[x, y] for x, y in self.messages],  # new outer and inner lists; the contents themselves are shared
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+
+    def dict(self):  # plain-dict snapshot; sep_style, version and skip_next are not included
+        if len(self.get_images()) > 0:  # NOTE(review): processes every image just to test whether any exist
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],  # keep only the text of tuple messages
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,  # the live list, not a copy
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+
+
+conv_vicuna_v0 = Conversation(  # SINGLE style with a built-in two-message example exchange
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+        ("Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+    ),
+    offset=2,  # get_images / to_gradio_chatbot start after the two example messages
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+
+conv_vicuna_v1 = Conversation(  # TWO style: turns end alternately with " " and "</s>"
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),  # a tuple has no append(); Conversation.copy() rebuilds messages as a list
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+
+conv_llama_2 = Conversation(  # LLAMA_2 style with a long safety-oriented system prompt
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+
+conv_llava_llama_2 = Conversation(  # same layout as conv_llama_2 with a vision-assistant system prompt
+    system="You are a helpful language and vision assistant. "
+           "You are able to understand the visual content that the user provides, "
+           "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+
+conv_mpt = Conversation(  # MPT style: the role strings already contain the <|im_start|> markers
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+
+conv_llava_plain = Conversation(  # PLAIN style: empty system text and empty role labels
+    system="",
+    roles=("", ""),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",  # sep2 keeps its default (None), which PLAIN would use after odd-position messages
+)
+
+conv_llava_v0 = Conversation(  # conv_vicuna_v0 without the built-in example exchange
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+
+conv_llava_v0_mmtag = Conversation(  # version contains 'mmtag', so get_prompt moves the image into its own leading turn
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."  # NOTE(review): no space before the next literal ("language.The")
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("Human", "Assistant"),
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+    version="v0_mmtag",
+)
+
+conv_llava_v1 = Conversation(  # TWO style; system text says "human" while the roles are USER / ASSISTANT
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+
+conv_vicuna_imgsp_v1 = Conversation(  # identical to conv_vicuna_v1 except for the version tag
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="imgsp_v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+
+conv_llava_plain_guided = Conversation(  # identical to conv_llava_plain except for the version tag
+    system="",
+    roles=("", ""),
+    version="plain_guided",
+    messages=(
+    ),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+
+conv_llava_v1_mmtag = Conversation(  # TWO-style counterpart of conv_llava_v0_mmtag
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."  # NOTE(review): no space before the next literal ("language.The")
+           "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+    version="v1_mmtag",
+)
+
+conv_phi_2 = Conversation(  # TWO style ending assistant turns with "<|endoftext|>"
+    system="A chat between a curious user and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="phi2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="<|endoftext|>",
+)
+
+conv_mistral_instruct = Conversation(  # LLAMA_2 layout with an empty system prompt (no <<SYS>> block is emitted)
+    system="",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+
+conv_gemma = Conversation(  # GEMMA style; sep is empty, so only odd-position turns get a terminator ("<eos>")
+    system="",
+    roles=("user", "model"),
+    version="gemma",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.GEMMA,
+    sep="",
+    sep2="<eos>",
+)
+
+conv_chatml_direct = Conversation(  # same layout as conv_mpt with a shorter system prompt
+    system="""<|im_start|>system
+Answer the questions.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+
+default_conversation = conv_vicuna_v1  # NOTE(review): differs from conv_templates["default"], which is conv_vicuna_v0
+conv_templates = {  # template name -> shared Conversation instance; several names alias the same object
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "phi_2": conv_phi_2,
+    "gemma": conv_gemma,
+    "llama_2": conv_llama_2,
+    "imgsp_v1": conv_vicuna_imgsp_v1,
+    "plain_guided": conv_llava_plain_guided,
+    "mistral_instruct": conv_mistral_instruct,
+    "chatml_direct": conv_chatml_direct,
+    "mistral_direct": conv_chatml_direct,  # alias of "chatml_direct"
+    "plain": conv_llava_plain,
+    "v0_plain": conv_llava_plain,  # alias of "plain"
+    "llava_v0": conv_llava_v0,
+    "v0_mmtag": conv_llava_v0_mmtag,
+    "llava_v1": conv_llava_v1,
+    "v1_mmtag": conv_llava_v1_mmtag,
+    "llava_llama_2": conv_llava_llama_2,
+
+    "mpt": conv_mpt,
+}
+
+
+if __name__ == "__main__":  # smoke check when the module is run directly
+    print(default_conversation.get_prompt())  # conv_vicuna_v1 has no messages, so this prints system + sep
\ No newline at end of file
diff --git a/minigemini/eval/MathVista/calculate_score.py b/minigemini/eval/MathVista/calculate_score.py
new file mode 100755
index 0000000000000000000000000000000000000000..c9e25c9cf5140cf551c643df586222b8e9c6c955
--- /dev/null
+++ b/minigemini/eval/MathVista/calculate_score.py
@@ -0,0 +1,258 @@
+import os
+import re
+import argparse
+import pandas as pd
+
+# !pip install python-Levenshtein
+from Levenshtein import distance
+
+import sys
+sys.path.append('../')
+from utilities import *
+
+
+def get_most_similar(prediction, choices):
+    """
+    Return the choice whose Levenshtein (edit) distance to the prediction is smallest.
+    Ties go to the earliest choice; an empty `choices` raises ValueError.
+    """
+    def edit_cost(candidate):
+        return distance(prediction, candidate)
+    return min(choices, key=edit_cost)
+
+
+def normalize_extracted_answer(extraction, choices, question_type, answer_type, precision):
+    """
+    Normalize the extracted answer so it can be compared with the ground-truth answer.
+    Args:
+        extraction: raw extracted answer (any type; non-strings are stringified or parsed).
+        choices: option texts; only used when question_type is 'multi_choice'.
+        question_type: 'multi_choice' selects option matching; otherwise answer_type decides.
+        answer_type: 'integer', 'float' or 'list'; any other value returns extraction unchanged.
+        precision: decimals kept for 'float' answers; integral floats such as 1.0 are accepted.
+    Returns:
+        One of `choices` (multi-choice), a string (parsed answers), or None when parsing fails.
+    """
+    if question_type == 'multi_choice':
+        if isinstance(extraction, str):  # make sure the extraction is a string
+            extraction = extraction.strip()
+        else:
+            try:
+                extraction = str(extraction)
+            except Exception:
+                extraction = ""
+        # extract "A" from "(A) text"
+        letter = re.findall(r'\(([a-zA-Z])\)', extraction)
+        if len(letter) > 0:
+            extraction = letter[0].upper()
+        options = [chr(ord('A') + i) for i in range(len(choices))]
+        if extraction in options:
+            # convert option letter to text, e.g. "A" -> "text"
+            extraction = choices[options.index(extraction)]
+        else:
+            # select the most similar option
+            extraction = get_most_similar(extraction, choices)
+        assert extraction in choices
+    elif answer_type == 'integer':
+        try:
+            extraction = str(int(float(extraction)))
+        except (TypeError, ValueError, OverflowError):
+            extraction = None
+    elif answer_type == 'float':
+        try:
+            ndigits = precision if precision is None else int(precision)  # round() rejects a float ndigits such as 1.0
+            extraction = str(round(float(extraction), ndigits))
+        except (TypeError, ValueError, OverflowError):
+            extraction = None
+    elif answer_type == 'list':
+        try:
+            extraction = str(extraction)
+        except Exception:
+            extraction = None
+    return extraction
+    
+
+def safe_equal(prediction, answer):
+    """
+    Compare prediction and answer with ``==`` and coerce the outcome to a plain bool.
+    Any exception raised by the comparison is printed and reported as a mismatch.
+    """
+    try:
+        matched = bool(prediction == answer)
+    except Exception as err:
+        print(err)
+        matched = False
+    return matched
+
+
+def get_acc_with_contion(res_pd, key, value):
+    """Return (n_correct, n_total, accuracy) for rows whose `key` matches `value`; accuracy is a "{:.2f}" percentage string."""
+    if key == 'skills':
+        # 'skills' cells are containers, so match by membership rather than equality
+        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
+    else:
+        total_pd = res_pd[res_pd[key] == value]
+    correct_pd = total_pd[total_pd['true_false'] == True]  # elementwise comparison builds the boolean row mask
+    acc = "{:.2f}".format(len(correct_pd) / len(total_pd) * 100) if len(total_pd) else "0.00"  # no rows: avoid ZeroDivisionError
+    return len(correct_pd), len(total_pd), acc
+        
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--output_file', type=str, default='output.json')
+    parser.add_argument('--score_file', type=str, default='scores.json')
+    parser.add_argument('--gt_file', type=str, default='../data/testmini.json', help='ground truth file')
+    parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
+    parser.add_argument('--rerun', action='store_true', help='rerun the evaluation')
+    parser.add_argument('--caculate_gain', action='store_true', help='caculate the socre gains over random guess')
+    parser.add_argument('--random_file', type=str, default='score_random_guess.json')
+    args = parser.parse_args()
+
+    # args
+    output_file = args.output_file
+
+    # Pipeline: [1] score every prediction, [2] overall accuracy, [3] per-category accuracy,
+    # [4] optional gain over a random-guess baseline. read_json / save_json come from `utilities`.
+
+    # read json
+    print(f"Reading {output_file}...")
+    results = read_json(output_file)
+
+    # read ground truth
+    print(f"Reading {args.gt_file}...")
+    gts = read_json(args.gt_file)
+
+    # full pids
+    full_pids = list(results.keys())
+    if args.number > 0:
+        full_pids = full_pids[:min(args.number, len(full_pids))]
+    print("Number of testing problems:", len(full_pids))
+
+    ## [1] Evaluate if the prediction is true or false
+    print("\nEvaluating the predictions...")
+    update_json_flag = False
+    for pid in full_pids:
+        problem = results[pid]
+        # with --rerun, drop the cached fields so the change detection below always fires
+
+        if args.rerun:
+            if 'prediction' in problem:
+                del problem['prediction']
+            if 'true_false' in problem:
+                del problem['true_false']
+
+        choices = problem['choices']
+        question_type = problem['question_type']
+        answer_type = problem['answer_type']
+        precision = problem['precision']
+        extraction = problem['extraction']
+
+        if 'answer' in problem:
+            answer = problem['answer']
+        else:
+            answer = gts[pid]['answer']
+            problem['answer'] = answer
+
+        # normalize the extracted answer to match the answer type
+        prediction = normalize_extracted_answer(extraction, choices, question_type, answer_type, precision)
+
+        # verify the prediction is true or false
+        true_false = safe_equal(prediction, answer)
+
+        # update the problem
+        if "true_false" not in problem:
+            update_json_flag = True
+
+        elif true_false != problem['true_false']:
+            update_json_flag = True
+
+        if "prediction" not in problem:
+            update_json_flag = True
+
+        elif prediction !=  problem['prediction']:
+            update_json_flag = True
+
+        problem['prediction'] = prediction
+        problem['true_false'] = true_false
+
+    # save the updated json
+    if update_json_flag:
+        print("\n!!!Some problems are updated.!!!")
+        print(f"\nSaving {output_file}...")
+        save_json(results, output_file)
+
+    ## [2] Calculate the average accuracy
+    total = len(full_pids)
+    correct = 0
+    for pid in full_pids:
+        if results[pid]['true_false']:
+            correct += 1
+    accuracy = str(round(correct / total * 100, 2))
+    print(f"\nCorrect: {correct}, Total: {total}, Accuracy: {accuracy}%")
+
+    scores = {"average": {"accuracy": accuracy, "correct": correct, "total": total}}
+
+    ## [3] Calculate the fine-grained accuracy scores
+
+    # merge the 'metadata' attribute into the data
+    for pid in results:
+        results[pid].update(results[pid].pop('metadata'))
+
+    # convert the data to a pandas DataFrame
+    df = pd.DataFrame(results).T
+
+    print(len(df))
+    print("Number of test problems:", len(df))
+    # assert len(df) == 1000 # Important!!!
+
+    # assign the target keys for evaluation
+    target_keys = ['question_type', 'answer_type', 'language', 'source', 'category', 'task', 'context', 'grade', 'skills']
+
+    for key in target_keys:
+        print(f"\nType: [{key}]")
+        # get the unique values of the key
+        if key == 'skills':
+            # the value is a list
+            values = []
+            for skill_list in df[key]:  # iterate values directly; df is indexed by pid, so df[key][i] is not a label lookup
+                values += skill_list
+            values = list(set(values))
+        else:
+            values = df[key].unique()
+        # `values` now holds every distinct category of this key
+
+        # calculate the accuracy for each value
+        scores[key] = {}
+        for value in values:
+            correct, total, acc = get_acc_with_contion(df, key, value)
+            if total > 0:
+                print(f"[{value}]: {acc}% ({correct}/{total})")
+                scores[key][value] = {"accuracy": acc, "correct": correct, "total": total}
+
+        # sort the scores by accuracy
+        scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))
+
+    # save the scores
+    scores_file = args.score_file
+    print(f"\nSaving {scores_file}...")
+    save_json(scores, scores_file)
+    print("\nDone!")
+
+    # [4] Calculate the score gains over random guess
+    if args.caculate_gain:
+        with open(args.random_file) as random_fp:  # close the baseline file instead of leaking the handle
+            random_scores = json.load(random_fp)  # `json` is not imported here; presumably re-exported by `utilities`
+
+        print("\nCalculating the score gains...")
+        for key in scores:
+            if key == 'average':
+                gain = round(float(scores[key]['accuracy']) - float(random_scores[key]['accuracy']), 2)
+                scores[key]['acc_gain'] = gain
+            else:
+                for sub_key in scores[key]:
+                    gain = round(float(scores[key][sub_key]['accuracy']) - float(random_scores[key][sub_key]['accuracy']), 2)
+                    scores[key][sub_key]['acc_gain'] = str(gain)
+
+        # save the score gains
+        print(f"\nSaving {scores_file}...")
+        save_json(scores, scores_file)
+        print("\nDone!")
diff --git a/minigemini/eval/MathVista/extract_answer.py b/minigemini/eval/MathVista/extract_answer.py
new file mode 100755
index 0000000000000000000000000000000000000000..afbc02f93397f918ddcdce43a0b349ab5851f032
--- /dev/null
+++ b/minigemini/eval/MathVista/extract_answer.py
@@ -0,0 +1,160 @@
+import os
+import re
+import time
+import argparse
+
+from tqdm import tqdm
+
+import sys
+sys.path.append('../')
+from utilities import *
+
+# OpenAI
+import openai
+
+# load demo prompt
+from prompts.ext_ans import demo_prompt
+
+
+def verify_extraction(extraction):
+    """Return True when `extraction` is a string with non-whitespace content; None counts as missing."""
+    if extraction is None:  # must be checked before .strip(), which would raise AttributeError on None
+        return False
+    return extraction.strip() != ""
+
+
+def create_test_prompt(demo_prompt, query, response):
+    # Layout: stripped demos, then query and response, then the answer cue, separated by blank lines.
+    sections = (demo_prompt.strip(), f"{query}\n\n{response}", "Extracted answer: ")
+    full_prompt = "\n\n".join(sections)
+    return full_prompt
+
+
+def extract_answer(response, problem, quick_extract=False):
+    """
+    Extract the final answer from a model response, trying cheap rules before the LLM call.
+
+    Args:
+        response: raw model output for the problem.
+        problem: dict providing 'question_type', 'answer_type', 'choices', 'query' and 'pid'.
+        quick_extract: when True, also try the 'The answer is "..."' pattern before the LLM.
+    Returns:
+        The extracted answer, or "" when the response is empty or the LLM extraction fails.
+    """
+    question_type = problem['question_type']
+    answer_type = problem['answer_type']
+    choices = problem['choices']
+    query = problem['query']
+    pid = problem['pid']
+
+    if response == "":
+        return ""
+    if question_type == 'multi_choice' and response in choices:
+        return response
+    if answer_type == "integer":
+        try:
+            return str(int(response))
+        except (TypeError, ValueError, OverflowError):
+            pass  # not a bare integer; fall through to the other strategies
+    if answer_type == "float":
+        try:
+            return str(float(response))
+        except (TypeError, ValueError, OverflowError):
+            pass  # not a bare number; fall through
+    if quick_extract:
+        print("Quickly extracting answer...")
+        # The answer is "text". -> "text"
+        try:
+            result = re.search(r'The answer is "(.*)"\.', response)
+            if result:
+                return result.group(1)
+        except TypeError:
+            pass  # response is not a string; leave it to the LLM path
+    # general extraction: `args` is the namespace parsed under __main__, so when this function is
+    # imported instead, the lookup raises NameError, which is reported below and yields "".
+    try:
+        full_prompt = create_test_prompt(demo_prompt, query, response)
+        return get_chat_response(full_prompt, openai.api_key, openai.api_base, model=args.llm_engine)
+    except Exception as e:
+        print(e)
+        print(f"Error in extracting answer for {pid}")
+    return ""
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    # input
+    parser.add_argument('--output_file', type=str, default='answer.json')
+    parser.add_argument('--response_label', type=str, default='response', help='response label for the input file')
+    # model
+    parser.add_argument('--llm_engine', type=str, default='gpt-4-0613', help='llm engine',
+                        choices = ['gpt-3.5-turbo', 'gpt-3.5', 'gpt-4', 'gpt-4-0314', 'gpt-4-0613'])
+    parser.add_argument('--number', type=int, default=-1, help='number of problems to run')
+    parser.add_argument('--quick_extract', action='store_true', help='use rules to extract answer for some problems')
+    parser.add_argument('--rerun', action='store_true', help='rerun the answer extraction')
+    # openai
+    parser.add_argument("--api_key", required=True, type=str, help="OpenAI API key")
+    parser.add_argument("--api_base", default=None, type=str, help="OpenAI API base")
+    # output
+    parser.add_argument('--save_every', type=int, default=10, help='save every n problems')
+    parser.add_argument('--output_label', type=str, default='', help='label for the output file')
+    args = parser.parse_args()
+
+    # args
+    label = args.response_label
+    result_file = args.output_file
+    if args.output_label != '':
+        output_file = result_file.replace('.json', f'_{args.output_label}.json')
+    else:
+        output_file = result_file
+
+    # read results
+    print(f"Reading {result_file}...")
+    try:
+        results = read_json(output_file)  # a previously saved JSON dict, when the file parses as one
+    except Exception:  # not a bare except, so Ctrl-C is no longer swallowed
+        # otherwise read the input as JSON Lines, one problem record per line, keyed by 'pid'
+        with open(result_file) as result_fp:
+            samples = [json.loads(line) for line in result_fp]
+        results = {sample['pid']: sample for sample in samples}
+
+    # full pids
+    full_pids = list(results.keys())
+    if args.number > 0:
+        full_pids = full_pids[:min(args.number, len(full_pids))]
+    print("Number of testing problems:", len(full_pids))
+
+    # test pids
+    if args.rerun:
+        test_pids = full_pids
+    else:
+        test_pids = []
+        for pid in full_pids:
+            # only problems without a usable extraction are (re)processed
+            if 'extraction' not in results[pid] or not verify_extraction(results[pid]['extraction']):
+                test_pids.append(pid)
+
+    test_num = len(test_pids)
+    print("Number of problems to run:", test_num)
+    # extract_answer reads the OpenAI settings configured below
+
+    # openai api
+    openai.api_key = args.api_key # Your API key here
+    if args.api_base:
+        openai.api_base = args.api_base # Your API base here
+
+    # tqdm, enumerate results
+    for i, pid in enumerate(tqdm(test_pids)):
+        problem = results[pid]
+
+        assert label in problem
+        response = problem[label]
+
+        # extract_answer returns "" on failure, so the pid is picked up again on the next run
+        extraction  = extract_answer(response, problem, args.quick_extract)
+        results[pid]['extraction'] = extraction
+
+        if i % args.save_every == 0 or i == test_num - 1:
+            print(f"Saving results to {output_file}...")
+            save_json(results, output_file)
+            print("Results saved.")
diff --git a/minigemini/eval/MathVista/prompts/ext_ans.py b/minigemini/eval/MathVista/prompts/ext_ans.py
new file mode 100755
index 0000000000000000000000000000000000000000..964aedb4a449fced634b515cbcf81fb9ab8bf2fa
--- /dev/null
+++ b/minigemini/eval/MathVista/prompts/ext_ans.py
@@ -0,0 +1,42 @@
+# Few-shot prompt for extracting the final answer from a model response: five worked
+# examples, each with a hint, a question, a model response and the extracted answer.
+# pids = 852,  104,  824,  506,  540  (presumably the problems the examples come from -- TODO confirm)
+
+demo_prompt = """
+Please read the following example. Then extract the answer from the model response and type it at the end of the prompt.
+
+Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end.
+Question: Which number is missing?
+
+Model response: The number missing in the sequence is 14.
+
+Extracted answer: 14
+
+Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.
+Question: What is the fraction of females facing the camera?
+
+Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera.
+
+Extracted answer: 0.6
+
+Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end.
+Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $)
+
+Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.
+
+Extracted answer: 1.45
+
+Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.
+Question: Between which two years does the line  graph saw its maximum peak?
+
+Model response: The line graph saw its maximum peak between 2007 and 2008.
+
+Extracted answer: [2007, 2008]
+
+Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
+Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5
+
+Model response: The correct answer is (B) 8/11.
+
+Extracted answer: B
+"""
\ No newline at end of file
diff --git a/minigemini/eval/MathVista/utilities.py b/minigemini/eval/MathVista/utilities.py
new file mode 100755
index 0000000000000000000000000000000000000000..2290316e5bea5a07dad472f8be348bd78aa1b5da
--- /dev/null
+++ b/minigemini/eval/MathVista/utilities.py
@@ -0,0 +1,200 @@
+import os
+import cv2
+import json
+import time
+import pickle
+import openai
+import re
+from word2number import w2n
+
+
+def create_dir(output_dir):
+    """Create ``output_dir`` and any missing parents; an existing directory is left untouched."""
+    os.makedirs(output_dir, exist_ok=True)  # no exists()/makedirs() race
+    
+
+def read_csv(file):
+    """Return the lines of ``file`` as a list of whitespace-stripped strings.
+
+    No CSV parsing is done; every line is kept as one string."""
+    with open(file, 'r') as handle:
+        return [row.strip() for row in handle]
+
+
+def read_pandas_csv(csv_path):
+    """Load the CSV file at ``csv_path`` and return it as a pandas DataFrame."""
+    # pandas is imported inside the function, so it is only needed when this helper runs
+    import pandas
+    return pandas.read_csv(csv_path)
+
+
+def read_json(path):  # decode and return the UTF-8 JSON document stored at ``path``
+    with open(path, mode='r', encoding='utf-8') as source:
+        return json.loads(source.read())
+
+
+def read_jsonl(file):
+    """Return a list with the JSON value decoded from each line of ``file``."""
+    with open(file, 'r') as stream:
+        return list(map(json.loads, stream))
+
+
+def read_pickle(path):  # load and return the object pickled in the file ``path``
+    with open(path, 'rb') as f:
+        return pickle.load(f)  # NOTE(review): unpickling can execute arbitrary code; only use trusted files
+
+
+def save_json(data, path):  # serialize ``data`` as JSON (4-space indent) into ``path``, replacing its contents
+    with open(path, mode='w') as sink:
+        json.dump(obj=data, fp=sink, indent=4)
+
+
+def save_array_img(path, image):  # write ``image`` to the file ``path`` with cv2.imwrite
+    cv2.imwrite(path, image)  # imwrite's return value is not checked here
+
+
+def contains_digit(text):
+    """Return True if at least one character of ``text`` is a digit, else False.
+
+    Digits are detected with ``str.isdigit``."""
+    return any(ch.isdigit() for ch in text)
+    
+def contains_number_word(text):
+    """Return True if ``text`` contains a number word or any digit character.
+
+    Every word token of ``text``, except the ignored words "a", "an" and
+    "point", is passed to ``w2n.word_to_num``; the first token converted
+    without a ``ValueError`` makes the result True. Otherwise the result is
+    whether any character of ``text`` is a digit.
+    """
+    skipped = ("a", "an", "point")
+    for token in re.findall(r'\b\w+\b', text):
+        if token not in skipped:
+            try:
+                w2n.word_to_num(token)
+            except ValueError:
+                pass  # not a number word; try the next token
+            else:
+                return True
+    return any(ch.isdigit() for ch in text)
+
+
+def contains_quantity_word(text, special_keep_words=()):
+    """Return True if any word of ``text`` is a known quantity/comparison word.
+
+    ``special_keep_words`` adds dataset-specific words to the built-in vocabulary.
+    Matching is case-sensitive and only considers whole words.
+    """
+    quantity_words = {
+        "most", "least", "fewest",  # the comma after "fewest" was missing, fusing it with "more"
+        "more", "less", "fewer",
+        "largest", "smallest", "greatest",
+        "larger", "smaller", "greater",
+        "highest", "lowest", "higher", "lower",
+        "increase", "decrease",
+        "minimum", "maximum", "max", "min",
+        "mean", "average", "median",
+        "total", "sum", "add", "subtract",
+        "difference", "quotient", "gap",
+        "half", "double", "twice", "triple",
+        "square", "cube", "root",
+        "approximate", "approximation",
+        "triangle", "rectangle", "circle", "sphere", "cylinder", "cone", "pyramid",
+        "multiply", "divide",
+        "percentage", "percent", "ratio", "proportion", "fraction", "rate",
+    }
+    quantity_words.update(special_keep_words)  # dataset specific words
+    # True as soon as one whole word of the text is in the vocabulary
+    return any(word in quantity_words for word in re.findall(r'\b\w+\b', text))
+
+
+def is_bool_word(text):
+    """Return True if ``text`` is yes/no/true/false written in lower, Title or UPPER case."""
+    accepted = []
+    for word in ("yes", "no", "true", "false"):
+        accepted += [word.capitalize(), word, word.upper()]
+    return text in accepted
+
+
+def is_digit_string(text):
+    """Return True if ``text`` parses as an integer once a trailing ".0..." is dropped."""
+    # surrounding whitespace is ignored; e.g. "12.000" is reduced to "12"
+    candidate = re.sub(r'\.0+$', '', text.strip())
+    try:
+        int(candidate)
+    except ValueError:
+        return False
+    return True
+   
+    
+def is_float_string(text):
+    """Return True if ``text`` contains a "." and ``float(text)`` succeeds."""
+    if "." not in text:
+        return False  # strings without a dot, such as "3", are not counted as floats
+    try:
+        float(text)
+    except ValueError:
+        return False
+    return True
+
+
+def copy_image(image_path, output_image_path):  # copy the file's contents to ``output_image_path``
+    import shutil
+    shutil.copyfile(image_path, output_image_path)
+
+
+def copy_dir(src_dir, dst_dir):
+    """Recursively copy the directory tree at ``src_dir`` to ``dst_dir``."""
+    import shutil
+    shutil.copytree(src_dir, dst_dir)
+
+
+import PIL.Image as Image
+def get_image_size(img_path):
+    """Return ``(width, height)`` of the image at ``img_path``; the file is closed afterwards."""
+    with Image.open(img_path) as img:  # context manager releases the file handle
+        return img.size
+
+
+def get_chat_response(promot, api_key, api_base, model="gpt-3.5-turbo", temperature=0,
+                      max_tokens=256, n=1, patience=10000000, sleep_time=0):
+    """Send ``promot`` as a single user message to ``openai.ChatCompletion`` with retries.
+
+    At most ``patience`` attempts are made. The stripped reply text is returned
+    when ``n == 1``, otherwise the list of stripped reply texts; an attempt whose
+    (first) reply is empty is repeated. "" is returned once the attempts are
+    used up.
+
+    When an attempt raises, the error is printed unless its text contains
+    "Rate limit"; if it contains "Please reduce the length of the messages" the
+    prompt is cut to roughly its last 90% of characters. ``sleep_time`` seconds
+    are slept after an attempt that raised, when it is positive.
+    """
+    remaining = patience
+    while remaining > 0:
+        remaining -= 1
+        try:
+            response = openai.ChatCompletion.create(
+                model=model, messages=[{"role": "user", "content": promot}],
+                api_key=api_key, api_base=api_base,
+                temperature=temperature, max_tokens=max_tokens, n=n)
+            if n == 1:
+                prediction = response['choices'][0]['message']['content'].strip()
+                first_reply = prediction
+            else:
+                prediction = [choice['message']['content'].strip() for choice in response['choices']]
+                first_reply = prediction[0]
+            # an empty (first) reply counts as a failed attempt
+            if first_reply:
+                return prediction
+        except Exception as err:
+            reason = str(err)
+            if "Rate limit" not in reason:
+                print(err)
+            if "Please reduce the length of the messages" in reason:
+                print("!!Reduce promot size")
+                # reduce input prompt and keep the tail
+                promot = promot[len(promot) - int(len(promot) * 0.9):]
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+    return ""
\ No newline at end of file
diff --git a/minigemini/eval/eval_gpt_review.py b/minigemini/eval/eval_gpt_review.py
new file mode 100644
index 0000000000000000000000000000000000000000..8af4559c65fc2728b11fd2097a109981ee1ef686
--- /dev/null
+++ b/minigemini/eval/eval_gpt_review.py
@@ -0,0 +1,113 @@
+import argparse
+import json
+import os
+
+import openai
+import tqdm
+import ray
+import time
+
+NUM_SECONDS_TO_SLEEP = 3
+
+@ray.remote(num_cpus=4)
+def get_eval(content: str, max_tokens: int):
+    """Ask the 'gpt-4' chat model to review ``content`` and return its first reply's text.
+
+    The request is repeated until it succeeds. After a failed attempt the loop
+    sleeps NUM_SECONDS_TO_SLEEP seconds; rate-limit errors are retried silently
+    and any other exception is printed first.
+    """
+    system_text = 'You are a helpful and precise assistant for checking the quality of the answer.'
+    messages = [{'role': 'system', 'content': system_text}, {'role': 'user', 'content': content}]
+    while True:
+        try:
+            response = openai.ChatCompletion.create(
+                model='gpt-4', messages=messages,
+                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
+                max_tokens=max_tokens)
+            break
+        except openai.error.RateLimitError:
+            pass
+        except Exception as e:
+            print(e)
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+
+    print('success!')
+    return response['choices'][0]['message']['content']
+
+
+def parse_score(review):
+    """Return the two scores on the first line of ``review`` as ``[float, float]``.
+
+    Scores may be separated by commas and/or whitespace; on any parse failure the
+    review is printed and ``[-1, -1]`` is returned."""
+    try:
+        # whitespace split() collapses repeated separators, so "8, 9" parses too
+        sp = review.split('\n')[0].replace(',', ' ').split()
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+    except Exception as e:
+        print(e)
+    print('error', review)
+    return [-1, -1]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    parser.add_argument('-q', '--question')
+    # exactly two answer files are read: answer_list[0] and answer_list[1]
+    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
+    parser.add_argument('-r', '--rule')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+
+    ray.init()
+
+    f_q = open(os.path.expanduser(args.question))
+    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
+    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
+    with open(os.path.expanduser(args.rule), 'r') as f_rule:
+        rule_dict = json.load(f_rule)
+
+    review_file = open(os.path.expanduser(args.output), 'w')  # expand "~" like the other path arguments
+
+    js_list = []
+    handles = []
+    idx = 0
+    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+        # one JSON record per line; the three files are read in lockstep
+        ques = json.loads(ques_js)
+        ans1 = json.loads(ans1_js)
+        ans2 = json.loads(ans2_js)
+
+        category = ques['category']
+        if category in rule_dict:
+            rule = rule_dict[category]
+        else:
+            rule = rule_dict['default']
+        prompt = rule['prompt']
+        role = rule['role']
+        content = (f'[Question]\n{ques["text"]}\n\n'
+                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                   f'[System]\n{prompt}\n\n')
+        js_list.append({
+            'id': idx+1,
+            'question_id': ques['question_id'],
+            'answer1_id': ans1['answer_id'],
+            'answer2_id': ans2['answer_id'],
+            'category': category})
+        idx += 1
+        handles.append(get_eval.remote(content, args.max_tokens))
+        # To avoid the rate limit set by OpenAI
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+
+    reviews = ray.get(handles)
+    for idx, review in enumerate(reviews):
+        scores = parse_score(review)
+        js_list[idx]['content'] = review
+        js_list[idx]['tuple'] = scores
+        review_file.write(json.dumps(js_list[idx]) + '\n')
+    for handle in (review_file, f_q, f_ans1, f_ans2):
+        handle.close()
diff --git a/minigemini/eval/eval_gpt_review_bench.py b/minigemini/eval/eval_gpt_review_bench.py
new file mode 100644
index 0000000000000000000000000000000000000000..06160f2422b5368f30fb967f7cae635208a1dc69
--- /dev/null
+++ b/minigemini/eval/eval_gpt_review_bench.py
@@ -0,0 +1,121 @@
+import argparse
+import json
+import os
+
+import openai
+import time
+
+NUM_SECONDS_TO_SLEEP = 0.5
+
+
+def get_eval(content: str, max_tokens: int):
+    """Ask the 'gpt-4-0314' chat model to review ``content`` and return its first reply's text.
+
+    The request is repeated until it succeeds. After a failed attempt the loop
+    sleeps NUM_SECONDS_TO_SLEEP seconds; rate-limit errors are retried silently
+    and any other exception is printed first.
+    """
+    system_text = 'You are a helpful and precise assistant for checking the quality of the answer.'
+    messages = [{'role': 'system', 'content': system_text}, {'role': 'user', 'content': content}]
+    while True:
+        try:
+            response = openai.ChatCompletion.create(
+                model='gpt-4-0314', messages=messages,
+                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
+                max_tokens=max_tokens)
+            break
+        except openai.error.RateLimitError:
+            pass
+        except Exception as e:
+            print(e)
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+
+    return response['choices'][0]['message']['content']
+
+
+def parse_score(review):
+    """Return the two scores on the first line of ``review`` as ``[float, float]``.
+
+    Scores may be separated by commas and/or whitespace; on any parse failure the
+    review is printed and ``[-1, -1]`` is returned."""
+    try:
+        # whitespace split() collapses repeated separators, so "8, 9" parses too
+        sp = review.split('\n')[0].replace(',', ' ').split()
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+    except Exception as e:
+        print(e)
+    print('error', review)
+    return [-1, -1]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    parser.add_argument('-q', '--question')
+    parser.add_argument('-c', '--context')
+    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
+    parser.add_argument('-r', '--rule')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+
+    f_q = open(os.path.expanduser(args.question))
+    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
+    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
+    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
+
+    if os.path.isfile(os.path.expanduser(args.output)):
+        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
+    else:
+        cur_reviews = []
+
+    review_file = open(os.path.expanduser(args.output), 'a')  # append to the same expanded path that was checked above
+
+    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
+    image_to_context = {context['image']: context for context in context_list}
+
+    # resume support: items whose index is below len(cur_reviews) are already in the output file
+    idx = 0
+    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+        ques = json.loads(ques_js)
+        ans1 = json.loads(ans1_js)
+        ans2 = json.loads(ans2_js)
+
+        inst = image_to_context[ques['image']]
+
+        if isinstance(inst['caption'], list):
+            cap_str = '\n'.join(inst['caption'])
+        else:
+            cap_str = inst['caption']
+
+        category = 'llava_bench_' + ques['category']
+        if category in rule_dict:
+            rule = rule_dict[category]
+        else:
+            assert False, f"Visual QA category not found in rule file: {category}."
+        prompt = rule['prompt']
+        role = rule['role']
+        content = (f'[Context]\n{cap_str}\n\n'
+                   f'[Question]\n{ques["text"]}\n\n'
+                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                   f'[System]\n{prompt}\n\n')
+        cur_js = {
+            'id': idx+1,
+            'question_id': ques['question_id'],
+            'answer1_id': ans1.get('answer_id', ans1['question_id']),
+            'answer2_id': ans2.get('answer_id', ans2['question_id']),  # was ans2['answer_id']: KeyError whenever the fallback was needed
+            'category': category
+        }
+        if idx >= len(cur_reviews):
+            review = get_eval(content, args.max_tokens)
+            scores = parse_score(review)
+            cur_js['content'] = review
+            cur_js['tuple'] = scores
+            review_file.write(json.dumps(cur_js) + '\n')
+            review_file.flush()
+        else:
+            print(f'Skipping {idx} as we already have it.')
+        idx += 1
+        print(idx)
+    review_file.close()
diff --git a/minigemini/eval/eval_gpt_review_visual.py b/minigemini/eval/eval_gpt_review_visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6e407a400a67020d801e6c27a3c32a2ee38f30c
--- /dev/null
+++ b/minigemini/eval/eval_gpt_review_visual.py
@@ -0,0 +1,118 @@
+import argparse
+import json
+import os
+
+import openai
+import time
+
+NUM_SECONDS_TO_SLEEP = 0.5
+
+
+def get_eval(content: str, max_tokens: int):
+    """Request a review of ``content`` from the 'gpt-4-0314' chat model; return the first reply's text.
+
+    Failed requests are retried indefinitely, with a pause of
+    NUM_SECONDS_TO_SLEEP seconds after each failure. A rate-limit error is
+    ignored silently; every other exception is printed before the retry.
+    """
+    system_text = 'You are a helpful and precise assistant for checking the quality of the answer.'
+    messages = [{'role': 'system', 'content': system_text}, {'role': 'user', 'content': content}]
+    while True:
+        try:
+            response = openai.ChatCompletion.create(
+                model='gpt-4-0314', messages=messages,
+                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
+                max_tokens=max_tokens)
+            break
+        except openai.error.RateLimitError:
+            pass
+        except Exception as e:
+            print(e)
+        time.sleep(NUM_SECONDS_TO_SLEEP)
+
+    return response['choices'][0]['message']['content']
+
+
+def parse_score(review):
+    """Return the two scores on the first line of ``review`` as ``[float, float]``.
+
+    Scores may be separated by commas and/or whitespace; on any parse failure the
+    review is printed and ``[-1, -1]`` is returned."""
+    try:
+        # whitespace split() collapses repeated separators, so "8, 9" parses too
+        sp = review.split('\n')[0].replace(',', ' ').split()
+        if len(sp) == 2:
+            return [float(sp[0]), float(sp[1])]
+    except Exception as e:
+        print(e)
+    print('error', review)
+    return [-1, -1]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    parser.add_argument('-q', '--question')
+    parser.add_argument('-c', '--context')
+    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
+    parser.add_argument('-r', '--rule')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+
+    f_q = open(os.path.expanduser(args.question))
+    f_ans1 = open(os.path.expanduser(args.answer_list[0]))
+    f_ans2 = open(os.path.expanduser(args.answer_list[1]))
+    rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
+
+    if os.path.isfile(os.path.expanduser(args.output)):
+        cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
+    else:
+        cur_reviews = []
+
+    review_file = open(os.path.expanduser(args.output), 'a')  # append to the same expanded path that was checked above
+
+    context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
+    image_to_context = {context['image']: context for context in context_list}
+
+    # resume support: items whose index is below len(cur_reviews) are already in the output file
+    idx = 0
+    for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
+        ques = json.loads(ques_js)
+        ans1 = json.loads(ans1_js)
+        ans2 = json.loads(ans2_js)
+
+        inst = image_to_context[ques['image']]
+        cap_str = '\n'.join(inst['captions'])
+        box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])
+
+        category = ques['category']
+        if category in rule_dict:
+            rule = rule_dict[category]
+        else:
+            assert False, f"Visual QA category not found in rule file: {category}."
+        prompt = rule['prompt']
+        role = rule['role']
+        content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
+                   f'[Question]\n{ques["text"]}\n\n'
+                   f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
+                   f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
+                   f'[System]\n{prompt}\n\n')
+        cur_js = {
+            'id': idx+1,
+            'question_id': ques['question_id'],
+            'answer1_id': ans1.get('answer_id', ans1['question_id']),
+            'answer2_id': ans2.get('answer_id', ans2['question_id']),  # was ans2['answer_id']: KeyError whenever the fallback was needed
+            'category': category
+        }
+        if idx >= len(cur_reviews):
+            review = get_eval(content, args.max_tokens)
+            scores = parse_score(review)
+            cur_js['content'] = review
+            cur_js['tuple'] = scores
+            review_file.write(json.dumps(cur_js) + '\n')
+            review_file.flush()
+        else:
+            print(f'Skipping {idx} as we already have it.')
+        idx += 1
+        print(idx)
+    review_file.close()
diff --git a/minigemini/eval/eval_pope.py b/minigemini/eval/eval_pope.py
new file mode 100644
index 0000000000000000000000000000000000000000..b115b8f2327ea9d972f9e41bcbb03c68be6b3508
--- /dev/null
+++ b/minigemini/eval/eval_pope.py
@@ -0,0 +1,81 @@
+import os
+import json
+import argparse
+
+def eval_pope(answers, label_file):
+    """Score yes/no answers against POPE labels and print the metrics.
+
+    ``label_file`` holds one JSON object per line with a 'label' field; the
+    label 'no' is the negative class and anything else is positive. Each
+    answer's 'text' is overwritten in place with the normalised 'yes'/'no'
+    prediction. Answers and labels are paired by position.
+
+    Prints the confusion counts, accuracy, precision, recall, F1 and yes ratio.
+    A ratio whose denominator is zero is reported as 0.0 instead of raising
+    ZeroDivisionError.
+    """
+    with open(label_file, 'r') as f:  # close the label file once it is read
+        label_list = [json.loads(q)['label'] for q in f]
+
+    for answer in answers:
+        text = answer['text']
+
+        # Only keep the first sentence
+        if text.find('.') != -1:
+            text = text.split('.')[0]
+
+        text = text.replace(',', '')
+        words = text.split(' ')
+        if 'No' in words or 'not' in words or 'no' in words:
+            answer['text'] = 'no'
+        else:
+            answer['text'] = 'yes'
+
+    # encode both sides as 0 (no) / 1 (yes)
+    label_list = [0 if label == 'no' else 1 for label in label_list]
+    pred_list = [0 if answer['text'] == 'no' else 1 for answer in answers]
+
+    def ratio(numerator, denominator):
+        # an empty class or empty input gives a zero denominator
+        return numerator / denominator if denominator else 0.0
+
+    yes_ratio = ratio(pred_list.count(1), len(pred_list))
+
+    pairs = list(zip(pred_list, label_list))
+    TP = pairs.count((1, 1))
+    FP = pairs.count((1, 0))
+    TN = pairs.count((0, 0))
+    FN = pairs.count((0, 1))
+
+    print('TP\tFP\tTN\tFN\t')
+    print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))
+
+    precision = ratio(float(TP), float(TP + FP))
+    recall = ratio(float(TP), float(TP + FN))
+    f1 = ratio(2*precision*recall, precision + recall)
+    acc = ratio(TP + TN, TP + TN + FP + FN)
+    print('Accuracy: {}'.format(acc))
+    print('Precision: {}'.format(precision))
+    print('Recall: {}'.format(recall))
+    print('F1 score: {}'.format(f1))
+    print('Yes ratio: {}'.format(yes_ratio))
+    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    for flag in ("--annotation-dir", "--question-file", "--result-file"):
+        parser.add_argument(flag, type=str)
+    args = parser.parse_args()
+
+    # index the question records by question_id to look up each answer's category
+    questions = {q['question_id']: q for q in map(json.loads, open(args.question_file))}
+    answers = list(map(json.loads, open(args.result_file)))
+    prefix, suffix = 'coco_pope_', '.json'
+    for file in os.listdir(args.annotation_dir):
+        assert file.startswith(prefix)
+        assert file.endswith(suffix)
+        category = file[len(prefix):-len(suffix)]  # the part between the prefix and the extension
+        cur_answers = [ans for ans in answers if questions[ans['question_id']]['category'] == category]
+        print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
+        eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
+        print("====================================")
diff --git a/minigemini/eval/eval_science_qa.py b/minigemini/eval/eval_science_qa.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccf206bbd7a5d6376eef82d61b3ef8bbe0f71c6c
--- /dev/null
+++ b/minigemini/eval/eval_science_qa.py
@@ -0,0 +1,114 @@
+import argparse
+import json
+import os
+import re
+import random
+
+
+def get_args():
+    """Parse the command-line arguments of the ScienceQA evaluation script."""
+    parser = argparse.ArgumentParser()
+    for flag in ('--base-dir', '--result-file', '--output-file', '--output-result'):
+        parser.add_argument(flag, type=str)
+    parser.add_argument('--split', type=str, default='test')
+    # type=list turns a value given on the command line into a list of its characters
+    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
+    return parser.parse_args()
+
+
+def convert_caps(results):
+    """Convert result records into caption dicts.
+
+    Each record's 'question_id' becomes an integer 'image_id' and its 'text'
+    becomes the 'caption'; the new dicts are returned as a list."""
+    return [{"image_id": int(entry['question_id']), "caption": entry['text']}
+            for entry in results]
+
+
+def get_pred_idx(prediction, choices, options):
+    """
+    Get the index (e.g. 2) from the prediction (e.g. 'C').
+
+    Returns -1 when ``prediction`` is not among the first ``len(choices)`` options.
+    """
+    if prediction in options[:len(choices)]:
+        return options.index(prediction)
+    return -1  # an invalid prediction is never replaced by a random guess
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    base_dir = args.base_dir
+    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
+    problems = json.load(open(os.path.join(base_dir, "problems.json")))
+    predictions = [json.loads(line) for line in open(args.result_file)]
+    predictions = {pred['question_id']: pred for pred in predictions}
+    split_problems = {idx: problems[idx] for idx in split_indices}
+
+    results = {'correct': [], 'incorrect': []}
+    sqa_results = {}
+    sqa_results['acc'] = None
+    sqa_results['correct'] = None
+    sqa_results['count'] = None
+    sqa_results['results'] = {}
+    sqa_results['outputs'] = {}
+    pattern = re.compile(r'The answer is ([A-Z]).')  # compiled once, outside the loop
+    for prob_id, prob in split_problems.items():
+        if prob_id not in predictions:
+            pred = {'text': 'FAILED', 'prompt': 'Unknown'}
+            pred_text = 'FAILED'
+        else:
+            pred = predictions[prob_id]
+            pred_text = pred['text']
+
+        if pred_text in args.options:
+            answer = pred_text
+        elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
+            answer = pred_text[0]
+        else:
+            # last resort: exactly one match of the "The answer is <letter>" pattern
+            res = pattern.findall(pred_text)
+            if len(res) == 1:
+                answer = res[0]  # 'A', 'B', ...
+            else:
+                answer = "FAILED"
+
+        pred_idx = get_pred_idx(answer, prob['choices'], args.options)
+
+        analysis = {
+            'question_id': prob_id,
+            'parsed_ans': answer,
+            'ground_truth': args.options[prob['answer']],
+            'question': pred['prompt'],
+            'pred': pred_text,
+            'is_multimodal': '<image>' in pred['prompt'],
+        }
+
+        sqa_results['results'][prob_id] = pred_idx
+        sqa_results['outputs'][prob_id] = pred_text
+
+        if pred_idx == prob['answer']:
+            results['correct'].append(analysis)
+        else:
+            results['incorrect'].append(analysis)
+
+    correct = len(results['correct'])
+    total = len(results['correct']) + len(results['incorrect'])
+
+    multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
+    multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
+    multimodal_total = multimodal_correct + multimodal_incorrect
+    # report 0 instead of raising ZeroDivisionError when a (sub)set is empty
+    accuracy = correct / total * 100 if total else 0.0
+    img_accuracy = multimodal_correct / multimodal_total * 100 if multimodal_total else 0.0
+    print(f'Total: {total}, Correct: {correct}, Accuracy: {accuracy:.2f}%, IMG-Accuracy: {img_accuracy:.2f}%')
+
+    sqa_results['acc'] = accuracy
+    sqa_results['correct'] = correct
+    sqa_results['count'] = total
+
+    with open(args.output_file, 'w') as f:
+        json.dump(results, f, indent=2)
+    with open(args.output_result, 'w') as f:
+        json.dump(sqa_results, f, indent=2)
diff --git a/minigemini/eval/eval_science_qa_gpt4.py b/minigemini/eval/eval_science_qa_gpt4.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2ff17c915481fb556aba6ec816a9e08f519c515
--- /dev/null
+++ b/minigemini/eval/eval_science_qa_gpt4.py
@@ -0,0 +1,104 @@
+import argparse
+import json
+import os
+import re
+import random
+from collections import defaultdict
+
+
+def get_args():
+    """Parse the command-line arguments of the script comparing our results with GPT-4's."""
+    parser = argparse.ArgumentParser()
+    for flag in ('--base-dir', '--gpt4-result', '--our-result'):
+        parser.add_argument(flag, type=str)
+    parser.add_argument('--split', type=str, default='test')
+    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
+    return parser.parse_args()
+
+
+def convert_caps(results):
+    """Build caption dicts from result records.
+
+    'question_id' is converted with ``int`` and stored as 'image_id'; 'text'
+    is stored as 'caption'. Returns the list of new dicts."""
+    return [{"image_id": int(record['question_id']), "caption": record['text']}
+            for record in results]
+
+
+def get_pred_idx(prediction, choices, options):
+    """Map a letter prediction (e.g. 'C') to its index (e.g. 2).
+
+    A prediction outside the valid options yields a random choice index."""
+    valid_options = options[:len(choices)]
+    if prediction not in valid_options:
+        return random.choice(range(len(choices)))
+    return options.index(prediction)
+
+
+if __name__ == "__main__":
+    # Scores GPT-4's parsed answers on the chosen split, falling back to our
+    # model's prediction when GPT-4 gave no parsable answer, and reports an
+    # upper bound where either system being right counts as correct.
+    args = get_args()
+
+    base_dir = args.base_dir
+    split_indices = json.load(open(os.path.join(base_dir, "pid_splits.json")))[args.split]
+    problems = json.load(open(os.path.join(base_dir, "problems.json")))
+    our_predictions = [json.loads(line) for line in open(args.our_result)]
+    our_predictions = {pred['question_id']: pred for pred in our_predictions}
+    split_problems = {idx: problems[idx] for idx in split_indices}
+
+    gpt4_predictions = json.load(open(args.gpt4_result))['outputs']
+
+    results = defaultdict(lambda: 0)
+    # compiled once instead of on every loop iteration
+    pattern = re.compile(r'The answer is ([A-Z]).')
+
+    for prob_id, prob in split_problems.items():
+        # only problems answered by both systems are scored
+        if prob_id not in our_predictions:
+            continue
+        if prob_id not in gpt4_predictions:
+            continue
+        our_pred = our_predictions[prob_id]['text']
+        gpt4_pred = gpt4_predictions[prob_id]
+
+        our_res = pattern.findall(our_pred)
+        if len(our_res) == 1:
+            our_answer = our_res[0]  # 'A', 'B', ...
+        else:
+            our_answer = "FAILED"
+        gpt4_res = pattern.findall(gpt4_pred)
+        if len(gpt4_res) == 1:
+            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
+        else:
+            gpt4_answer = "FAILED"
+
+        # get_pred_idx picks a random choice index for an unparsable answer
+        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
+        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
+
+        if gpt4_answer == 'FAILED':
+            results['gpt4_failed'] += 1
+            # no parsable GPT-4 answer: fall back to our model's prediction
+            gpt4_pred_idx = our_pred_idx
+
+        # main score: the (possibly substituted) GPT-4 prediction
+        if gpt4_pred_idx == prob['answer']:
+            results['correct'] += 1
+        else:
+            results['incorrect'] += 1
+
+        # upper bound: either prediction matches the ground truth
+        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
+            results['correct_upperbound'] += 1
+
+    correct = results['correct']
+    total = results['correct'] + results['incorrect']
+    # when no problem was scored every count is 0, so dividing by 1 reports
+    # 0.00% instead of raising ZeroDivisionError
+    denom = total or 1
+    print(f'Total: {total}, Correct: {correct}, Accuracy: {correct / denom * 100:.2f}%')
+    print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / denom * 100:.2f}%')
+    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / denom * 100:.2f}%')
+
diff --git a/minigemini/eval/eval_science_qa_gpt4_requery.py b/minigemini/eval/eval_science_qa_gpt4_requery.py
new file mode 100644
index 0000000000000000000000000000000000000000..698546e995d365d1ccc2c25a87e6c5cd681e6eb6
--- /dev/null
+++ b/minigemini/eval/eval_science_qa_gpt4_requery.py
@@ -0,0 +1,149 @@
+import argparse
+import json
+import os
+import re
+import random
+from collections import defaultdict
+
+
+def get_args():
+    """Parse the command-line options of the requery evaluation script.
+
+    Returns:
+        argparse.Namespace with ``base_dir``, ``gpt4_result``,
+        ``requery_result``, ``our_result``, ``output_result``, ``split``
+        and ``options``.
+    """
+    parser = argparse.ArgumentParser()
+    # Plain string options without a default (they stay None when omitted).
+    for flag in ('--base-dir', '--gpt4-result', '--requery-result',
+                 '--our-result', '--output-result'):
+        parser.add_argument(flag, type=str)
+    parser.add_argument('--split', type=str, default='test')
+    # NOTE: ``type=list`` makes argparse call ``list()`` on the raw string, so
+    # a value given on the command line is split into single characters.
+    parser.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
+    parsed = parser.parse_args()
+    return parsed
+
+
+def convert_caps(results):
+    """Turn prediction records into caption-style records.
+
+    Args:
+        results: iterable of mappings with ``question_id`` and ``text`` keys.
+
+    Returns:
+        A list of ``{"image_id": int, "caption": ...}`` dicts, one per input
+        record and in the same order.
+    """
+    # Read both fields first, then convert the id, exactly one record at a time.
+    fields = ((item['question_id'], item['text']) for item in results)
+    return [{"image_id": int(qid), "caption": text} for qid, text in fields]
+
+
+def get_pred_idx(prediction, choices, options):
+    """
+    Map a predicted option letter (e.g. 'C') to its index (e.g. 2).
+
+    Only the first ``len(choices)`` entries of ``options`` count as valid
+    letters. Any other prediction (such as "FAILED") is replaced by a
+    uniformly random index into ``choices``.
+    """
+    num_choices = len(choices)
+    if prediction not in options[:num_choices]:
+        # No usable letter: fall back to a random guess.
+        return random.choice(range(num_choices))
+    return options.index(prediction)
+
+
+def _extract_answer(pattern, text):
+    """Return the option letter if ``pattern`` matches exactly once in ``text``, else "FAILED"."""
+    matches = pattern.findall(text)
+    return matches[0] if len(matches) == 1 else "FAILED"
+
+
+if __name__ == "__main__":
+    # Script entry point: compares our predictions, the GPT-4 predictions and
+    # the requery predictions against the ground truth of one split, prints
+    # accuracy figures and writes the requery-based results as JSON.
+    args = get_args()
+
+    base_dir = args.base_dir
+    # Context managers close every input file instead of leaking the handles.
+    with open(os.path.join(base_dir, "pid_splits.json")) as f:
+        split_indices = json.load(f)[args.split]
+    with open(os.path.join(base_dir, "problems.json")) as f:
+        problems = json.load(f)
+    with open(args.our_result) as f:
+        our_predictions = [json.loads(line) for line in f]
+    our_predictions = {pred['question_id']: pred for pred in our_predictions}
+    split_problems = {idx: problems[idx] for idx in split_indices}
+
+    with open(args.requery_result) as f:
+        requery_predictions = [json.loads(line) for line in f]
+    requery_predictions = {pred['question_id']: pred for pred in requery_predictions}
+
+    with open(args.gpt4_result) as f:
+        gpt4_predictions = json.load(f)['outputs']
+
+    # Counters default to 0 on first use.
+    results = defaultdict(int)
+
+    sqa_results = {}
+    sqa_results['acc'] = None
+    sqa_results['correct'] = None
+    sqa_results['count'] = None
+    sqa_results['results'] = {}
+    sqa_results['outputs'] = {}
+
+    # Compiled once: the pattern is the same for every problem.
+    pattern = re.compile(r'The answer is ([A-Z]).')
+
+    for prob_id, prob in split_problems.items():
+        # Explicit errors instead of ``assert False`` (asserts vanish under -O).
+        if prob_id not in our_predictions:
+            raise KeyError(f'problem {prob_id!r} is missing from --our-result')
+        if prob_id not in gpt4_predictions:
+            raise KeyError(f'problem {prob_id!r} is missing from --gpt4-result')
+        our_pred = our_predictions[prob_id]['text']
+        gpt4_pred = gpt4_predictions[prob_id]
+        if prob_id not in requery_predictions:
+            results['missing_requery'] += 1
+            requery_pred = "MISSING"
+        else:
+            requery_pred = requery_predictions[prob_id]['text']
+
+        our_answer = _extract_answer(pattern, our_pred)  # 'A', 'B', ... or "FAILED"
+        requery_answer = _extract_answer(pattern, requery_pred)
+        gpt4_answer = _extract_answer(pattern, gpt4_pred)
+
+        # A "FAILED" answer is turned into a random index by get_pred_idx.
+        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
+        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
+        requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)
+
+        results['total'] += 1
+
+        if gpt4_answer == 'FAILED':
+            results['gpt4_failed'] += 1
+            if gpt4_pred_idx == prob['answer']:
+                results['gpt4_correct'] += 1
+            # When GPT-4 gave no answer, the combined score uses our prediction.
+            if our_pred_idx == prob['answer']:
+                results['gpt4_ourvisual_correct'] += 1
+        elif gpt4_pred_idx == prob['answer']:
+            results['gpt4_correct'] += 1
+            results['gpt4_ourvisual_correct'] += 1
+
+        if our_pred_idx == prob['answer']:
+            results['our_correct'] += 1
+
+        if requery_answer == 'FAILED':
+            # No usable requery answer: fall back to our own prediction.
+            sqa_results['results'][prob_id] = our_pred_idx
+            if our_pred_idx == prob['answer']:
+                results['requery_correct'] += 1
+        else:
+            sqa_results['results'][prob_id] = requery_pred_idx
+            if requery_pred_idx == prob['answer']:
+                results['requery_correct'] += 1
+            else:
+                # Dump the disagreement; the separator is now plain text rather
+                # than a literal ``print(...)`` call embedded in the string.
+                print(f"""
+Question ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}
+Our ({our_answer}): {our_pred}
+GPT-4 ({gpt4_answer}): {gpt4_pred}
+Requery ({requery_answer}): {requery_pred}
+=====================================
+""")
+
+        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
+            results['correct_upperbound'] += 1
+
+    total = results['total']
+    if total == 0:
+        # Every percentage below divides by ``total``.
+        raise ValueError(f'split {args.split!r} contains no problems to evaluate')
+    print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {results["our_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {results["gpt4_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {results["gpt4_failed"] / total * 100:.2f}%')
+    print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {results["gpt4_ourvisual_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {results["requery_correct"] / total * 100:.2f}%')
+    print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {results["correct_upperbound"] / total * 100:.2f}%')
+
+    sqa_results['acc'] = results["requery_correct"] / total * 100
+    sqa_results['correct'] = results["requery_correct"]
+    sqa_results['count'] = total
+
+    with open(args.output_result, 'w') as f:
+        json.dump(sqa_results, f, indent=2)
+
diff --git a/minigemini/eval/eval_textvqa.py b/minigemini/eval/eval_textvqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ca9a583e8b21aa44025b3d56004ec31013e4b2
--- /dev/null
+++ b/minigemini/eval/eval_textvqa.py
@@ -0,0 +1,65 @@
+import os
+import argparse
+import json
+import re
+
+from minigemini.eval.m4c_evaluator import TextVQAAccuracyEvaluator
+
+
+def get_args():
+    """Parse the command-line options of the TextVQA evaluation script.
+
+    Returns:
+        argparse.Namespace with ``annotation_file``, ``result_file`` and
+        ``result_dir``; each is None when the option is omitted.
+    """
+    parser = argparse.ArgumentParser()
+    for flag in ('--annotation-file', '--result-file', '--result-dir'):
+        parser.add_argument(flag, type=str)
+    parsed = parser.parse_args()
+    return parsed
+
+
+def prompt_processor(prompt):
+    """Recover the lower-cased question text from an evaluation prompt.
+
+    Three prompt layouts are recognised:
+
+    * prompts starting with ``'OCR tokens: '``: the question is the text
+      between ``'Question: '`` and ``' Short answer:'``;
+    * three-line prompts containing ``'Reference OCR token: '``: the question
+      is the second line when the prompt starts with the OCR reference,
+      otherwise the first line;
+    * two-line prompts: the question is the first line.
+
+    Args:
+        prompt: the prompt string stored with a prediction.
+
+    Returns:
+        The question, lower-cased.
+
+    Raises:
+        ValueError: if the prompt matches none of the layouts above. (A real
+            exception is used because ``assert`` is stripped under ``-O``.)
+    """
+    lines = prompt.split('\n')
+    if prompt.startswith('OCR tokens: '):
+        pattern = r"Question: (.*?) Short answer:"
+        match = re.search(pattern, prompt, re.DOTALL)
+        if match is None:
+            raise ValueError(f'no "Question: ... Short answer:" section in prompt: {prompt!r}')
+        question = match.group(1)
+    elif 'Reference OCR token: ' in prompt and len(lines) == 3:
+        if prompt.startswith('Reference OCR token:'):
+            question = lines[1]
+        else:
+            question = lines[0]
+    elif len(lines) == 2:
+        question = lines[0]
+    else:
+        raise ValueError(f'unsupported prompt layout: {prompt!r}')
+
+    return question.lower()
+
+
+def eval_single(annotation_file, result_file):
+    """Score one prediction file against the annotations and print the accuracy.
+
+    Args:
+        annotation_file: path of a JSON file whose ``'data'`` entry lists the
+            annotations (each with ``image_id``, ``question`` and ``answers``).
+        result_file: path of a JSON Lines file with one prediction per line
+            (each with ``question_id``, ``prompt`` and ``text``).
+
+    Raises:
+        KeyError: if a prediction has no matching annotation.
+    """
+    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
+    print(experiment_name)
+    # ``with`` closes both input files; the bare ``open`` calls leaked them.
+    with open(annotation_file) as f:
+        annotations = json.load(f)['data']
+    # Annotations are looked up by (image id, lower-cased question).
+    annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
+    with open(result_file) as f:
+        results = [json.loads(line) for line in f]
+
+    pred_list = []
+    for result in results:
+        annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
+        pred_list.append({
+            "pred_answer": result['text'],
+            "gt_answers": annotation['answers'],
+        })
+
+    evaluator = TextVQAAccuracyEvaluator()
+    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Evaluate the single result file, when one was named.
+    if args.result_file is not None:
+        eval_single(args.annotation_file, args.result_file)
+
+    # Evaluate every ``.jsonl`` file of the result directory, in sorted order.
+    if args.result_dir is not None:
+        for entry_name in sorted(os.listdir(args.result_dir)):
+            if entry_name.endswith('.jsonl'):
+                eval_single(args.annotation_file, os.path.join(args.result_dir, entry_name))
+            else:
+                print(f'Skipping {entry_name}')
diff --git a/minigemini/eval/generate_webpage_data_from_table.py b/minigemini/eval/generate_webpage_data_from_table.py
new file mode 100644
index 0000000000000000000000000000000000000000..92602258ccd953a1d7137056aaf15c8de8166e21
--- /dev/null
+++ b/minigemini/eval/generate_webpage_data_from_table.py
@@ -0,0 +1,111 @@
+"""Generate json file for webpage."""
+import json
+import os
+import re
+
+# models = ['llama', 'alpaca', 'gpt35', 'bard']
+# Model names written under the 'models' key of the generated webpage/data.json.
+models = ['vicuna']
+
+
+def read_jsonl(path: str, key: str=None):
+    """Load a JSON Lines file.
+
+    Args:
+        path: file path; a leading ``~`` is expanded.
+        key: optional record field. When given, the records are sorted by
+            this field and returned as a dict keyed by it.
+
+    Returns:
+        A list of the decoded records in file order, or, when ``key`` is
+        given, a dict mapping ``record[key]`` to the record.
+    """
+    data = []
+    with open(os.path.expanduser(path)) as f:
+        for line in f:
+            # Lines keep their trailing newline, so test the stripped text;
+            # otherwise a blank line reaches json.loads and raises.
+            if not line.strip():
+                continue
+            data.append(json.loads(line))
+    if key is not None:
+        data.sort(key=lambda x: x[key])
+        data = {item[key]: item for item in data}
+    return data
+
+
+def trim_hanging_lines(s: str, n: int) -> str:
+    """Strip ``s`` and drop its first ``n`` lines, stripping again after each drop.
+
+    Raises:
+        IndexError: if the text runs out of newlines before ``n`` lines
+            have been dropped.
+    """
+    remaining = s.strip()
+    dropped = 0
+    while dropped < n:
+        # Keep only what follows the first newline.
+        remaining = remaining.split('\n', 1)[1].strip()
+        dropped += 1
+    return remaining
+
+
+if __name__ == '__main__':
+    # Script entry point: joins questions, answers and reviews from the
+    # table/ directory into one record per question and writes them, together
+    # with the model list, to webpage/data.json.
+    questions = read_jsonl('table/question.jsonl', key='question_id')
+
+    # alpaca_answers = read_jsonl('table/answer/answer_alpaca-13b.jsonl', key='question_id')
+    # bard_answers = read_jsonl('table/answer/answer_bard.jsonl', key='question_id')
+    # gpt35_answers = read_jsonl('table/answer/answer_gpt35.jsonl', key='question_id')
+    # llama_answers = read_jsonl('table/answer/answer_llama-13b.jsonl', key='question_id')
+    vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
+    ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')
+
+    review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')
+    # review_alpaca = read_jsonl('table/review/review_alpaca-13b_vicuna-13b.jsonl', key='question_id')
+    # review_bard = read_jsonl('table/review/review_bard_vicuna-13b.jsonl', key='question_id')
+    # review_gpt35 = read_jsonl('table/review/review_gpt35_vicuna-13b.jsonl', key='question_id')
+    # review_llama = read_jsonl('table/review/review_llama-13b_vicuna-13b.jsonl', key='question_id')
+
+    # One record per question id; every table above is indexed by that id.
+    records = []
+    for qid in questions.keys():
+        r = {
+            'id': qid,
+            'category': questions[qid]['category'],
+            'question': questions[qid]['text'],
+            'answers': {
+                # 'alpaca': alpaca_answers[qid]['text'],
+                # 'llama': llama_answers[qid]['text'],
+                # 'bard': bard_answers[qid]['text'],
+                # 'gpt35': gpt35_answers[qid]['text'],
+                'vicuna': vicuna_answers[qid]['text'],
+                'ours': ours_answers[qid]['text'],
+            },
+            'evaluations': {
+                # 'alpaca': review_alpaca[qid]['text'],
+                # 'llama': review_llama[qid]['text'],
+                # 'bard': review_bard[qid]['text'],
+                'vicuna': review_vicuna[qid]['content'],
+                # 'gpt35': review_gpt35[qid]['text'],
+            },
+            'scores': {
+                'vicuna': review_vicuna[qid]['tuple'],
+                # 'alpaca': review_alpaca[qid]['score'],
+                # 'llama': review_llama[qid]['score'],
+                # 'bard': review_bard[qid]['score'],
+                # 'gpt35': review_gpt35[qid]['score'],
+            },
+        }
+
+        # cleanup data
+        cleaned_evals = {}
+        for k, v in r['evaluations'].items():
+            v = v.strip()
+            lines = v.split('\n')
+            # trim the first line if it's a pair of numbers
+            if re.match(r'\d+[, ]+\d+', lines[0]):
+                lines = lines[1:]
+            v = '\n'.join(lines)
+            # Render the assistant labels in bold (Markdown ** markers).
+            cleaned_evals[k] = v.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')
+
+        r['evaluations'] = cleaned_evals
+        records.append(r)
+
+    # Reorder the records, this is optional
+    # The three passes below renumber the ids in place and depend on running
+    # in this order.
+    # Pass 1: ids up to 20 move up by 60, all other ids move down by 20.
+    for r in records:
+        if r['id'] <= 20:
+            r['id'] += 60
+        else:
+            r['id'] -= 20
+    # Pass 2: ids up to 50 move up by 10, ids 51-60 move down by 50; larger
+    # ids stay unchanged.
+    for r in records:
+        if r['id'] <= 50:
+            r['id'] += 10
+        elif 50 < r['id'] <= 60:
+            r['id'] -= 50
+    # Pass 3: id 7 becomes 1 and ids below 7 shift up by one.
+    for r in records:
+        if r['id'] == 7:
+            r['id'] = 1
+        elif r['id'] < 7:
+            r['id'] += 1 
+
+    records.sort(key=lambda x: x['id'])
+
+    # Write to file
+    with open('webpage/data.json', 'w') as f:
+        json.dump({'questions': records, 'models': models}, f, indent=2)
diff --git a/minigemini/eval/m4c_evaluator.py b/minigemini/eval/m4c_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..e30e958da061a4f0a0bfe34b12d2fcaeba7ff2f4
--- /dev/null
+++ b/minigemini/eval/m4c_evaluator.py
@@ -0,0 +1,334 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+import re
+
+from tqdm import tqdm
+
+
+class EvalAIAnswerProcessor:
+    """
+    Processes an answer similar to Eval AI
+        copied from
+        https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
+
+    Calling an instance normalises one answer string: it is lower-cased,
+    punctuation is removed or replaced by spaces, number words are mapped to
+    digits, articles are dropped and known contractions get their apostrophe
+    back.
+
+    The tables and regular expressions are kept exactly as copied, including
+    their quirks, so that the normalised strings stay unchanged.
+    """
+
+    # Apostrophe-less (or misplaced-apostrophe) spelling -> canonical form.
+    CONTRACTIONS = {
+        "aint": "ain't",
+        "arent": "aren't",
+        "cant": "can't",
+        "couldve": "could've",
+        "couldnt": "couldn't",
+        "couldn'tve": "couldn't've",
+        "couldnt've": "couldn't've",
+        "didnt": "didn't",
+        "doesnt": "doesn't",
+        "dont": "don't",
+        "hadnt": "hadn't",
+        "hadnt've": "hadn't've",
+        "hadn'tve": "hadn't've",
+        "hasnt": "hasn't",
+        "havent": "haven't",
+        "hed": "he'd",
+        "hed've": "he'd've",
+        "he'dve": "he'd've",
+        "hes": "he's",
+        "howd": "how'd",
+        "howll": "how'll",
+        "hows": "how's",
+        "Id've": "I'd've",
+        "I'dve": "I'd've",
+        "Im": "I'm",
+        "Ive": "I've",
+        "isnt": "isn't",
+        "itd": "it'd",
+        "itd've": "it'd've",
+        "it'dve": "it'd've",
+        "itll": "it'll",
+        "let's": "let's",
+        "maam": "ma'am",
+        "mightnt": "mightn't",
+        "mightnt've": "mightn't've",
+        "mightn'tve": "mightn't've",
+        "mightve": "might've",
+        "mustnt": "mustn't",
+        "mustve": "must've",
+        "neednt": "needn't",
+        "notve": "not've",
+        "oclock": "o'clock",
+        "oughtnt": "oughtn't",
+        "ow's'at": "'ow's'at",
+        "'ows'at": "'ow's'at",
+        "'ow'sat": "'ow's'at",
+        "shant": "shan't",
+        "shed've": "she'd've",
+        "she'dve": "she'd've",
+        "she's": "she's",
+        "shouldve": "should've",
+        "shouldnt": "shouldn't",
+        "shouldnt've": "shouldn't've",
+        "shouldn'tve": "shouldn't've",
+        "somebody'd": "somebodyd",
+        "somebodyd've": "somebody'd've",
+        "somebody'dve": "somebody'd've",
+        "somebodyll": "somebody'll",
+        "somebodys": "somebody's",
+        "someoned": "someone'd",
+        "someoned've": "someone'd've",
+        "someone'dve": "someone'd've",
+        "someonell": "someone'll",
+        "someones": "someone's",
+        "somethingd": "something'd",
+        "somethingd've": "something'd've",
+        "something'dve": "something'd've",
+        "somethingll": "something'll",
+        "thats": "that's",
+        "thered": "there'd",
+        "thered've": "there'd've",
+        "there'dve": "there'd've",
+        "therere": "there're",
+        "theres": "there's",
+        "theyd": "they'd",
+        "theyd've": "they'd've",
+        "they'dve": "they'd've",
+        "theyll": "they'll",
+        "theyre": "they're",
+        "theyve": "they've",
+        "twas": "'twas",
+        "wasnt": "wasn't",
+        "wed've": "we'd've",
+        "we'dve": "we'd've",
+        "weve": "we've",
+        "werent": "weren't",
+        "whatll": "what'll",
+        "whatre": "what're",
+        "whats": "what's",
+        "whatve": "what've",
+        "whens": "when's",
+        "whered": "where'd",
+        "wheres": "where's",
+        "whereve": "where've",
+        "whod": "who'd",
+        "whod've": "who'd've",
+        "who'dve": "who'd've",
+        "wholl": "who'll",
+        "whos": "who's",
+        "whove": "who've",
+        "whyll": "why'll",
+        "whyre": "why're",
+        "whys": "why's",
+        "wont": "won't",
+        "wouldve": "would've",
+        "wouldnt": "wouldn't",
+        "wouldnt've": "wouldn't've",
+        "wouldn'tve": "wouldn't've",
+        "yall": "y'all",
+        "yall'll": "y'all'll",
+        "y'allll": "y'all'll",
+        "yall'd've": "y'all'd've",
+        "y'alld've": "y'all'd've",
+        "y'all'dve": "y'all'd've",
+        "youd": "you'd",
+        "youd've": "you'd've",
+        "you'dve": "you'd've",
+        "youll": "you'll",
+        "youre": "you're",
+        "youve": "you've",
+    }
+
+    # Number words replaced by their digit string.
+    NUMBER_MAP = {
+        "none": "0",
+        "zero": "0",
+        "one": "1",
+        "two": "2",
+        "three": "3",
+        "four": "4",
+        "five": "5",
+        "six": "6",
+        "seven": "7",
+        "eight": "8",
+        "nine": "9",
+        "ten": "10",
+    }
+    # Words dropped entirely from the answer.
+    ARTICLES = ["a", "an", "the"]
+    # NOTE(review): ``(?!<=\d)`` is a negative lookahead for the literal text
+    # "<=" plus a digit, not a lookbehind, so in effect this matches every "."
+    # that is not followed by a digit. Left as copied to keep results stable.
+    PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
+    # One or more commas standing between two digits.
+    COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
+    PUNCTUATIONS = [
+        ";",
+        r"/",
+        "[",
+        "]",
+        '"',
+        "{",
+        "}",
+        "(",
+        ")",
+        "=",
+        "+",
+        "\\",
+        "_",
+        "-",
+        ">",
+        "<",
+        "@",
+        "`",
+        ",",
+        "?",
+        "!",
+    ]
+
+    def __init__(self, *args, **kwargs):
+        """Accept and ignore any arguments; the processor holds no state."""
+        pass
+
+    def word_tokenize(self, word):
+        """Lower-case ``word``, drop "," and "?", and split "'s" off with a space."""
+        word = word.lower()
+        word = word.replace(",", "").replace("?", "").replace("'s", " 's")
+        return word.strip()
+
+    def process_punctuation(self, in_text):
+        """Remove punctuation from ``in_text`` and strip periods.
+
+        A punctuation mark is deleted when it touches a space somewhere in
+        the ORIGINAL text, or when the text contains a comma between digits;
+        otherwise it is replaced by a space.
+        """
+        out_text = in_text
+        for p in self.PUNCTUATIONS:
+            if (p + " " in in_text or " " + p in in_text) or (
+                re.search(self.COMMA_STRIP, in_text) is not None
+            ):
+                out_text = out_text.replace(p, "")
+            else:
+                out_text = out_text.replace(p, " ")
+        # The copied code passes ``re.UNICODE`` positionally, where it lands in
+        # ``count`` and caps the number of replacements at its integer value.
+        # Spelled as a keyword here so the behaviour is the same but explicit.
+        out_text = self.PERIOD_STRIP.sub("", out_text, count=re.UNICODE)
+        return out_text
+
+    def process_digit_article(self, in_text):
+        """Map number words to digits, drop articles and restore contractions."""
+        out_text = []
+        temp_text = in_text.lower().split()
+        for word in temp_text:
+            # ``get`` instead of ``setdefault``: the latter inserted every word
+            # ever seen into the shared class-level NUMBER_MAP.
+            word = self.NUMBER_MAP.get(word, word)
+            if word not in self.ARTICLES:
+                out_text.append(word)
+        for word_id, word in enumerate(out_text):
+            if word in self.CONTRACTIONS:
+                out_text[word_id] = self.CONTRACTIONS[word]
+        out_text = " ".join(out_text)
+        return out_text
+
+    def __call__(self, item):
+        """Return the fully normalised form of the answer string ``item``."""
+        item = self.word_tokenize(item)
+        item = item.replace("\n", " ").replace("\t", " ").strip()
+        item = self.process_punctuation(item)
+        item = self.process_digit_article(item)
+        return item
+
+
+class TextVQAAccuracyEvaluator:
+    """Soft accuracy of predictions against ten human answers per question."""
+
+    def __init__(self):
+        self.answer_processor = EvalAIAnswerProcessor()
+
+    def _compute_answer_scores(self, raw_answers):
+        """
+        Score every distinct human answer with its soft accuracy.
+
+        For each of the ten answers in turn, that answer is held out and the
+        candidate is credited ``min(1, matches / 3)``, where ``matches``
+        counts the remaining answers equal to the candidate. The candidate's
+        score is the mean over the ten hold-outs.
+
+        Returns a dict mapping each distinct normalised answer to its score.
+        """
+        answers = [self.answer_processor(a) for a in raw_answers]
+        assert len(answers) == 10
+
+        unique_answer_scores = {}
+        for candidate in set(answers):
+            accs = []
+            for held_out in range(len(answers)):
+                # Answers other than the held-out one that equal the candidate.
+                num_matching = sum(
+                    1
+                    for pos, answer in enumerate(answers)
+                    if pos != held_out and answer == candidate
+                )
+                accs.append(min(1, float(num_matching) / 3))
+            unique_answer_scores[candidate] = sum(accs) / len(accs)
+
+        return unique_answer_scores
+
+    def eval_pred_list(self, pred_list):
+        """Return the mean soft accuracy over ``pred_list``.
+
+        Each entry provides ``pred_answer`` and ``gt_answers``; a prediction
+        that matches no human answer scores 0.0.
+        """
+        pred_scores = []
+        for entry in tqdm(pred_list):
+            prediction = self.answer_processor(entry["pred_answer"])
+            answer_scores = self._compute_answer_scores(entry["gt_answers"])
+            pred_scores.append(answer_scores.get(prediction, 0.0))
+
+        return sum(pred_scores) / len(pred_scores)
+
+
+class STVQAAccuracyEvaluator:
+    """Exact-match accuracy after answer normalisation."""
+
+    def __init__(self):
+        self.answer_processor = EvalAIAnswerProcessor()
+
+    def eval_pred_list(self, pred_list):
+        """Return the fraction of entries whose normalised ``pred_answer``
+        equals one of the normalised ``gt_answers``."""
+        normalize = self.answer_processor
+        pred_scores = []
+        for entry in pred_list:
+            prediction = normalize(entry["pred_answer"])
+            references = [normalize(a) for a in entry["gt_answers"]]
+            pred_scores.append(1.0 if prediction in references else 0.0)
+
+        return sum(pred_scores) / len(pred_scores)
+
+
+class STVQAANLSEvaluator:
+    """Scores predictions by normalised edit-distance similarity (ANLS)."""
+
+    def __init__(self):
+        import editdistance  # install with `pip install editdistance`
+
+        self.get_edit_distance = editdistance.eval
+
+    def get_anls(self, s1, s2):
+        """Return the similarity of ``s1`` and ``s2`` after lower-casing and stripping.
+
+        The similarity is ``1 - edit_distance / max(len)``; values below 0.5
+        are clamped to 0.0. Two strings that are both empty after stripping
+        are identical and score 1.0 (previously this divided by zero).
+        """
+        s1 = s1.lower().strip()
+        s2 = s2.lower().strip()
+        longest = max(len(s1), len(s2))
+        if longest == 0:
+            return 1.0
+        iou = 1 - self.get_edit_distance(s1, s2) / longest
+        anls = iou if iou >= 0.5 else 0.0
+        return anls
+
+    def eval_pred_list(self, pred_list):
+        """Return the mean, over ``pred_list``, of each prediction's best
+        similarity against its ``gt_answers``."""
+        pred_scores = []
+        for entry in pred_list:
+            anls = max(
+                self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
+            )
+            pred_scores.append(anls)
+
+        accuracy = sum(pred_scores) / len(pred_scores)
+        return accuracy
+
+
+class TextCapsBleu4Evaluator:
+    """BLEU-4 of predicted captions, computed with pycocoevalcap."""
+
+    def __init__(self):
+        # The following script requires Java 1.8.0 and pycocotools installed.
+        # The pycocoevalcap can be installed with pip as
+        # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
+        # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
+        # but has no python3 support yet.
+        try:
+            from pycocoevalcap.bleu.bleu import Bleu
+            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
+        except ModuleNotFoundError:
+            install_hint = (
+                "Please install pycocoevalcap module using "
+                "pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
+            )
+            print(install_hint)
+            raise
+
+        self.tokenizer = PTBTokenizer()
+        self.scorer = Bleu(4)
+
+    def eval_pred_list(self, pred_list):
+        """Return the BLEU-4 score of all ``pred_answer`` captions against
+        their ``gt_answers`` reference captions."""
+        # Reference and hypothesis captions, keyed by position in pred_list.
+        references = {}
+        hypotheses = {}
+        for position, entry in enumerate(pred_list):
+            references[position] = [{"caption": ref} for ref in entry["gt_answers"]]
+            hypotheses[position] = [{"caption": entry["pred_answer"]}]
+
+        references = self.tokenizer.tokenize(references)
+        hypotheses = self.tokenizer.tokenize(hypotheses)
+        bleu_scores, _ = self.scorer.compute_score(references, hypotheses)
+
+        # bleu_scores is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
+        return bleu_scores[3]
diff --git a/minigemini/eval/model_math_vista.py b/minigemini/eval/model_math_vista.py
new file mode 100755
index 0000000000000000000000000000000000000000..e09526e7a85e0ade23b8a83a8de6a77e87e9111d
--- /dev/null
+++ b/minigemini/eval/model_math_vista.py
@@ -0,0 +1,237 @@
+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+
+from PIL import Image
+import math
+
+def split_list(lst, n):
+    """Split a list into at most n (roughly) equal-sized chunks.
+
+    The chunk size is ``ceil(len(lst) / n)``, so fewer than ``n`` chunks come
+    back when the list does not fill all of them. An empty list yields no
+    chunks (previously it raised because ``range`` got a step of 0).
+    """
+    if not lst:
+        return []
+    chunk_size = math.ceil(len(lst) / n)  # rounded up, not floor division
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    """Return chunk number ``k`` of ``lst`` split into ``n`` chunks.
+
+    ``split_list`` can produce fewer than ``n`` chunks; an index at or past
+    the number of chunks then returns an empty list instead of raising
+    IndexError, so that surplus workers simply have nothing to do.
+    """
+    chunks = split_list(lst, n)
+    if k >= len(chunks):
+        return []
+    return chunks[k]
+
+
+def create_one_query(problem, shot_num, shot_type, use_caption):
+    """Build the text query for one problem.
+
+    The query consists of a hint line, the question (with its unit, if any)
+    and the lettered answer choices (if any), joined by newlines.
+
+    Args:
+        problem: mapping with the keys ``question``, ``unit``, ``choices``,
+            ``precision``, ``question_type`` and ``answer_type``.
+        shot_num: unused; kept for interface compatibility.
+        shot_type: ``'solution'`` or ``'code'``; selects the hint.
+        use_caption: unused; kept for interface compatibility.
+
+    Returns:
+        The query string, stripped of surrounding whitespace.
+
+    Raises:
+        AssertionError: for an unknown ``shot_type`` or an answer type that
+            does not fit the question type.
+    """
+    ### [1] Demo prompt
+    demo_prompt = ""
+
+    ### [2] Test query
+    # problem info
+    question = problem['question']
+    unit = problem['unit']
+    choices = problem['choices']
+    # caption = problem['caption']
+    precision = problem['precision']
+    question_type = problem['question_type']
+    answer_type = problem['answer_type']
+
+    # hint
+    # Default to no hint: a float answer whose precision is neither 1 nor 2
+    # matches no branch below and used to leave ``hint_text`` unbound. Empty
+    # elements are filtered out when the query is assembled.
+    hint_text = ""
+    if shot_type == 'solution':
+        if question_type == "multi_choice":
+            assert answer_type == "text"
+            hint_text = "Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end."
+        else:
+            assert answer_type in ["integer", "float", "list"]
+            if answer_type == "integer":
+                hint_text = "Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end."
+
+            elif answer_type == "float" and precision == 1:
+                hint_text = "Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end."
+
+            elif answer_type == "float" and precision == 2:
+                hint_text = "Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end."
+
+            elif answer_type == "list":
+                hint_text = "Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end."
+    else:
+        assert shot_type == 'code'
+        hint_text = "Hint: Please generate a python code to solve the problem"
+
+    # question
+    question_text = f"Question: {question}"
+    if unit:
+        question_text += f" (Unit: {unit})"
+
+    # choices
+    if choices:
+        # choices: (A) 1.2 (B) 1.3 (C) 1.4 (D) 1.5
+        texts = ["Choices:"]
+        for i, choice in enumerate(choices):
+            texts.append(f"({chr(ord('A')+i)}) {choice}")
+        choices_text = "\n".join(texts)
+    else:
+        choices_text = ""
+
+    elements = [hint_text, question_text, choices_text]
+    test_query = "\n".join([e for e in elements if e != ""])
+
+    ### [3] Final query
+    query = demo_prompt + "\n\n" + test_query
+    query = query.strip()
+    return query
+
+
+def eval_model(args):
+    """Generate a model response for every question of one chunk.
+
+    Loads the model named by ``args.model_path``, reads the question file,
+    keeps the chunk selected by ``args.num_chunks`` / ``args.chunk_idx`` and
+    appends one JSON line per answered question to ``args.answers_file``.
+    Questions whose pid already appears in that file are skipped, so an
+    interrupted run can be resumed.
+
+    Args:
+        args: parsed command-line namespace; the attributes read here are
+            ``model_path``, ``model_base``, ``load_8bit``, ``question_file``,
+            ``num_chunks``, ``chunk_idx``, ``answers_file``, ``image_folder``,
+            ``conv_mode`` and ``temperature``.
+    """
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name,
+                                                                           load_8bit=args.load_8bit)
+
+    with open(os.path.expanduser(args.question_file), "r") as f:
+        questions = json.load(f)
+    questions = [dict(pid=pid, info=qs) for pid, qs in questions.items()]
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+
+    answers_file = os.path.expanduser(args.answers_file)
+    # A bare file name (such as the default "answer.jsonl") has an empty
+    # dirname, and os.makedirs("") raises, so only create a real directory.
+    answers_dir = os.path.dirname(answers_file)
+    if answers_dir:
+        os.makedirs(answers_dir, exist_ok=True)
+
+    # pids already answered by a previous run.
+    # NOTE(review): this reads a 'pid' field from each saved line, which is the
+    # dumped ``info`` dict; presumably the question entries carry 'pid' - confirm.
+    if os.path.exists(answers_file):
+        with open(answers_file, "r") as f:
+            done_pid = {json.loads(line)['pid'] for line in f}
+    else:
+        done_pid = set()
+
+    # ``with`` closes the output file even when generation raises.
+    with open(answers_file, "a") as ans_file:
+        for line in tqdm(questions):
+            idx = line['pid']
+            info = line['info']
+            if idx in done_pid:
+                continue
+
+            qs = create_one_query(
+                problem = info,
+                shot_num = 0,
+                shot_type = 'solution',
+                use_caption = False,
+            )
+            # Keep the query without image tokens for the output record.
+            query = qs
+
+            if 'image' in info:
+                image_file = info["image"]
+                image = Image.open(os.path.join(args.image_folder, image_file))
+
+                if hasattr(model.config, 'image_size_aux'):
+                    # Remember the processor's original crop size once, then
+                    # switch it to the auxiliary size.
+                    if not hasattr(image_processor, 'image_size_raw'):
+                        image_processor.image_size_raw = image_processor.crop_size.copy()
+                    image_processor.crop_size['height'] = model.config.image_size_aux
+                    image_processor.crop_size['width'] = model.config.image_size_aux
+                    image_processor.size['shortest_edge'] = model.config.image_size_aux
+
+                image_tensor = process_images([image], image_processor, model.config)[0]
+
+                image_grid = getattr(model.config, 'image_grid', 1)
+                if hasattr(model.config, 'image_size_aux'):
+                    # The auxiliary-size tensor is kept as ``images_aux``; the
+                    # main tensor is resized to raw size times the grid factor.
+                    raw_shape = [image_processor.image_size_raw['height'] * image_grid,
+                                 image_processor.image_size_raw['width'] * image_grid]
+                    image_tensor_aux = image_tensor
+                    image_tensor = torch.nn.functional.interpolate(image_tensor[None],
+                                                                   size=raw_shape,
+                                                                   mode='bilinear',
+                                                                   align_corners=False)[0]
+                else:
+                    image_tensor_aux = []
+
+                if image_grid >= 2:
+                    # Cut the image into image_grid x image_grid tiles of raw size.
+                    raw_image = image_tensor.reshape(3,
+                                                     image_grid,
+                                                     image_processor.image_size_raw['height'],
+                                                     image_grid,
+                                                     image_processor.image_size_raw['width'])
+                    raw_image = raw_image.permute(1, 3, 0, 2, 4)
+                    raw_image = raw_image.reshape(-1, 3,
+                                                  image_processor.image_size_raw['height'],
+                                                  image_processor.image_size_raw['width'])
+
+                    if getattr(model.config, 'image_global', False):
+                        global_image = image_tensor
+                        if len(global_image.shape) == 3:
+                            global_image = global_image[None]
+                        global_image = torch.nn.functional.interpolate(global_image,
+                                                                       size=[image_processor.image_size_raw['height'],
+                                                                             image_processor.image_size_raw['width']],
+                                                                       mode='bilinear',
+                                                                       align_corners=False)
+                        # [image_crops, image_global]
+                        raw_image = torch.cat([raw_image, global_image], dim=0)
+                    image_tensor = raw_image.contiguous()
+
+                images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
+                images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
+                if getattr(model.config, 'mm_use_im_start_end', False):
+                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+                else:
+                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+            else:
+                images = None
+                images_aux = None
+
+            conv = conv_templates[args.conv_mode].copy()
+            conv.append_message(conv.roles[0], qs)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+
+            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+
+            with torch.inference_mode():
+                output_ids = model.generate(
+                    input_ids,
+                    images=images,
+                    images_aux=images_aux,
+                    do_sample=True if args.temperature > 0 else False,
+                    temperature=args.temperature,
+                    max_new_tokens=1024,
+                    bos_token_id=tokenizer.bos_token_id,  # Begin of sequence token
+                    eos_token_id=tokenizer.eos_token_id,  # End of sequence token
+                    pad_token_id=tokenizer.pad_token_id,  # Pad token
+                    use_cache=True,
+                )
+
+            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+            info['query'] = query
+            info['response'] = outputs
+            ans_file.write(json.dumps(info) + "\n")
+            # Flush per answer so finished work survives an interruption.
+            ans_file.flush()
+
+if __name__ == "__main__":
+    def _str2bool(value):
+        """Parse a command-line boolean such as "true", "False", "1" or "0".
+
+        ``type=bool`` would call ``bool()`` on the raw string, which is True
+        for every non-empty value, including "False".
+        """
+        lowered = value.strip().lower()
+        if lowered in ('true', 't', 'yes', 'y', '1'):
+            return True
+        if lowered in ('false', 'f', 'no', 'n', '0', ''):
+            return False
+        raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.json")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="llava_v0")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    parser.add_argument("--answer-prompter", action="store_true")
+    # Still takes an explicit value ("--load_8bit True"), now parsed properly.
+    parser.add_argument('--load_8bit', type=_str2bool, default=False)
+    parser.add_argument("--single-pred-prompt", action="store_true")
+    args = parser.parse_args()
+
+    eval_model(args)
\ No newline at end of file
diff --git a/minigemini/eval/model_qa.py b/minigemini/eval/model_qa.py
new file mode 100644
index 0000000000000000000000000000000000000000..55f23363405597e06e19677b581c753ad9891728
--- /dev/null
+++ b/minigemini/eval/model_qa.py
@@ -0,0 +1,64 @@
+import argparse
+from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+
+from minigemini.conversation import default_conversation
+from minigemini.utils import disable_torch_init
+
+
+@torch.inference_mode()
+def eval_model(model_name, questions_file, answers_file):
+    # Model
+    disable_torch_init()
+    model_name = os.path.expanduser(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
+    model = AutoModelForCausalLM.from_pretrained(model_name,
+        torch_dtype=torch.float16).cuda()
+
+
+    ques_file = open(os.path.expanduser(questions_file), "r")
+    ans_file = open(os.path.expanduser(answers_file), "w")
+    for i, line in enumerate(tqdm(ques_file)):
+        idx = json.loads(line)["question_id"]
+        qs = json.loads(line)["text"]
+        cat = json.loads(line)["category"]
+        conv = default_conversation.copy()
+        conv.append_message(conv.roles[0], qs)
+        prompt = conv.get_prompt()
+        inputs = tokenizer([prompt])
+        input_ids = torch.as_tensor(inputs.input_ids).cuda()
+        output_ids = model.generate(
+            input_ids,
+            do_sample=True,
+            use_cache=True,
+            temperature=0.7,
+            max_new_tokens=1024,)
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
+        try:
+            index = outputs.index(conv.sep, len(prompt))
+        except ValueError:
+            outputs += conv.sep
+            index = outputs.index(conv.sep, len(prompt))
+
+        outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "metadata": {}}) + "\n")
+        ans_file.flush()
+    ans_file.close()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
+    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    args = parser.parse_args()
+
+    eval_model(args.model_name, args.question_file, args.answers_file)
\ No newline at end of file
diff --git a/minigemini/eval/model_vqa.py b/minigemini/eval/model_vqa.py
new file mode 100644
index 0000000000000000000000000000000000000000..32782f369ebc213ca42be1b5b10a3ba74e4fd127
--- /dev/null
+++ b/minigemini/eval/model_vqa.py
@@ -0,0 +1,154 @@
+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+
+from PIL import Image
+import math
+
+
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # integer division
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+
+
+def eval_model(args):
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+
+    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
+    ans_file = open(answers_file, "w")
+    for line in tqdm(questions):
+        idx = line["question_id"]
+        image_file = line["image"]
+        qs = line["text"]
+        cur_prompt = qs
+        
+        if hasattr(model, "update_prompt"):
+            model.update_prompt([[cur_prompt]])
+
+        if model.config.mm_use_im_start_end:
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+        else:
+            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+        conv = conv_templates[args.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+
+        image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
+        
+        if hasattr(model.config, 'image_size_aux'):
+            if not hasattr(image_processor, 'image_size_raw'):
+                image_processor.image_size_raw = image_processor.crop_size.copy()
+            image_processor.crop_size['height'] = model.config.image_size_aux
+            image_processor.crop_size['width'] = model.config.image_size_aux
+            image_processor.size['shortest_edge'] = model.config.image_size_aux
+        
+        image_tensor = process_images([image], image_processor, model.config)[0]
+
+        image_grid = getattr(model.config, 'image_grid', 1)
+        if hasattr(model.config, 'image_size_aux'):
+            raw_shape = [image_processor.image_size_raw['height'] * image_grid, 
+                        image_processor.image_size_raw['width'] * image_grid]
+            image_tensor_aux = image_tensor
+            image_tensor = torch.nn.functional.interpolate(image_tensor[None], 
+                                                        size=raw_shape, 
+                                                        mode='bilinear', 
+                                                        align_corners=False)[0]
+        else:
+            image_tensor_aux = []
+
+        if image_grid >= 2:            
+            raw_image = image_tensor.reshape(3, 
+                                            image_grid,
+                                            image_processor.image_size_raw['height'],
+                                            image_grid,
+                                            image_processor.image_size_raw['width'])
+            raw_image = raw_image.permute(1, 3, 0, 2, 4)
+            raw_image = raw_image.reshape(-1, 3,
+                                        image_processor.image_size_raw['height'],
+                                        image_processor.image_size_raw['width'])
+            
+            if getattr(model.config, 'image_global', False):
+                global_image = image_tensor
+                if len(global_image.shape) == 3:
+                    global_image = global_image[None]
+                global_image = torch.nn.functional.interpolate(global_image, 
+                                                        size=[image_processor.image_size_raw['height'],
+                                                            image_processor.image_size_raw['width']], 
+                                                        mode='bilinear', 
+                                                        align_corners=False)
+                # [image_crops, image_global]
+                raw_image = torch.cat([raw_image, global_image], dim=0)
+            image_tensor = raw_image.contiguous()
+
+        images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
+        images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
+
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=images,
+                images_aux=images_aux,
+                do_sample=True if args.temperature > 0 else False,
+                temperature=args.temperature,
+                top_p=args.top_p,
+                num_beams=args.num_beams,
+                max_new_tokens=1024,
+                bos_token_id=tokenizer.bos_token_id,  # Begin of sequence token
+                eos_token_id=tokenizer.eos_token_id,  # End of sequence token
+                pad_token_id=tokenizer.pad_token_id,  # Pad token
+                use_cache=True)
+
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "prompt": cur_prompt,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "metadata": {}}) + "\n")
+        ans_file.flush()
+    ans_file.close()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="llava_v1")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    parser.add_argument("--top_p", type=float, default=None)
+    parser.add_argument("--num_beams", type=int, default=1)
+    args = parser.parse_args()
+
+    eval_model(args)
\ No newline at end of file
diff --git a/minigemini/eval/model_vqa_loader.py b/minigemini/eval/model_vqa_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..3895f594a2275ec927e001cfe9f6069dfe11af6b
--- /dev/null
+++ b/minigemini/eval/model_vqa_loader.py
@@ -0,0 +1,187 @@
+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+from torch.utils.data import Dataset, DataLoader
+
+from PIL import Image
+import math
+
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # integer division
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+
+
+# Custom dataset class
+class CustomDataset(Dataset):
+    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
+        self.questions = questions
+        self.image_folder = image_folder
+        self.tokenizer = tokenizer
+        self.image_processor = image_processor
+        self.model_config = model_config
+
+    def __getitem__(self, index):
+        line = self.questions[index]
+        image_file = line["image"]
+        qs = line["text"]
+        
+        if self.model_config.mm_use_im_start_end:
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+        else:
+            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+        conv = conv_templates[args.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
+                
+        if hasattr(self.model_config, 'image_size_aux'):
+            if not hasattr(self.image_processor, 'image_size_raw'):
+                self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
+            self.image_processor.crop_size['height'] = self.model_config.image_size_aux
+            self.image_processor.crop_size['width'] = self.model_config.image_size_aux
+            self.image_processor.size['shortest_edge'] = self.model_config.image_size_aux
+        
+        image_tensor = process_images([image], self.image_processor, self.model_config)[0]
+
+        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
+        
+        image_grid = getattr(self.model_config, 'image_grid', 1)
+        if hasattr(self.model_config, 'image_size_aux'):
+            raw_shape = [self.image_processor.image_size_raw['height'] * image_grid, 
+                         self.image_processor.image_size_raw['width'] * image_grid]
+            image_tensor_aux = image_tensor
+            image_tensor = torch.nn.functional.interpolate(image_tensor[None], 
+                                                           size=raw_shape, 
+                                                           mode='bilinear', 
+                                                           align_corners=False)[0]
+        else:
+            image_tensor_aux = []
+
+        if image_grid >= 2:            
+            raw_image = image_tensor.reshape(3, 
+                                             image_grid,
+                                             self.image_processor.image_size_raw['height'],
+                                             image_grid,
+                                             self.image_processor.image_size_raw['width'])
+            raw_image = raw_image.permute(1, 3, 0, 2, 4)
+            raw_image = raw_image.reshape(-1, 3,
+                                          self.image_processor.image_size_raw['height'],
+                                          self.image_processor.image_size_raw['width'])
+            
+            if getattr(self.model_config, 'image_global', False):
+                global_image = image_tensor
+                if len(global_image.shape) == 3:
+                    global_image = global_image[None]
+                global_image = torch.nn.functional.interpolate(global_image, 
+                                                        size=[self.image_processor.image_size_raw['height'],
+                                                              self.image_processor.image_size_raw['width']], 
+                                                        mode='bilinear', 
+                                                        align_corners=False)
+                # [image_crops, image_global]
+                raw_image = torch.cat([raw_image, global_image], dim=0)
+            image_tensor = raw_image.contiguous()
+
+        return input_ids, image_tensor, image_tensor_aux
+    
+    def __len__(self):
+        return len(self.questions)
+
+
+# DataLoader
+def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
+    assert batch_size == 1, "batch_size must be 1"
+    dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
+    data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
+    return data_loader
+
+
+def eval_model(args):
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name, load_8bit=args.load_8bit)
+
+    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
+    ans_file = open(answers_file, "w")
+
+    if 'plain' in args.conv_mode and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
+        args.conv_mode = args.conv_mode + '_mmtag'
+        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
+
+    data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)
+
+    for (input_ids, image_tensor, image_tensor_aux), line in tqdm(zip(data_loader, questions), total=len(questions)):
+        idx = line["question_id"]
+        cur_prompt = line["text"]
+        
+        input_ids = input_ids.to(device=model.device, non_blocking=True)
+        if hasattr(model, "update_prompt"):
+            model.update_prompt([[cur_prompt]])
+
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=image_tensor.to(dtype=model.dtype, device=model.device, non_blocking=True),
+                images_aux=image_tensor_aux.to(dtype=model.dtype, device=model.device, non_blocking=True) if len(image_tensor_aux)>0 else None,
+                do_sample=True if args.temperature > 0 else False,
+                temperature=args.temperature,
+                top_p=args.top_p,
+                num_beams=args.num_beams,
+                max_new_tokens=args.max_new_tokens,
+                bos_token_id=tokenizer.bos_token_id,  # Begin of sequence token
+                eos_token_id=tokenizer.eos_token_id,  # End of sequence token
+                pad_token_id=tokenizer.pad_token_id,  # Pad token
+                use_cache=True)
+
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "prompt": cur_prompt,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "metadata": {}}) + "\n")
+        # ans_file.flush()
+    ans_file.close()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="llava_v1")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    parser.add_argument("--top_p", type=float, default=None)
+    parser.add_argument("--num_beams", type=int, default=1)
+    parser.add_argument('--load_8bit', type=bool, default=False)
+    parser.add_argument("--max_new_tokens", type=int, default=128)
+    args = parser.parse_args()
+
+    eval_model(args)
diff --git a/minigemini/eval/model_vqa_mmbench.py b/minigemini/eval/model_vqa_mmbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1e6d31c90628c59991070d4bd2930269c4a9ef2
--- /dev/null
+++ b/minigemini/eval/model_vqa_mmbench.py
@@ -0,0 +1,212 @@
+import argparse
+import torch
+import os
+import json
+import pandas as pd
+from tqdm import tqdm
+import shortuuid
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path
+
+from PIL import Image
+import math
+
+
+all_options = ['A', 'B', 'C', 'D']
+
+
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)  # integer division
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+
+
+def is_none(value):
+    if value is None:
+        return True
+    if type(value) is float and math.isnan(value):
+        return True
+    if type(value) is str and value.lower() == 'nan':
+        return True
+    if type(value) is str and value.lower() == 'none':
+        return True
+    return False
+
+def get_options(row, options):
+    parsed_options = []
+    for option in options:
+        option_value = row[option]
+        if is_none(option_value):
+            break
+        parsed_options.append(option_value)
+    return parsed_options
+
+
+def eval_model(args):
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+
+    questions = pd.read_table(os.path.expanduser(args.question_file))
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
+    ans_file = open(answers_file, "w")
+
+    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
+        args.conv_mode = args.conv_mode + '_mmtag'
+        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')
+
+    for index, row in tqdm(questions.iterrows(), total=len(questions)):
+        options = get_options(row, all_options)
+        cur_option_char = all_options[:len(options)]
+
+        if args.all_rounds:
+            num_rounds = len(options)
+        else:
+            num_rounds = 1
+
+        for round_idx in range(num_rounds):
+            idx = row['index']
+            question = row['question']
+            hint = row['hint']
+            image = load_image_from_base64(row['image'])
+            if not is_none(hint):
+                question = hint + '\n' + question
+            for option_char, option in zip(all_options[:len(options)], options):
+                question = question + '\n' + option_char + '. ' + option
+            qs = cur_prompt = question
+            
+            if hasattr(model, "update_prompt"):
+                model.update_prompt([[cur_prompt]])
+            
+            if model.config.mm_use_im_start_end:
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+            if args.single_pred_prompt:
+                if args.lang == 'cn':
+                    qs = qs + '\n' + "请直接回答选项字母。"
+                else:
+                    qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
+
+            conv = conv_templates[args.conv_mode].copy()
+            conv.append_message(conv.roles[0], qs)
+            conv.append_message(conv.roles[1], None)
+            prompt = conv.get_prompt()
+
+            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+
+            if hasattr(model.config, 'image_size_aux'):
+                if not hasattr(image_processor, 'image_size_raw'):
+                    image_processor.image_size_raw = image_processor.crop_size.copy()
+                image_processor.crop_size['height'] = model.config.image_size_aux
+                image_processor.crop_size['width'] = model.config.image_size_aux
+                image_processor.size['shortest_edge'] = model.config.image_size_aux
+
+            image_tensor = process_images([image], image_processor, model.config)[0]
+            image_grid = getattr(model.config, 'image_grid', 1)
+            if hasattr(model.config, 'image_size_aux'):
+                raw_shape = [image_processor.image_size_raw['height'] * image_grid, 
+                            image_processor.image_size_raw['width'] * image_grid]
+                image_tensor_aux = image_tensor
+                image_tensor = torch.nn.functional.interpolate(image_tensor[None], 
+                                                            size=raw_shape, 
+                                                            mode='bilinear', 
+                                                            align_corners=False)[0]
+            else:
+                image_tensor_aux = []
+
+            if image_grid >= 2:            
+                raw_image = image_tensor.reshape(3, 
+                                                image_grid,
+                                                image_processor.image_size_raw['height'],
+                                                image_grid,
+                                                image_processor.image_size_raw['width'])
+                raw_image = raw_image.permute(1, 3, 0, 2, 4)
+                raw_image = raw_image.reshape(-1, 3,
+                                            image_processor.image_size_raw['height'],
+                                            image_processor.image_size_raw['width'])
+                
+                if getattr(model.config, 'image_global', False):
+                    global_image = image_tensor
+                    if len(global_image.shape) == 3:
+                        global_image = global_image[None]
+                    global_image = torch.nn.functional.interpolate(global_image, 
+                                                            size=[image_processor.image_size_raw['height'],
+                                                                image_processor.image_size_raw['width']], 
+                                                            mode='bilinear', 
+                                                            align_corners=False)
+                    # [image_crops, image_global]
+                    raw_image = torch.cat([raw_image, global_image], dim=0)
+                image_tensor = raw_image.contiguous()
+            
+            images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
+            images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
+
+            with torch.inference_mode():
+                output_ids = model.generate(
+                    input_ids,
+                    images=images,
+                    images_aux=images_aux,
+                    do_sample=True if args.temperature > 0 else False,
+                    temperature=args.temperature,
+                    top_p=args.top_p,
+                    num_beams=args.num_beams,
+                    # no_repeat_ngram_size=3,
+                    max_new_tokens=1024,
+                    bos_token_id=tokenizer.bos_token_id,  # Begin of sequence token
+                    eos_token_id=tokenizer.eos_token_id,  # End of sequence token
+                    pad_token_id=tokenizer.pad_token_id,  # Pad token
+                    use_cache=True)
+
+            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+            ans_id = shortuuid.uuid()
+            ans_file.write(json.dumps({"question_id": idx,
+                                    "round_id": round_idx,
+                                    "prompt": cur_prompt,
+                                    "text": outputs,
+                                    "options": options,
+                                    "option_char": cur_option_char,
+                                    "answer_id": ans_id,
+                                    "model_id": model_name,
+                                    "metadata": {}}) + "\n")
+            ans_file.flush()
+
+            # rotate options
+            options = options[1:] + options[:1]
+            cur_option_char = cur_option_char[1:] + cur_option_char[:1]
+    ans_file.close()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    parser.add_argument("--model-base", type=str, default=None)
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="llava_v1")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    parser.add_argument("--top_p", type=float, default=None)
+    parser.add_argument("--num_beams", type=int, default=1)
+    parser.add_argument("--all-rounds", action="store_true")
+    parser.add_argument("--single-pred-prompt", action="store_true")
+    parser.add_argument("--lang", type=str, default="en")
+    args = parser.parse_args()
+
+    eval_model(args)
diff --git a/minigemini/eval/model_vqa_qbench.py b/minigemini/eval/model_vqa_qbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..defc48cc5d4dcd9eb38bf10fb57de0a18133b29e
--- /dev/null
+++ b/minigemini/eval/model_vqa_qbench.py
@@ -0,0 +1,122 @@
+import argparse
+import torch
+from tqdm import tqdm
+import json
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+
+from PIL import Image
+
+import requests
+from PIL import Image
+from io import BytesIO
+
+
+def load_image(image_file):
+    """Return the image at an http(s) URL or a local path, converted to RGB."""
+    if image_file.startswith(('http://', 'https://')):  # a bare 'http' prefix also matched local paths
+        response = requests.get(image_file)
+        response.raise_for_status()  # fail on HTTP errors instead of decoding an error page
+        image_file = BytesIO(response.content)
+    return Image.open(image_file).convert('RGB')
+
+
+def eval_model(args):
+    """Answer each Q-Bench question with the model and append it to args.answers_file.
+
+    Raises NotImplementedError when args.lang is neither "en" nor "zh".
+    """
+    import os  # local import: this module has no file-level `import os`
+    disable_torch_init()
+
+    model_name = get_model_name_from_path(args.model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, True)
+
+    # The template depends only on the model name: infer it (and warn) once, not per question.
+    if 'llama-2' in model_name.lower():
+        conv_mode = "llava_llama_2"
+    elif "v1" in model_name.lower():
+        conv_mode = "llava_v1"
+    elif "mpt" in model_name.lower():
+        conv_mode = "mpt"
+    else:
+        conv_mode = "llava_v0"
+
+    if args.conv_mode is not None and conv_mode != args.conv_mode:
+        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
+    else:
+        args.conv_mode = conv_mode
+
+    with open(args.questions_file) as f:
+        llvqa_data = json.load(f)
+
+    for llddata in tqdm(llvqa_data):
+        filename = llddata["img_path"]
+        if args.lang == "en":
+            message = llddata["question"] + "\nChoose between one of the options as follows:\n"
+        elif args.lang == "zh":
+            message = llddata["question"] + "\n在下列选项中选择一个:\n"
+        else:
+            raise NotImplementedError("Q-Bench does not support languages other than English (en) and Chinese (zh) yet. Contact us (https://github.com/VQAssessment/Q-Bench/) to convert  Q-Bench into more languages.")
+        for choice, ans in zip(["A.", "B.", "C.", "D."], llddata["candidates"]):
+            message += f"{choice} {ans}\n"
+        qs = message
+
+        if model.config.mm_use_im_start_end:
+            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+        else:
+            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+
+        conv = conv_templates[args.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        # os.path.join adds the separator that plain concatenation lost when --image-folder has no trailing slash.
+        image = load_image(os.path.join(args.image_folder, filename))
+        image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
+
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=image_tensor,
+                num_beams=1,
+                do_sample=False,
+                temperature=0,
+                max_new_tokens=1024,
+                use_cache=True,
+                stopping_criteria=[stopping_criteria])
+
+        input_token_len = input_ids.shape[1]
+        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
+        if n_diff_input_output > 0:
+            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
+        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
+        outputs = outputs.strip()
+        if outputs.endswith(stop_str):
+            outputs = outputs[:-len(stop_str)]
+        outputs = outputs.strip()
+        llddata["response"] = outputs
+        with open(args.answers_file, "a") as wf:
+            json.dump(llddata, wf)
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the options, then run eval_model on them.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model-path", type=str, default="llava-v1.5")
+    cli.add_argument("--model-base", type=str, default=None)
+    cli.add_argument("--image-folder", type=str, default="./playground/data/qbench/images_llvisionqa")
+    cli.add_argument("--questions-file", type=str, default="./playground/data/qbench/llvisionqa_dev.json")
+    cli.add_argument("--answers-file", type=str, default="answer.jsonl")
+    cli.add_argument("--conv-mode", type=str, default="llava_v1")
+    cli.add_argument("--lang", type=str, default="en")
+
+    eval_model(cli.parse_args())
diff --git a/minigemini/eval/model_vqa_science.py b/minigemini/eval/model_vqa_science.py
new file mode 100644
index 0000000000000000000000000000000000000000..65faf5f007b3160bc897173a08eee47c7addf73a
--- /dev/null
+++ b/minigemini/eval/model_vqa_science.py
@@ -0,0 +1,162 @@
+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+
+from PIL import Image
+import math
+
+def split_list(lst, n):
+    """Split a list into at most n contiguous chunks of (roughly) equal size"""
+    chunk_size = max(1, math.ceil(len(lst) / n))  # ceiling division; never 0, which range() would reject
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)  # may hold fewer than n chunks when lst is short
+    return [] if len(chunks) <= k < n else chunks[k]  # valid but empty shard -> []; k >= n still raises IndexError
+
+
+def eval_model(args):
+    """Generate an answer for every question in args.question_file and write them to args.answers_file as JSON lines."""
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+
+    with open(os.path.expanduser(args.question_file), "r") as f:
+        questions = json.load(f)
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    # dirname is "" for a bare file name (the default); os.makedirs("") raises, so fall back to ".".
+    os.makedirs(os.path.dirname(answers_file) or ".", exist_ok=True)
+    ans_file = open(answers_file, "w")
+
+    for i, line in enumerate(tqdm(questions)):
+        idx = line["id"]
+        question = line['conversations'][0]
+        qs = question['value'].replace('<image>', '').strip()
+        cur_prompt = qs
+
+        if 'image' in line:
+            image_file = line["image"]
+            image = Image.open(os.path.join(args.image_folder, image_file))
+            # Switch the processor to the auxiliary resolution, keeping the original crop size in image_size_raw.
+            if hasattr(model.config, 'image_size_aux'):
+                if not hasattr(image_processor, 'image_size_raw'):
+                    image_processor.image_size_raw = image_processor.crop_size.copy()
+                image_processor.crop_size['height'] = model.config.image_size_aux
+                image_processor.crop_size['width'] = model.config.image_size_aux
+                image_processor.size['shortest_edge'] = model.config.image_size_aux
+            image_tensor = process_images([image], image_processor, model.config)[0]
+
+            image_grid = getattr(model.config, 'image_grid', 1)
+            if hasattr(model.config, 'image_size_aux'):
+                raw_shape = [image_processor.image_size_raw['height'] * image_grid,
+                            image_processor.image_size_raw['width'] * image_grid]
+                image_tensor_aux = image_tensor
+                image_tensor = torch.nn.functional.interpolate(image_tensor[None],
+                                                            size=raw_shape,
+                                                            mode='bilinear',
+                                                            align_corners=False)[0]
+            else:
+                image_tensor_aux = []
+
+            if image_grid >= 2:
+                raw_image = image_tensor.reshape(3,
+                                                image_grid,
+                                                image_processor.image_size_raw['height'],
+                                                image_grid,
+                                                image_processor.image_size_raw['width'])
+                raw_image = raw_image.permute(1, 3, 0, 2, 4)
+                raw_image = raw_image.reshape(-1, 3,
+                                            image_processor.image_size_raw['height'],
+                                            image_processor.image_size_raw['width'])
+
+                if getattr(model.config, 'image_global', False):
+                    global_image = image_tensor
+                    if len(global_image.shape) == 3:
+                        global_image = global_image[None]
+                    global_image = torch.nn.functional.interpolate(global_image,
+                                                            size=[image_processor.image_size_raw['height'],
+                                                                image_processor.image_size_raw['width']],
+                                                            mode='bilinear',
+                                                            align_corners=False)
+                    # [image_crops, image_global]
+                    raw_image = torch.cat([raw_image, global_image], dim=0)
+                image_tensor = raw_image.contiguous()
+
+            images = image_tensor[None].to(dtype=model.dtype, device='cuda', non_blocking=True)
+            images_aux = image_tensor_aux[None].to(dtype=model.dtype, device='cuda', non_blocking=True) if len(image_tensor_aux)>0 else None
+            if getattr(model.config, 'mm_use_im_start_end', False):
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+            cur_prompt = '<image>' + '\n' + cur_prompt
+        else:
+            images = None
+            images_aux = None
+
+        if args.single_pred_prompt:
+            qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
+            cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."
+
+        conv = conv_templates[args.conv_mode].copy()
+        conv.append_message(conv.roles[0], qs)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+        if hasattr(model, "update_prompt"):
+            model.update_prompt([[cur_prompt]])
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=images,
+                images_aux=images_aux,
+                do_sample=True if args.temperature > 0 else False,
+                temperature=args.temperature,
+                max_new_tokens=1024,
+                bos_token_id=tokenizer.bos_token_id,  # Begin of sequence token
+                eos_token_id=tokenizer.eos_token_id,  # End of sequence token
+                pad_token_id=tokenizer.pad_token_id,  # Pad token
+                use_cache=True,
+            )
+
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({"question_id": idx,
+                                   "prompt": cur_prompt,
+                                   "text": outputs,
+                                   "answer_id": ans_id,
+                                   "model_id": model_name,
+                                   "metadata": {}}) + "\n")
+        ans_file.flush()
+    ans_file.close()
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the options, then run eval_model on them.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    cli.add_argument("--model-base", type=str, default=None)
+    cli.add_argument("--image-folder", type=str, default="")
+    cli.add_argument("--question-file", type=str, default="tables/question.json")
+    cli.add_argument("--answers-file", type=str, default="answer.jsonl")
+    cli.add_argument("--conv-mode", type=str, default="llava_v0")
+    cli.add_argument("--num-chunks", type=int, default=1)
+    cli.add_argument("--chunk-idx", type=int, default=0)
+    cli.add_argument("--temperature", type=float, default=0.2)
+    cli.add_argument("--answer-prompter", action="store_true")
+    cli.add_argument("--single-pred-prompt", action="store_true")
+
+    eval_model(cli.parse_args())
\ No newline at end of file
diff --git a/minigemini/eval/qa_baseline_gpt35.py b/minigemini/eval/qa_baseline_gpt35.py
new file mode 100644
index 0000000000000000000000000000000000000000..babab6e12b4bb8cfa74a7edfa5e56cd1b3e2bf6c
--- /dev/null
+++ b/minigemini/eval/qa_baseline_gpt35.py
@@ -0,0 +1,74 @@
+"""Generate answers with GPT-3.5"""
+# Note: you need to be using OpenAI Python v0.27.0 for the code below to work
+import argparse
+import json
+import os
+import time
+import concurrent.futures
+
+import openai
+import tqdm
+import shortuuid
+
+MODEL = 'gpt-3.5-turbo'
+MODEL_ID = 'gpt-3.5-turbo:20230327'
+
+def get_answer(question_id: int, question: str, max_tokens: int):
+    """Ask the chat model one question, retrying up to three times.
+
+    The returned dict's 'text' is the reply, or '#ERROR#' if every attempt raised.
+    """
+    record = {
+        'answer_id': shortuuid.uuid(),
+        'question_id': question_id,
+        'model_id': MODEL_ID,
+    }
+    messages = [
+        {'role': 'system', 'content': 'You are a helpful assistant.'},
+        {'role': 'user', 'content': question},
+    ]
+    attempts_left = 3
+    while attempts_left > 0:
+        attempts_left -= 1
+        try:
+            response = openai.ChatCompletion.create(model=MODEL, messages=messages, max_tokens=max_tokens)
+            record['text'] = response['choices'][0]['message']['content']
+            return record
+        except Exception as e:
+            print('[ERROR]', e)
+            record['text'] = '#ERROR#'
+            time.sleep(1)
+    return record
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
+    parser.add_argument('-q', '--question')
+    parser.add_argument('-o', '--output')
+    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
+    args = parser.parse_args()
+
+    # Map question_id -> question text from the JSON-lines question file.
+    questions_dict = {}
+    with open(os.path.expanduser(args.question)) as f:
+        for line in f:
+            # Lines keep their trailing newline, so a blank line is "\n", not "";
+            # strip before testing or json.loads would fail on it.
+            if not line.strip():
+                continue
+            q = json.loads(line)
+            questions_dict[q['question_id']] = q['text']
+
+    answers = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
+        futures = [executor.submit(get_answer, qid, question, args.max_tokens)
+                   for qid, question in questions_dict.items()]
+        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
+            answers.append(future.result())
+
+    # as_completed yields in finish order; restore question order for the output.
+    answers.sort(key=lambda x: x['question_id'])
+
+    with open(os.path.expanduser(args.output), 'w') as f:
+        table = [json.dumps(ans) for ans in answers]
+        f.write('\n'.join(table))
diff --git a/minigemini/eval/run_llava.py b/minigemini/eval/run_llava.py
new file mode 100644
index 0000000000000000000000000000000000000000..7653f051d67059fef055319a22eaee083ab20e58
--- /dev/null
+++ b/minigemini/eval/run_llava.py
@@ -0,0 +1,143 @@
+import argparse
+import torch
+
+from minigemini.constants import (
+    IMAGE_TOKEN_INDEX,
+    DEFAULT_IMAGE_TOKEN,
+    DEFAULT_IM_START_TOKEN,
+    DEFAULT_IM_END_TOKEN,
+    IMAGE_PLACEHOLDER,
+)
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import (
+    process_images,
+    tokenizer_image_token,
+    get_model_name_from_path,
+)
+
+from PIL import Image
+
+import requests
+from PIL import Image
+from io import BytesIO
+import re
+
+
+def image_parser(args):
+    """Split args.image_file on args.sep and return the resulting list."""
+    return args.image_file.split(args.sep)
+
+
+def load_image(image_file):
+    """Return the image at an http(s) URL or a local path, converted to RGB."""
+    if image_file.startswith(("http://", "https://")):  # a bare "http" prefix also matched local paths
+        response = requests.get(image_file)
+        response.raise_for_status()  # fail on HTTP errors instead of decoding an error page
+        image_file = BytesIO(response.content)
+    return Image.open(image_file).convert("RGB")
+
+
+def load_images(image_files):
+    """Load every entry of image_files with load_image.
+
+    Returns the loaded images as a list, in input order.
+    """
+    return [load_image(image_file) for image_file in image_files]
+
+
+def eval_model(args):
+    """Run one query about the given image(s) through the model and print the reply.
+
+    args supplies model_path, model_base, query, image_file, sep, conv_mode,
+    temperature, top_p, num_beams and max_new_tokens. args.conv_mode is filled
+    in with the mode inferred from the model name when it is None.
+    """
+    # Model
+    disable_torch_init()
+
+    model_name = get_model_name_from_path(args.model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(
+        args.model_path, args.model_base, model_name
+    )
+
+    # Pick the image marker, then substitute it for the placeholder or prepend it.
+    if model.config.mm_use_im_start_end:
+        image_marker = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
+    else:
+        image_marker = DEFAULT_IMAGE_TOKEN
+    if IMAGE_PLACEHOLDER in args.query:
+        qs = re.sub(IMAGE_PLACEHOLDER, image_marker, args.query)
+    else:
+        qs = image_marker + "\n" + args.query
+
+    # The first substring found in the lowercased model name decides the mode.
+    lowered_name = model_name.lower()
+    name_to_mode = (
+        ("llama-2", "llava_llama_2"),
+        ("mistral", "mistral_instruct"),
+        ("v1.6-34b", "chatml_direct"),
+        ("v1", "llava_v1"),
+        ("mpt", "mpt"),
+    )
+    conv_mode = next((mode for key, mode in name_to_mode if key in lowered_name), "llava_v0")
+
+    # An explicit --conv-mode wins over the inferred one; a mismatch only warns.
+    if args.conv_mode is None or conv_mode == args.conv_mode:
+        args.conv_mode = conv_mode
+    else:
+        print(
+            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
+                conv_mode, args.conv_mode, args.conv_mode
+            )
+        )
+
+    # Append the query under roles[0] and None under roles[1], then render the prompt.
+    conv = conv_templates[args.conv_mode].copy()
+    conv.append_message(conv.roles[0], qs)
+    conv.append_message(conv.roles[1], None)
+    prompt = conv.get_prompt()
+
+    # Load every image named in --image-file (split on --sep) and preprocess them together.
+    images = load_images(image_parser(args))
+    images_tensor = process_images(images, image_processor, model.config)
+    images_tensor = images_tensor.to(model.device, dtype=torch.float16)
+
+    # Tokenize the prompt and add a leading batch dimension.
+    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
+    input_ids = input_ids.unsqueeze(0).cuda()
+
+    # Sample only when the temperature is positive.
+    generation_kwargs = dict(
+        images=images_tensor,
+        do_sample=args.temperature > 0,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        num_beams=args.num_beams,
+        max_new_tokens=args.max_new_tokens,
+        use_cache=True,
+    )
+    with torch.inference_mode():
+        output_ids = model.generate(input_ids, **generation_kwargs)
+
+    # Decode the first sequence of the batch and print it.
+    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+    print(outputs)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the options, then run eval_model on them.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
+    cli.add_argument("--model-base", type=str, default=None)
+    cli.add_argument("--image-file", type=str, required=True)
+    cli.add_argument("--query", type=str, required=True)
+    cli.add_argument("--conv-mode", type=str, default=None)
+    cli.add_argument("--sep", type=str, default=",")
+    cli.add_argument("--temperature", type=float, default=0.2)
+    cli.add_argument("--top_p", type=float, default=None)
+    cli.add_argument("--num_beams", type=int, default=1)
+    cli.add_argument("--max_new_tokens", type=int, default=512)
+
+    eval_model(cli.parse_args())
\ No newline at end of file
diff --git a/minigemini/eval/summarize_gpt_review.py b/minigemini/eval/summarize_gpt_review.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f796a3880341739677a5fe3bfbcc90515a0f324
--- /dev/null
+++ b/minigemini/eval/summarize_gpt_review.py
@@ -0,0 +1,60 @@
+import json
+import os
+from collections import defaultdict
+
+import numpy as np
+
+import argparse
+
+def parse_args():
+    """Parse the review-summary command line (sys.argv) into a namespace."""
+    cli = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
+    for flags, default in ((('-d', '--dir'), None), (('-v', '--version'), None)):
+        cli.add_argument(*flags, default=default)
+    for flags, default in ((('-s', '--select'), None), (('-f', '--files'), []), (('-i', '--ignore'), [])):
+        cli.add_argument(*flags, nargs='*', default=default)
+    return cli.parse_args()
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    # --ignore values arrive as strings; convert so they match review['question_id'].
+    args.ignore = [int(x) for x in args.ignore]
+
+    if len(args.files) > 0:
+        review_files = args.files
+    else:
+        # --dir may be omitted (None): os.listdir(None) lists the current directory,
+        # so the 'review' substring test must not be applied to None.
+        dir_name = args.dir or ''
+        review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (x.startswith(('gpt4_text', 'reviews_', 'review_')) or 'review' in dir_name)]
+
+    for review_file in sorted(review_files):
+        config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
+        if args.select is not None and any(x not in config for x in args.select):
+            continue
+        version = '0613' if '0613' in config else '0314'
+        if args.version is not None and args.version != version:
+            continue
+        scores = defaultdict(list)
+        print(config)
+        with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
+            for review_str in f:
+                review = json.loads(review_str)
+                if review['question_id'] in args.ignore:
+                    continue
+                if 'category' in review:
+                    scores[review['category']].append(review['tuple'])
+                    scores['all'].append(review['tuple'])
+                else:
+                    if 'tuple' in review:
+                        scores['all'].append(review['tuple'])
+                    else:
+                        scores['all'].append(review['score'])
+        for k, v in sorted(scores.items()):
+            stats = np.asarray(v).mean(0).tolist()
+            stats = [round(x, 3) for x in stats]
+            # Columns: second/first mean score as a percentage, then each mean score times 10.
+            print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
+        print('=================================')
diff --git a/minigemini/mm_utils.py b/minigemini/mm_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b7db162fe85ec5c98c73739a1d7dc7fc51444fb
--- /dev/null
+++ b/minigemini/mm_utils.py
@@ -0,0 +1,105 @@
+from PIL import Image
+from io import BytesIO
+import base64
+
+import torch
+from transformers import StoppingCriteria
+from minigemini.constants import IMAGE_TOKEN_INDEX
+
+
+def load_image_from_base64(image):  # base64-decode `image` and open the resulting bytes with PIL
+    return Image.open(BytesIO(base64.b64decode(image)))
+
+
+def expand2square(pil_img, background_color):
+    """Pad a PIL image to a centered square canvas filled with background_color."""
+    width, height = pil_img.size
+    if width == height:
+        return pil_img  # already square: the input object itself is returned
+    side = max(width, height)
+    # Centre the image along its shorter axis; the offset along the longer axis is 0.
+    left = (side - width) // 2
+    top = (side - height) // 2
+    canvas = Image.new(pil_img.mode, (side, side), background_color)
+    canvas.paste(pil_img, (left, top))
+    return canvas
+
+
+def process_images(images, image_processor, model_cfg):
+    """Run image_processor over images and return their 'pixel_values'.
+
+    With model_cfg.image_aspect_ratio == 'pad' each image is first padded to a
+    square filled with the processor's mean colour; the results are stacked
+    only if they all share one shape (otherwise a list is returned).
+    """
+    if getattr(model_cfg, "image_aspect_ratio", None) != 'pad':
+        return image_processor(images, return_tensors='pt')['pixel_values']
+    fill = tuple(int(x*255) for x in image_processor.image_mean)
+    padded = [image_processor.preprocess(expand2square(im.convert('RGB'), fill), return_tensors='pt')['pixel_values'][0] for im in images]
+    same_shape = all(t.shape == padded[0].shape for t in padded)
+    return torch.stack(padded, dim=0) if same_shape else padded
+
+
+def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
+    """Tokenize prompt, putting image_token_index wherever it contains '<image>'.
+
+    If the first text piece starts with tokenizer.bos_token_id, that id is kept
+    once at the front and the first token of every piece is dropped. Returns a
+    list, or a torch.long tensor for return_tensors='pt' (else ValueError).
+    """
+    pieces = [tokenizer(text).input_ids for text in prompt.split('<image>')]
+    first = pieces[0]
+    offset = 1 if len(first) > 0 and first[0] == tokenizer.bos_token_id else 0
+    input_ids = [first[0]] if offset else []
+    for position, piece in enumerate(pieces):
+        if position > 0:
+            input_ids.append(image_token_index)
+        input_ids.extend(piece[offset:])
+    if return_tensors is None:
+        return input_ids
+    if return_tensors == 'pt':
+        return torch.tensor(input_ids, dtype=torch.long)
+    raise ValueError(f'Unsupported tensor type: {return_tensors}')
+
+
+def get_model_name_from_path(model_path):
+    """Return the last path component; 'checkpoint-*' dirs are prefixed with '<parent>_'."""
+    model_paths = model_path.strip("/").split("/")
+    # A bare 'checkpoint-N' has no parent component, so fall back to the name itself.
+    if model_paths[-1].startswith('checkpoint-') and len(model_paths) > 1:
+        return model_paths[-2] + "_" + model_paths[-1]
+    return model_paths[-1]
+
+class KeywordsStoppingCriteria(StoppingCriteria):
+    """Stop generation once every sequence in the batch shows a keyword in its tail.
+
+    A keyword counts as seen when its token ids close the sequence, or when its text
+    appears in the decoded final tokens of the sequence.
+    """
+    def __init__(self, keywords, tokenizer, input_ids):
+        self.keywords = keywords
+        self.tokenizer = tokenizer
+        self.start_len = input_ids.shape[1]  # length of the input the criterion was built with
+        self.keyword_ids = []
+        self.max_keyword_len = 0
+        for keyword in keywords:
+            ids = tokenizer(keyword).input_ids
+            if len(ids) > 1 and ids[0] == tokenizer.bos_token_id:
+                ids = ids[1:]  # drop the leading BOS id
+            self.max_keyword_len = max(self.max_keyword_len, len(ids))
+            self.keyword_ids.append(torch.tensor(ids))
+
+    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        """Return True if the first sequence in output_ids has hit a keyword."""
+        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
+        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
+        # Exact check: the trailing token ids equal a keyword's ids.
+        if any(torch.equal(output_ids[0, -ids.shape[0]:], ids) for ids in self.keyword_ids):
+            return True
+        # Text check: a keyword occurs in the decoded last `offset` tokens.
+        tail_text = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
+        return any(keyword in tail_text for keyword in self.keywords)
+
+    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        """Return True only when call_for_batch is True for every row of output_ids."""
+        return all([self.call_for_batch(output_ids[i].unsqueeze(0), scores) for i in range(output_ids.shape[0])])
\ No newline at end of file
diff --git a/minigemini/model/__init__.py b/minigemini/model/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e94c34969117d4ce2b718e1c37c904e72263d2df
--- /dev/null
+++ b/minigemini/model/__init__.py
@@ -0,0 +1,7 @@
+from .language_model.mini_gemini_llama import MiniGeminiLlamaForCausalLM
+try:
+    from .language_model.mini_gemini_mistral import MiniGeminiMistralForCausalLM
+    from .language_model.mini_gemini_mixtral import MiniGeminiMixtralForCausalLM
+    from .language_model.mini_gemini_gemma import MiniGeminiGemmaForCausalLM
+except Exception as err:  # best-effort import; actually emit the warning instead of discarding it
+    import warnings; warnings.warn(f"New model not imported. Try to update Transformers. ({err})")
\ No newline at end of file
diff --git a/minigemini/model/builder.py b/minigemini/model/builder.py
new file mode 100755
index 0000000000000000000000000000000000000000..7af7a894ce67074e8720a7f2d969de966bf9f303
--- /dev/null
+++ b/minigemini/model/builder.py
@@ -0,0 +1,140 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+
+import os
+import warnings
+import logging
+
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
+import torch
+from minigemini.model import *
+from minigemini.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
+    """Load the tokenizer, model and image processor used for inference.
+
+    `model_name` selects the architecture through the substrings 'mini-gemini', '8x7b', '2b'
+    and 'mpt'. When `model_base` is given, `model_path` supplies `mm_projector.bin` (MiniGemini)
+    or LoRA weights (any other model) that are applied on top of the base checkpoint. Extra
+    `**kwargs` go to `from_pretrained`. Returns `(tokenizer, model, image_processor, context_len)`.
+    """
+    kwargs = {"device_map": device_map, **kwargs}
+    if device != "cuda":
+        kwargs['device_map'] = {"": device}
+
+    if load_8bit:
+        kwargs['load_in_8bit'] = True
+    elif load_4bit:
+        # Pass only `quantization_config`: recent Transformers rejects `load_in_4bit` given together with it.
+        kwargs['quantization_config'] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4'
+        )
+    else:
+        kwargs['torch_dtype'] = torch.float16
+    if use_flash_attn:
+        kwargs['attn_implementation'] = 'flash_attention_2'
+
+    # Silence Transformers while loading; the level found here is put back before returning.
+    hf_logger = logging.getLogger("transformers")
+    previous_level = hf_logger.level
+    hf_logger.setLevel(logging.ERROR)
+
+    if 'mini-gemini' in model_name.lower():
+        # Load MiniGemini model
+        if model_base is not None:
+            # this may be mm projector only
+            print('Loading MiniGemini from base model...')
+            if "8x7b" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_base)
+                model = MiniGeminiMixtralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
+            elif "2b" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_base)
+                model = MiniGeminiGemmaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+                model = MiniGeminiLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
+            # NOTE: torch.load unpickles the file, so only point this at trusted checkpoints.
+            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
+            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
+            model.load_state_dict(mm_projector_weights, strict=False)
+        else:
+            if "8x7b" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path)
+                model = MiniGeminiMixtralForCausalLM.from_pretrained(model_path, **kwargs)
+            elif "2b" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path)
+                model = MiniGeminiGemmaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+                model = MiniGeminiLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+
+    else:
+        # Load language model
+        if model_base is not None:
+            # PEFT model
+            from peft import PeftModel
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
+            print(f"Loading LoRA weights from {model_path}")
+            model = PeftModel.from_pretrained(model, model_path)
+            print("Merging weights")
+            model = model.merge_and_unload()
+            print('Convert to FP16...')
+            model.to(torch.float16)
+        else:
+            if 'mpt' in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+
+    mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+    mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+    if mm_use_im_patch_token:
+        tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+    if mm_use_im_start_end:
+        tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+    model.resize_token_embeddings(len(tokenizer))
+
+    vision_tower = model.get_vision_tower()
+    if not vision_tower.is_loaded:
+        vision_tower.load_model()
+    vision_tower.to(device=device, dtype=torch.float16)
+    image_processor = vision_tower.image_processor
+
+    if 'mini-gemini' in model_name.lower():
+        vision_tower_aux = model.get_vision_tower_aux()
+        if not vision_tower_aux.is_loaded:
+            vision_tower_aux.load_model()
+        vision_tower_aux.to(device=device, dtype=torch.float16)
+
+        # initialize attention modules
+        model.config.model_path = model_path
+        model.get_model().initialize_uni_modules(model.config, for_eval=True)
+
+        model.get_model().vlm_uni_query_projector.to(device=device)
+        model.get_model().vlm_uni_aux_projector.to(device=device)
+        model.get_model().vlm_uni_val_projector.to(device=device)
+
+    context_len = getattr(model.config, "max_sequence_length", 2048)
+    hf_logger.setLevel(previous_level)
+    return tokenizer, model, image_processor, context_len
\ No newline at end of file
diff --git a/minigemini/model/consolidate.py b/minigemini/model/consolidate.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f7b6a871385d1c7eab17deaae1b17d3d5309a62
--- /dev/null
+++ b/minigemini/model/consolidate.py
@@ -0,0 +1,30 @@
+"""
+Usage:
+python3 -m minigemini.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
+"""
+import argparse
+
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from minigemini.model import *
+from minigemini.model.utils import auto_upgrade
+
+
+def consolidate_ckpt(src_path, dst_path):
+    """Load the checkpoint at `src_path` in fp16 and save it, with its tokenizer, to `dst_path`."""
+    print("Loading model")
+    auto_upgrade(src_path)
+    model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+    tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
+    # Both loads happen before anything is written; the model is saved first.
+    for artifact in (model, tokenizer):
+        artifact.save_pretrained(dst_path)
+
+
+if __name__ == "__main__":
+    cli = argparse.ArgumentParser()
+    for flag in ("--src", "--dst"):
+        cli.add_argument(flag, type=str, required=True)
+    options = cli.parse_args()
+
+    consolidate_ckpt(options.src, options.dst)
diff --git a/minigemini/model/language_model/mini_gemini_gemma.py b/minigemini/model/language_model/mini_gemini_gemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..5780313517adf8b5888a3ca5d6394afda8d6606d
--- /dev/null
+++ b/minigemini/model/language_model/mini_gemini_gemma.py
@@ -0,0 +1,164 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+try:
+    from transformers import AutoConfig, AutoModelForCausalLM, \
+                            GemmaConfig, GemmaModel, GemmaForCausalLM
+except:
+    print("New model not imported. Try to update Transformers to 4.38.0 or later.")
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+from transformers.generation.utils import logging
+
+from ..mini_gemini_arch import MiniGeminiMetaModel, MiniGeminiMetaForCausalLM
+
+logger = logging.get_logger(__name__)
+
+class MiniGeminiConfig(GemmaConfig):
+    model_type = "mini_gemini_gemma"  # same string is passed to AutoConfig.register at the end of this file
+
+
+class MiniGeminiGemmaModel(MiniGeminiMetaModel, GemmaModel):
+    config_class = MiniGeminiConfig  # configuration class paired with this model
+
+    def __init__(self, config: GemmaConfig):
+        super().__init__(config)  # resolves to the next class in the MRO after this one
+
+class MiniGeminiGemmaForCausalLM(GemmaForCausalLM, MiniGeminiMetaForCausalLM):
+    """Gemma causal LM that additionally accepts `images` / `images_aux` inputs.
+
+    `forward` and `generate` hand the image tensors to `prepare_inputs_labels_for_multimodal`
+    (called on `self`) and then delegate to the GemmaForCausalLM implementations.
+    """
+
+    config_class = MiniGeminiConfig
+
+    def __init__(self, config):
+        # Start the init chain after GemmaForCausalLM in the MRO: the backbone and the
+        # LM head are created below rather than by GemmaForCausalLM.__init__.
+        super(GemmaForCausalLM, self).__init__(config)
+        self.model = MiniGeminiGemmaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        """Return the MiniGeminiGemmaModel backbone stored in `self.model`."""
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Run GemmaForCausalLM.forward on the given, possibly multimodal, inputs.
+
+        When `inputs_embeds` is None, `prepare_inputs_labels_for_multimodal` is called with
+        the token ids, position ids, mask, cache, labels and both image tensors; the six
+        values it returns replace the corresponding arguments before delegating.
+        """
+        if inputs_embeds is None:
+            prepared = self.prepare_inputs_labels_for_multimodal(
+                input_ids, position_ids, attention_mask, past_key_values, labels, images, images_aux
+            )
+            input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared
+
+        # Everything else, including `cache_position`, is passed through unchanged.
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """Generate from embeddings built out of `inputs` and the optional images.
+
+        `position_ids` and `attention_mask` are read from `kwargs`; a caller-supplied
+        `inputs_embeds` raises NotImplementedError. Every remaining keyword argument is
+        forwarded to the parent `generate`.
+        """
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+
+        if images is None:
+            # Text-only prompt: embed the token ids directly.
+            inputs_embeds = self.get_model().embed_tokens(inputs)
+        else:
+            prepared = self.prepare_inputs_labels_for_multimodal(
+                inputs, position_ids, attention_mask, None, None, images, images_aux
+            )
+            # Of the six results only the position ids, mask and embeddings are used.
+            _, position_ids, attention_mask, _, inputs_embeds, _ = prepared
+
+        # `inputs` itself is not forwarded; generation runs from `inputs_embeds`.
+        return super().generate(
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        """Build the parent's generation inputs, re-attaching `images` / `images_aux` when given."""
+        extras = {name: kwargs.pop(name, None) for name in ("images", "images_aux")}
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        # Only tensors that were actually supplied are written back.
+        for name, value in extras.items():
+            if value is not None:
+                model_inputs[name] = value
+        return model_inputs
+
+AutoConfig.register("mini_gemini_gemma", MiniGeminiConfig)  # model_type string -> config class
+AutoModelForCausalLM.register(MiniGeminiConfig, MiniGeminiGemmaForCausalLM)  # config class -> model class
\ No newline at end of file
diff --git a/minigemini/model/language_model/mini_gemini_llama.py b/minigemini/model/language_model/mini_gemini_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8a92930bce85ecf2ca176983ad3b34e6f42c18
--- /dev/null
+++ b/minigemini/model/language_model/mini_gemini_llama.py
@@ -0,0 +1,203 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoConfig, AutoModelForCausalLM, \
+                         LlamaConfig, LlamaModel, LlamaForCausalLM
+
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.utils import logging
+from transformers.generation.utils import GenerateOutput
+
+from minigemini.model.mini_gemini_arch import MiniGeminiMetaModel, MiniGeminiMetaForCausalLM
+from torch.nn import CrossEntropyLoss
+
+
+logger = logging.get_logger(__name__)
+
+class MiniGeminiConfig(LlamaConfig):
+    model_type = "mini_gemini"  # same string is passed to AutoConfig.register at the end of this file
+
+class MiniGeminiLlamaModel(MiniGeminiMetaModel, LlamaModel):
+    config_class = MiniGeminiConfig  # configuration class paired with this model
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)  # resolves to the next class in the MRO after this one
+
+
+class MiniGeminiLlamaForCausalLM(LlamaForCausalLM, MiniGeminiMetaForCausalLM):
+    """Llama causal LM that additionally accepts `images` / `images_aux` inputs."""
+
+    config_class = MiniGeminiConfig
+
+    def __init__(self, config):
+        # Start the init chain after LlamaForCausalLM in the MRO: the backbone and the
+        # LM head are created below rather than by LlamaForCausalLM.__init__.
+        super(LlamaForCausalLM, self).__init__(config)
+        self.model = MiniGeminiLlamaModel(config)
+        self.pretraining_tp = config.pretraining_tp
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        """Return the MiniGeminiLlamaModel backbone stored in `self.model`."""
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Compute the logits and, when `labels` is given, the shifted cross-entropy loss.
+
+        When `inputs_embeds` is None, the six values returned by
+        `prepare_inputs_labels_for_multimodal` replace the corresponding arguments.
+        `cache_position` is accepted because newer Transformers releases pass it during
+        generation (the Gemma variant already takes it); it is declared last so positional
+        callers are unaffected and is handed to the backbone only when provided.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is None:
+            prepared = self.prepare_inputs_labels_for_multimodal(
+                input_ids, position_ids, attention_mask, past_key_values, labels, images, images_aux
+            )
+            input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared
+
+        # Only pass `cache_position` on when supplied, so backbones lacking the parameter keep working.
+        backbone_kwargs = {} if cache_position is None else {"cache_position": cache_position}
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **backbone_kwargs,
+        )
+
+        hidden_states = outputs[0]
+        if self.pretraining_tp > 1:
+            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.pretraining_tp, dim=0)
+            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.pretraining_tp)]
+            logits = torch.cat(logits, dim=-1)
+        else:
+            logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """Generate from embeddings built out of `inputs` and the optional images.
+
+        `position_ids` and `attention_mask` are read from `kwargs`; a caller-supplied
+        `inputs_embeds` raises NotImplementedError. Every remaining keyword argument is
+        forwarded to the parent `generate`.
+        """
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+
+        if images is not None:
+            prepared = self.prepare_inputs_labels_for_multimodal(
+                inputs, position_ids, attention_mask, None, None, images, images_aux
+            )
+            # Of the six results only the position ids, mask and embeddings are used.
+            _, position_ids, attention_mask, _, inputs_embeds, _ = prepared
+        else:
+            inputs_embeds = self.get_model().embed_tokens(inputs)
+
+        return super().generate(
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        """Build the parent's generation inputs, re-attaching `images` / `images_aux` when given."""
+        images = kwargs.pop("images", None)
+        images_aux = kwargs.pop("images_aux", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        if images_aux is not None:
+            _inputs['images_aux'] = images_aux
+        return _inputs
+
+AutoConfig.register("mini_gemini", MiniGeminiConfig)  # model_type string -> config class
+AutoModelForCausalLM.register(MiniGeminiConfig, MiniGeminiLlamaForCausalLM)  # config class -> model class
\ No newline at end of file
diff --git a/minigemini/model/language_model/mini_gemini_mistral.py b/minigemini/model/language_model/mini_gemini_mistral.py
new file mode 100644
index 0000000000000000000000000000000000000000..8078c5998f1a0decf5dd017207ff91eaceea1496
--- /dev/null
+++ b/minigemini/model/language_model/mini_gemini_mistral.py
@@ -0,0 +1,162 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoConfig, AutoModelForCausalLM, \
+                         MistralConfig, MistralModel, MistralForCausalLM
+
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+from transformers.generation.utils import logging
+
+from ..mini_gemini_arch import MiniGeminiMetaModel, MiniGeminiMetaForCausalLM
+
+logger = logging.get_logger(__name__)
+
+class MiniGeminiConfig(MistralConfig):
+    model_type = "mini_gemini_mistral"  # same string is passed to AutoConfig.register at the end of this file
+
+
+class MiniGeminiMistralModel(MiniGeminiMetaModel, MistralModel):
+    """MistralModel combined with the MiniGeminiMetaModel mixin."""
+    config_class = MiniGeminiConfig
+
+    def __init__(self, config: MistralConfig):
+        super().__init__(config)
+
+class MiniGeminiMistralForCausalLM(MistralForCausalLM, MiniGeminiMetaForCausalLM):
+    """Mistral causal LM that additionally accepts `images` / `images_aux` inputs."""
+
+    config_class = MiniGeminiConfig
+
+    def __init__(self, config):
+        # Start the init chain after MistralForCausalLM in the MRO: the backbone and the
+        # LM head are created below rather than by MistralForCausalLM.__init__.
+        super(MistralForCausalLM, self).__init__(config)
+        self.model = MiniGeminiMistralModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        """Return the MiniGeminiMistralModel backbone stored in `self.model`."""
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        """Run MistralForCausalLM.forward on the given, possibly multimodal, inputs.
+
+        When `inputs_embeds` is None, the six values returned by
+        `prepare_inputs_labels_for_multimodal` replace the corresponding arguments.
+        `cache_position` is accepted because newer Transformers releases pass it during
+        generation (the Gemma variant already takes it); it is declared last so positional
+        callers are unaffected and is handed to the parent only when provided.
+        """
+        if inputs_embeds is None:
+            prepared = self.prepare_inputs_labels_for_multimodal(
+                input_ids, position_ids, attention_mask, past_key_values, labels, images, images_aux
+            )
+            input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared
+
+        # Only pass `cache_position` on when supplied, so parents lacking the parameter keep working.
+        extra_kwargs = {} if cache_position is None else {"cache_position": cache_position}
+
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            **extra_kwargs,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        """Generate from embeddings built out of `inputs` and the optional images.
+
+        `position_ids` and `attention_mask` are read from `kwargs`; a caller-supplied
+        `inputs_embeds` raises NotImplementedError. Every remaining keyword argument is
+        forwarded to the parent `generate`.
+        """
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+
+        if images is not None:
+            prepared = self.prepare_inputs_labels_for_multimodal(
+                inputs, position_ids, attention_mask, None, None, images, images_aux
+            )
+            # Of the six results only the position ids, mask and embeddings are used.
+            _, position_ids, attention_mask, _, inputs_embeds, _ = prepared
+        else:
+            inputs_embeds = self.get_model().embed_tokens(inputs)
+
+        return super().generate(
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        """Build the parent's generation inputs, re-attaching `images` / `images_aux` when given."""
+        images = kwargs.pop("images", None)
+        images_aux = kwargs.pop("images_aux", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        if images_aux is not None:
+            _inputs['images_aux'] = images_aux
+        return _inputs
+
+AutoConfig.register("mini_gemini_mistral", MiniGeminiConfig)  # model_type string -> config class
+AutoModelForCausalLM.register(MiniGeminiConfig, MiniGeminiMistralForCausalLM)  # config class -> model class
\ No newline at end of file
diff --git a/minigemini/model/language_model/mini_gemini_mixtral.py b/minigemini/model/language_model/mini_gemini_mixtral.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2bd1b4402acaefae6dbcb241111d7cb6e96ff76
--- /dev/null
+++ b/minigemini/model/language_model/mini_gemini_mixtral.py
@@ -0,0 +1,162 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers import AutoConfig, AutoModelForCausalLM, \
+                         MixtralConfig, MixtralModel, MixtralForCausalLM
+
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.generation.utils import GenerateOutput
+from transformers.generation.utils import logging
+
+from ..mini_gemini_arch import MiniGeminiMetaModel, MiniGeminiMetaForCausalLM
+
+# Module-level logger, created through the `logging` helper that this file
+# imports from transformers.generation.utils.
+logger = logging.get_logger(__name__)
+
+class MiniGeminiConfig(MixtralConfig):
+    """MixtralConfig subclass that only overrides the ``model_type`` string."""
+
+    # Same string that is passed to AutoConfig.register at the bottom of this module.
+    model_type = "mini_gemini_mixtral"
+
+
+class MiniGeminiMixtralModel(MiniGeminiMetaModel, MixtralModel):
+    config_class = MiniGeminiConfig
+    
+    def __init__(self, config: MixtralConfig):
+        super(MiniGeminiMixtralModel, self).__init__(config)
+        # self.max_pos_idx = 0
+
+class MiniGeminiMixtralForCausalLM(MixtralForCausalLM, MiniGeminiMetaForCausalLM):
+    config_class = MiniGeminiConfig
+
+    def __init__(self, config):
+        super(MixtralForCausalLM, self).__init__(config)
+        self.model = MiniGeminiMixtralModel(config)
+        # self.pretraining_tp = config.pretraining_tp
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_model(self):
+        return self.model
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+
+        if inputs_embeds is None:
+            (
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                inputs_embeds,
+                labels
+            ) = self.prepare_inputs_labels_for_multimodal(
+                input_ids,
+                position_ids,
+                attention_mask,
+                past_key_values,
+                labels,
+                images,
+                images_aux
+            )        
+
+        return super().forward(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        images: Optional[torch.Tensor] = None,
+        images_aux: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        position_ids = kwargs.pop("position_ids", None)
+        attention_mask = kwargs.pop("attention_mask", None)
+        if "inputs_embeds" in kwargs:
+            raise NotImplementedError("`inputs_embeds` is not supported")
+
+        if images is not None:
+            (
+                inputs,
+                position_ids,
+                attention_mask,
+                _,
+                inputs_embeds,
+                _
+            ) = self.prepare_inputs_labels_for_multimodal(
+                inputs,
+                position_ids,
+                attention_mask,
+                None,
+                None,
+                images,
+                images_aux
+            )
+        else:
+            inputs_embeds = self.get_model().embed_tokens(inputs)
+
+        return super().generate(
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            **kwargs
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
+        images = kwargs.pop("images", None)
+        images_aux = kwargs.pop("images_aux", None)
+        _inputs = super().prepare_inputs_for_generation(
+            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
+        )
+        if images is not None:
+            _inputs['images'] = images
+        if images_aux is not None:
+            _inputs['images_aux'] = images_aux
+        return _inputs
+
+AutoConfig.register("mini_gemini_mixtral", MiniGeminiConfig)
+AutoModelForCausalLM.register(MiniGeminiConfig, MiniGeminiMixtralForCausalLM)
\ No newline at end of file
diff --git a/minigemini/model/llava_arch.py b/minigemini/model/llava_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..0969ff439dc0922c7f0f9d8317211c229fdc67ab
--- /dev/null
+++ b/minigemini/model/llava_arch.py
@@ -0,0 +1,299 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+
+from abc import ABC, abstractmethod
+
+import torch
+import torch.nn as nn
+
+from .multimodal_encoder.builder import build_vision_tower
+from .multimodal_projector.builder import build_vision_projector
+
+from minigemini.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+
+
+class LlavaMetaModel:
+
+    def __init__(self, config):
+        super(LlavaMetaModel, self).__init__(config)
+
+        if hasattr(config, "mm_vision_tower"):
+            self.vision_tower = build_vision_tower(config, delay_load=True)
+            self.mm_projector = build_vision_projector(config)
+
+    def get_vision_tower(self):
+        vision_tower = getattr(self, 'vision_tower', None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+
+    def initialize_vision_modules(self, model_args, fsdp=None):
+        vision_tower = model_args.vision_tower
+        mm_vision_select_layer = model_args.mm_vision_select_layer
+        mm_vision_select_feature = model_args.mm_vision_select_feature
+        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+        self.config.mm_vision_tower = vision_tower
+
+        if self.get_vision_tower() is None:
+            vision_tower = build_vision_tower(model_args)
+
+            if fsdp is not None and len(fsdp) > 0:
+                self.vision_tower = [vision_tower]
+            else:
+                self.vision_tower = vision_tower
+        else:
+            if fsdp is not None and len(fsdp) > 0:
+                vision_tower = self.vision_tower[0]
+            else:
+                vision_tower = self.vision_tower
+            vision_tower.load_model()
+
+        self.config.use_mm_proj = True
+        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+        self.config.mm_hidden_size = vision_tower.hidden_size
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+        self.config.mm_vision_select_feature = mm_vision_select_feature
+
+        if getattr(self, 'mm_projector', None) is None:
+            self.mm_projector = build_vision_projector(self.config)
+        else:
+            # In case it is frozen by LoRA
+            for p in self.mm_projector.parameters():
+                p.requires_grad = True
+
+        if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+            def get_w(weights, keyword):
+                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
+
+            if 'model' in mm_projector_weights.keys():
+                mm_projector_weights = mm_projector_weights['model']
+                status = self.mm_projector.load_state_dict(mm_projector_weights, strict=False)
+                print('missing_keys:', status.missing_keys)
+            else:
+                status = self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'), strict=False)
+                print('missing_keys:', status.missing_keys)
+
+            # class_embedding_weights = get_w(mm_projector_weights, 'model.vision_tower.vision_tower.vision_model.embeddings')
+            # if len(class_embedding_weights) > 0:
+            #     self.vision_tower.vision_tower.vision_model.embeddings.load_state_dict(class_embedding_weights, strict=False)
+
+
+class LlavaMetaForCausalLM(ABC):
+
+    @abstractmethod
+    def get_model(self):
+        pass
+
+    def get_vision_tower(self):
+        return self.get_model().get_vision_tower()
+
+    def encode_images(self, images=None, points=None):
+        if images is not None:
+            image_features = self.get_model().get_vision_tower()(images)
+            image_features = self.get_model().mm_projector(image_features)
+        if points is not None:
+            # use pre-computed features here
+            point_features = [self.get_model().mm_projector(_point).squeeze() for _point in points]
+        return image_features
+
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, position_ids, attention_mask, past_key_values, labels, images=None, points=None
+    ):
+        vision_tower = self.get_vision_tower()
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
+                target_shape = past_key_values[-1][-1].shape[-2] + 1
+                attention_mask = torch.cat((attention_mask, torch.ones(
+                    (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device
+                )), dim=1)
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+
+        if type(images) is list or images.ndim == 5:
+            concat_images = torch.cat([image for image in images], dim=0)
+            image_features = self.encode_images(concat_images)
+            split_sizes = [image.shape[0] for image in images]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1).to(self.device) for x in image_features]
+        else:
+            image_features = self.encode_images(images).to(self.device)
+
+        # TODO: image start / end is not implemented here to support pretraining.
+        if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+            raise NotImplementedError
+
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- TODO: double check
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+        new_input_embeds = []
+        new_labels = []
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                if i < num_images:
+                    cur_image_features = image_features[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
+
+            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
+                new_input_embeds_padded.append(torch.cat((
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
+                    cur_new_embed
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+            else:
+                new_input_embeds_padded.append(torch.cat((
+                    cur_new_embed,
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+
+        if _position_ids is None:
+            position_ids = None
+
+        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
+
+    def initialize_vision_tokenizer(self, model_args, tokenizer):
+        if model_args.mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+
+        if model_args.mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+
+            if num_new_tokens > 0:
+                input_embeddings = self.get_input_embeddings().weight.data
+                output_embeddings = self.get_output_embeddings().weight.data
+
+                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+                    dim=0, keepdim=True)
+                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+                    dim=0, keepdim=True)
+
+                input_embeddings[-num_new_tokens:] = input_embeddings_avg
+                output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = True
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
+
+            if model_args.pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
+                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
+                assert num_new_tokens == 2
+                if input_embeddings.shape == embed_tokens_weight.shape:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
+                elif embed_tokens_weight.shape[0] == num_new_tokens:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                else:
+                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
+        elif model_args.mm_use_im_patch_token:
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = False
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
\ No newline at end of file
diff --git a/minigemini/model/mini_gemini_arch.py b/minigemini/model/mini_gemini_arch.py
new file mode 100644
index 0000000000000000000000000000000000000000..da566b321c48ffe5f6fb572f023be51f386062a9
--- /dev/null
+++ b/minigemini/model/mini_gemini_arch.py
@@ -0,0 +1,497 @@
+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+
+from abc import ABC, abstractmethod
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import json
+import os
+import transformers
+import safetensors
+from transformers.deepspeed import is_deepspeed_zero3_enabled
+import deepspeed
+
+from .multimodal_encoder.builder import build_vision_tower, build_vision_tower_aux
+from .multimodal_projector.builder import build_vision_projector
+
+from minigemini.constants import (IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, 
+                             DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
+
+IS_NEW_TRANSFORMERS = transformers.__version__ >= "4.34.0"
+
+class MiniGeminiMetaModel:
+
+    def __init__(self, config):
+        super(MiniGeminiMetaModel, self).__init__(config)
+
+        if hasattr(config, "mm_vision_tower"):
+            self.vision_tower = build_vision_tower(config, delay_load=True)
+            self.mm_projector = build_vision_projector(config)
+
+        if hasattr(config, "mm_vision_tower_aux"):
+            self.vision_tower_aux = build_vision_tower_aux(config, delay_load=True)
+
+    def get_vision_tower(self):
+        vision_tower = getattr(self, 'vision_tower', None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+
+    def get_vision_tower_aux(self):
+        vision_tower_aux = getattr(self, 'vision_tower_aux', None)
+        if type(vision_tower_aux) is list:
+            vision_tower_aux = vision_tower_aux[0]
+        return vision_tower_aux
+
+    def initialize_vision_modules(self, model_args, fsdp=None):
+        vision_tower = model_args.vision_tower
+        vision_tower_aux = model_args.vision_tower_aux
+        mm_vision_select_layer = model_args.mm_vision_select_layer
+        mm_vision_select_feature = model_args.mm_vision_select_feature
+        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+
+        self.config.mm_vision_tower = vision_tower
+        self.config.mm_vision_tower_aux = vision_tower_aux
+
+        if self.get_vision_tower() is None:
+            vision_tower = build_vision_tower(model_args)
+
+            if fsdp is not None and len(fsdp) > 0:
+                self.vision_tower = [vision_tower]
+            else:
+                self.vision_tower = vision_tower
+        else:
+            if fsdp is not None and len(fsdp) > 0:
+                vision_tower = self.vision_tower[0]
+            else:
+                vision_tower = self.vision_tower
+            vision_tower.load_model()
+
+        if vision_tower_aux is not None:
+            if self.get_vision_tower_aux() is None:
+                vision_tower_aux = build_vision_tower_aux(model_args)
+
+                if fsdp is not None and len(fsdp) > 0:
+                    self.vision_tower_aux = [vision_tower_aux]
+                else:
+                    self.vision_tower_aux = vision_tower_aux
+            else:
+                if fsdp is not None and len(fsdp) > 0:
+                    vision_tower_aux = self.vision_tower_aux[0]
+                else:
+                    vision_tower_aux = self.vision_tower_aux
+                vision_tower_aux.load_model()
+            self.config.mm_hidden_size_aux = vision_tower_aux.hidden_size
+
+        self.config.use_mm_proj = True
+        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
+        self.config.mm_hidden_size = vision_tower.hidden_size
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+        self.config.mm_vision_select_feature = mm_vision_select_feature
+
+        if getattr(self, 'mm_projector', None) is None:
+            self.mm_projector = build_vision_projector(self.config)
+        else:
+            # In case it is frozen by LoRA
+            for p in self.mm_projector.parameters():
+                p.requires_grad = True
+
+        if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+            def get_w(weights, keyword):
+                return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword + '.' in k}
+
+            if 'model' in mm_projector_weights.keys():
+                mm_projector_weights = mm_projector_weights['model']
+                if is_deepspeed_zero3_enabled():
+                    if len(mm_projector_weights) > 0:
+                        with deepspeed.zero.GatheredParameters(mm_projector_weights, modifier_rank=0):
+                            if torch.distributed.get_rank() == 0:
+                                self.mm_projector.load_state_dict(mm_projector_weights)
+                else:
+                    status = self.mm_projector.load_state_dict(mm_projector_weights, strict=False)
+                    print('missing_keys:', status.missing_keys)
+            else:
+                if is_deepspeed_zero3_enabled():
+                    named_parameters = get_w(mm_projector_weights, 'mm_projector')
+                    if len(named_parameters) > 0:
+                        with deepspeed.zero.GatheredParameters(named_parameters, modifier_rank=0):
+                            if torch.distributed.get_rank() == 0:
+                                self.mm_projector.load_state_dict(named_parameters)
+                else:
+                    status = self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'), strict=False)
+                    print('missing_keys:', status.missing_keys)
+            self.mm_projector = self.mm_projector.to(device=self.device)
+
+    def initialize_uni_modules(self, model_args, for_eval=False):  
+        pretrain_mm_mlp_adapter = getattr(model_args, "pretrain_mm_mlp_adapter", None)
+        self.config.image_size_aux = getattr(model_args, 'image_size_aux', 320)
+        self.config.optimize_vision_tower = getattr(model_args, 'optimize_vision_tower', False)
+        self.config.optimize_vision_tower_aux = getattr(model_args, 'optimize_vision_tower_aux', False)
+
+        self.vlm_uni_query_projector  = nn.Sequential(nn.LayerNorm(self.config.mm_hidden_size), 
+                                                      nn.Linear(self.config.mm_hidden_size, self.config.mm_hidden_size))
+        self.vlm_uni_aux_projector  = nn.Sequential(nn.LayerNorm(self.config.mm_hidden_size_aux),
+                                                    nn.Linear(self.config.mm_hidden_size_aux, self.config.mm_hidden_size))
+        self.vlm_uni_val_projector  = nn.Sequential(nn.LayerNorm(self.config.mm_hidden_size_aux),
+                                                    nn.Linear(self.config.mm_hidden_size_aux, self.config.mm_hidden_size))
+        
+        if pretrain_mm_mlp_adapter is not None:
+            projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
+        else:
+            trainable_module = ['vlm_uni', 'vision_fpn', 'vision_stages']
+            if hasattr(model_args, 'model_name_or_path'):
+                model_save_path = model_args.model_name_or_path
+            else:
+                model_save_path = model_args.model_path
+            model_idx_path = getattr(model_args, 'model_path', model_save_path)
+            if IS_NEW_TRANSFORMERS:
+                try:
+                    weight_file = json.load(open(os.path.join(model_idx_path, 'model.safetensors.index.json'), 'r'))['weight_map']
+                except:
+                    weight_file = json.load(open(os.path.join(model_idx_path, 'pytorch_model.bin.index.json'), 'r'))['weight_map']
+            else:
+                weight_file = json.load(open(os.path.join(model_idx_path, 'pytorch_model.bin.index.json'), 'r'))['weight_map']
+            model_path = set([weight_file[_key] for _key in weight_file if any([_module in _key for _module in trainable_module])])
+            projector_weights = {}
+            for _model in model_path:
+                if not IS_NEW_TRANSFORMERS:
+                    projector_weights.update(torch.load(os.path.join(model_idx_path, _model), map_location='cpu'))
+                else:
+                    with safetensors.safe_open(os.path.join(model_idx_path, _model), framework="pt", device='cpu') as f:
+                        for _key in f.keys():
+                            projector_weights.update({_key: f.get_tensor(_key)})
+            if len(projector_weights) == 0:
+                return
+
+        def get_w(weights, keyword, main_module, sub_module):
+            """Load the checkpoint entries selected by ``keyword`` into ``main_module.<sub_module>``.
+
+            Args:
+                weights: Mapping from checkpoint key to tensor.
+                keyword: Key fragment that selects entries; the text following the
+                    first ``keyword + '.'`` becomes the key passed to the sub-module.
+                main_module: Object that owns the sub-module.
+                sub_module: Attribute name of the sub-module on ``main_module``.
+
+            The call is a no-op when the sub-module is absent or no key matches.
+            """
+            target = getattr(main_module, sub_module, None)
+            if target is None:
+                return
+
+            prefix = keyword + '.'
+            # maxsplit=1 keeps the whole remainder even if the prefix text occurs again later in the key.
+            pretrain_weight = {k.split(prefix, 1)[1]: v for k, v in weights.items() if prefix in k}
+            if len(pretrain_weight) == 0:
+                return
+
+            if is_deepspeed_zero3_enabled():
+                named_parameters = [v for k, v in target.named_parameters()]
+                if len(named_parameters) > 0:
+                    # because zero3 puts placeholders in model params, this context
+                    # manager gathers (unpartitions) the params of the current layer, then loads from
+                    # the state dict and then re-partitions them again
+                    with deepspeed.zero.GatheredParameters(named_parameters, modifier_rank=0):
+                        if torch.distributed.get_rank() == 0:
+                            target.load_state_dict(pretrain_weight)
+                # Read the reference dtype outside the branch above so that it is bound
+                # even when the sub-module owns no parameters.
+                with deepspeed.zero.GatheredParameters(self.mm_projector[0].weight, modifier_rank=None):
+                    weight_type = self.mm_projector[0].weight.dtype
+            else:
+                weight_type = self.mm_projector[0].weight.dtype
+                target.load_state_dict(pretrain_weight)
+
+            # An integer projector dtype is not used for the loaded weights; float16 is
+            # used instead (presumably the projector is quantized in that case - TODO confirm).
+            if weight_type == torch.uint8 or weight_type == torch.int8 or weight_type == torch.int16:
+                weight_type = torch.float16
+
+            target.to(dtype=weight_type)
+            print(f"Loading {sub_module} weights...")
+        
+        # load pretrained weights
+        get_w(projector_weights, 'vision_tower.vision_tower', self.vision_tower, 'vision_tower')
+
+        # load pretrained weights
+        if self.config.optimize_vision_tower_aux:
+            # not optimize vision stem, just used to check
+            get_w(projector_weights, 'vision_tower_aux.vision_stem', self.vision_tower_aux, 'vision_stem')
+            get_w(projector_weights, 'vision_tower_aux.vision_stages', self.vision_tower_aux, 'vision_stages')
+        get_w(projector_weights, 'vlm_uni_query_projector', self, 'vlm_uni_query_projector')
+        get_w(projector_weights, 'vlm_uni_aux_projector', self, 'vlm_uni_aux_projector')
+        get_w(projector_weights, 'vlm_uni_val_projector', self, 'vlm_uni_val_projector')
+    
+class MiniGeminiMetaForCausalLM(ABC):
+
+    @abstractmethod
+    def get_model(self):
+        """Return the object that exposes the vision towers, projectors and token embeddings.
+
+        Concrete subclasses must override this; the base implementation returns ``None``.
+        """
+
+    def get_vision_tower(self):
+        """Return the vision tower exposed by ``get_model()``."""
+        model = self.get_model()
+        return model.get_vision_tower()
+
+    def get_vision_tower_aux(self):
+        """Return the auxiliary vision tower exposed by ``get_model()``."""
+        model = self.get_model()
+        return model.get_vision_tower_aux()
+
+    def encode_images(self, images, images_aux=None, is_video=False):
+        """Encode images into projected visual tokens.
+
+        Args:
+            images: Input for the main vision tower. When ``config.image_grid > 1``
+                the crops are indexed along dim 1 and, if ``config.image_global`` is
+                set, the last entry along that dim is treated as the global view.
+            images_aux: Optional input for the auxiliary vision tower. When given,
+                its features are fused with the main features via
+                ``unified_resampler`` before projection.
+            is_video: Accepted by the signature but not read in this method.
+
+        Returns:
+            The output of ``mm_projector`` applied to the (optionally fused) features.
+        """
+        image_grid = getattr(self.config, 'image_grid', 1)
+        # The global view is only split off inside the grid branch below, so the flag
+        # is ignored without a grid; previously that combination hit unbound locals.
+        image_global = getattr(self.config, 'image_global', False) and image_grid > 1
+        if image_grid > 1:
+            batch_size = images.shape[0]
+            if image_global:
+                global_images = images[:, -1:].flatten(0,1).contiguous()
+                grid_images = images[:, :-1].flatten(0,1).contiguous()
+                # run grid crops and global views through the tower in a single batch
+                images = torch.cat([grid_images, global_images], dim=0)
+            else:
+                images = images.flatten(0,1).contiguous()
+
+        image_features = self.get_model().get_vision_tower()(images)
+
+        if image_global:
+            # undo the concatenation above: grid crops first, global views last
+            image_feat_global = image_features[-len(global_images):]
+            image_features = image_features[:len(grid_images)]
+
+        if images_aux is not None:
+            image_aux_features_raw = self.get_model().get_vision_tower_aux()(images_aux).to(
+                dtype=image_features.dtype, device=image_features.device)
+
+            if image_global:
+                # shrink the auxiliary map by the grid factor to pair it with the global view
+                image_aux_features_global = F.interpolate(image_aux_features_raw.float(),
+                                                            scale_factor=1/image_grid,
+                                                            mode='bilinear',
+                                                            align_corners=False).to(dtype=image_aux_features_raw.dtype)
+                image_feat_global, image_aux_feat_global = self.unified_resampler(image_feat_global, image_aux_features_global)
+
+            if image_grid > 1:
+                # cut the auxiliary map into image_grid x image_grid tiles and fold the tiles into the batch dim
+                image_aux_features_raw = image_aux_features_raw.reshape(*image_aux_features_raw.shape[:2],
+                                                                        image_grid,
+                                                                        image_aux_features_raw.shape[-2]//image_grid,
+                                                                        image_grid,
+                                                                        image_aux_features_raw.shape[-1]//image_grid)
+                image_aux_features_raw = image_aux_features_raw.permute(0, 2, 4, 1, 3, 5).flatten(1,2).flatten(0,1).contiguous()
+            image_features, image_aux_features = self.unified_resampler(image_features, image_aux_features_raw)
+
+            if image_grid > 1:
+                # regroup the per-tile tokens by sample and join them into one token sequence
+                image_features = image_features.reshape(batch_size, image_grid**2, *image_features.shape[1:])
+                image_features = image_features.flatten(1,2).contiguous()
+                image_aux_features = image_aux_features.reshape(batch_size, image_grid**2, *image_aux_features.shape[1:])
+                image_aux_features = image_aux_features.flatten(1,2).contiguous()
+
+            # add global features, [global, local]
+            if image_global:
+                image_features = torch.cat([image_feat_global, image_features], dim=1)
+                image_aux_features = torch.cat([image_aux_feat_global, image_aux_features], dim=1)
+
+            # token generation
+            image_features = image_features + image_aux_features
+
+        # process image features after token generation
+        image_features = self.get_model().mm_projector(image_features)
+
+        return image_features
+
+    def unified_resampler(self, images, images_aux):
+        """Fuse auxiliary feature maps into per-token features via patch-wise attention.
+
+        Each token in ``images`` attends over the auxiliary features that fall into
+        its own spatial patch.
+
+        Args:
+            images: Token features; dim 1 is read as a square grid of
+                ``patch_num * patch_num`` tokens.
+            images_aux: 4-D auxiliary feature map whose dim 1 is moved last before
+                being cut into ``patch_num x patch_num`` patches.
+
+        Returns:
+            A tuple ``(images, embed_feat)``: the unchanged ``images`` and the
+            attended auxiliary features, one vector per token.
+        """
+        # patchwise with square images
+        patch_num = int(images.shape[1]**0.5)
+        patch_size = images_aux.shape[-1]//patch_num
+        # within patch attention
+        images_aux = images_aux.permute(0,2,3,1)
+        images_aux = images_aux.reshape(len(images_aux), patch_num, patch_size, patch_num, patch_size, images_aux.shape[-1])
+        images_aux = images_aux.permute(0,1,3,2,4,5)
+        images_aux = images_aux.reshape(len(images_aux), patch_num**2, patch_size**2, images_aux.shape[-1]).contiguous()
+
+        # token
+        model = self.get_model()
+        embed_query = model.vlm_uni_query_projector(images)
+        embed_aux = model.vlm_uni_aux_projector(images_aux)
+        embed_value = model.vlm_uni_val_projector(images_aux)
+        # scaled dot-product between each token query and the keys of its own patch
+        embed_att = embed_query[:,:,None] @ (embed_aux.transpose(-1,-2) / (embed_aux.shape[-1]**0.5))
+        # replace NaN/inf scores with finite values before the softmax
+        embed_att = embed_att.nan_to_num()
+        embed_feat = (embed_att.softmax(-1) @ embed_value).mean(2)
+
+        return images, embed_feat
+
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, position_ids, attention_mask, past_key_values, labels, images=None, images_aux=None,
+    ):        
+        """Replace image placeholder tokens with image features and rebuild the batch.
+
+        Returns a 6-tuple
+        ``(input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels)``.
+
+        * Without a vision tower, without images, or for a single-token input, the
+          token ids are passed through and ``inputs_embeds`` is ``None``.
+        * Otherwise ``input_ids`` is returned as ``None`` and ``inputs_embeds`` holds
+          the text embeddings with the image features inserted wherever
+          ``IMAGE_TOKEN_INDEX`` appeared; image positions get ``IGNORE_INDEX`` labels.
+          Sequences are cut to ``config.tokenizer_model_max_length`` (if set) and
+          padded on the side named by ``config.tokenizer_padding_side``.
+
+        ``labels``, ``attention_mask`` and ``position_ids`` are returned as ``None``
+        whenever the caller passed ``None`` for them.
+        """
+        vision_tower = self.get_vision_tower()
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            # Single new token with cached keys/values: grow the attention mask to the
+            # cached length plus one and point position_ids at the last attended position.
+            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:                
+                target_shape = past_key_values[-1][-1].shape[-2] + 1
+                attention_mask = torch.cat((attention_mask, torch.ones(
+                    (attention_mask.shape[0], target_shape - attention_mask.shape[1]),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device
+                )), dim=1)
+                position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+            return input_ids, position_ids, attention_mask, past_key_values, None, labels
+
+        image_features = self.encode_images(images, images_aux)
+
+        # TODO: image start / end is not implemented here to support pretraining.
+        if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+            raise NotImplementedError
+
+        # Let's just add dummy tensors if they do not exist,
+        # it is a headache to deal with None all the time.
+        # But it is not ideal, and if you have a better idea,
+        # please open an issue / submit a PR, thanks.
+        # The originals are kept so that None inputs can be mirrored as None outputs.
+        _labels = labels
+        _position_ids = position_ids
+        _attention_mask = attention_mask
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        else:
+            attention_mask = attention_mask.bool()
+        if position_ids is None:
+            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
+        if labels is None:
+            labels = torch.full_like(input_ids, IGNORE_INDEX)
+
+        # remove the padding using attention_mask -- TODO: double check
+        # (from here on input_ids / labels are lists of per-sample 1-D tensors)
+        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
+        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
+
+        new_input_embeds = []
+        new_labels = []
+        # index of the next unused entry of image_features, shared across the batch
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
+            if num_images == 0:
+                # Text-only sample: it still consumes one image_features entry, and a
+                # zero-length slice of it is concatenated, which adds no tokens.
+                # NOTE(review): presumably this keeps the vision branch connected to
+                # the output for such samples - confirm.
+                cur_image_features = image_features[cur_image_idx]
+                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
+                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
+                new_input_embeds.append(cur_input_embeds)
+                new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            # Sentinels at -1 and len() let the loop below slice the text segments
+            # before, between and after the image placeholders uniformly.
+            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
+            cur_input_ids_noim = []
+            cur_labels = labels[batch_idx]
+            cur_labels_noim = []
+            for i in range(len(image_token_indices) - 1):
+                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
+                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
+            # embed all text segments in one call, then split them back apart
+            split_sizes = [x.shape[0] for x in cur_labels_noim]
+            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
+            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
+            cur_new_input_embeds = []
+            cur_new_labels = []
+            
+            # NOTE(review): max_pos_id is accumulated below but never read afterwards.
+            max_pos_id = 0
+            # interleave: text segment, image features, text segment, ...
+            for i in range(num_images + 1):
+                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
+                cur_new_labels.append(cur_labels_noim[i])
+                max_pos_id += cur_input_embeds_no_im[i].shape[0]
+                if i < num_images:
+                    cur_image_features = image_features[cur_image_idx]
+                    cur_image_idx += 1
+                    cur_new_input_embeds.append(cur_image_features)
+                    # image positions are labelled IGNORE_INDEX
+                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))
+                    max_pos_id += cur_image_features.shape[0]
+            
+            # move every piece onto the device of the text embeddings before concatenating
+            cur_new_input_embeds = [x.to(device=cur_input_embeds.device) for x in cur_new_input_embeds]
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
+            cur_new_labels = torch.cat(cur_new_labels)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            new_labels.append(cur_new_labels)
+
+        # Truncate sequences to max length as image embeddings can make the sequence longer
+        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
+        if tokenizer_model_max_length is not None:
+            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
+            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
+
+        # Combine them
+        max_len = max(x.shape[0] for x in new_input_embeds)
+        batch_size = len(new_input_embeds)
+
+        # Padded outputs start as all-padding (IGNORE_INDEX labels, zero mask, zero
+        # positions) and are filled per sample in the loop below.
+        new_input_embeds_padded = []
+        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
+        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
+        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
+
+        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
+            cur_len = cur_new_embed.shape[0]
+            if getattr(self.config, 'tokenizer_padding_side', 'right') == "left":
+                # left padding: zeros first, real tokens occupy the last cur_len slots
+                new_input_embeds_padded.append(torch.cat((
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device),
+                    cur_new_embed
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, -cur_len:] = cur_new_labels
+                    attention_mask[i, -cur_len:] = True
+                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)  
+            else:
+                # right padding (default): real tokens first, zeros after
+                new_input_embeds_padded.append(torch.cat((
+                    cur_new_embed,
+                    torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
+                ), dim=0))
+                if cur_len > 0:
+                    new_labels_padded[i, :cur_len] = cur_new_labels
+                    attention_mask[i, :cur_len] = True
+                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
+                    
+        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
+
+        # Mirror the caller's None inputs: anything that came in as None goes out as None.
+        if _labels is None:
+            new_labels = None
+        else:
+            new_labels = new_labels_padded
+
+        if _attention_mask is None:
+            attention_mask = None
+        else:
+            # restore the dtype the caller used for its mask
+            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
+
+        if _position_ids is None:
+            position_ids = None
+
+        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels
+
+    def initialize_vision_tokenizer(self, model_args, tokenizer):
+        """Register the image special tokens on ``tokenizer`` and prepare their embeddings.
+
+        Depending on ``model_args`` this adds the patch token and/or the image
+        start/end tokens, resizes the embedding tables, initialises the new rows
+        with the mean of the existing rows, toggles ``requires_grad`` on the
+        embedding tables when ``tune_mm_mlp_adapter`` is set, and optionally copies
+        the new input rows from the ``pretrain_mm_mlp_adapter`` checkpoint.
+        """
+        def _set_trainable(module, flag):
+            # apply the same requires_grad flag to every parameter of the module
+            for param in module.parameters():
+                param.requires_grad = flag
+
+        if model_args.mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+
+        if not model_args.mm_use_im_start_end:
+            # No start/end tokens: with the patch token in use and adapter tuning
+            # requested, both embedding tables are frozen.
+            if model_args.mm_use_im_patch_token and model_args.tune_mm_mlp_adapter:
+                _set_trainable(self.get_input_embeddings(), False)
+                _set_trainable(self.get_output_embeddings(), False)
+            return
+
+        boundary_tokens = [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN]
+        num_new_tokens = tokenizer.add_tokens(boundary_tokens, special_tokens=True)
+        self.resize_token_embeddings(len(tokenizer))
+
+        if num_new_tokens > 0:
+            input_embeddings = self.get_input_embeddings().weight.data
+            output_embeddings = self.get_output_embeddings().weight.data
+            # start each new row at the mean of the rows that existed before
+            for table in (input_embeddings, output_embeddings):
+                table[-num_new_tokens:] = table[:-num_new_tokens].mean(dim=0, keepdim=True)
+
+        if model_args.tune_mm_mlp_adapter:
+            # input embeddings become trainable, output embeddings stay frozen
+            _set_trainable(self.get_input_embeddings(), True)
+            _set_trainable(self.get_output_embeddings(), False)
+
+        if model_args.pretrain_mm_mlp_adapter:
+            mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
+            embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
+            assert num_new_tokens == 2
+            # the checkpoint may hold the full table or only the rows of the new tokens
+            if input_embeddings.shape == embed_tokens_weight.shape:
+                pretrained_rows = embed_tokens_weight[-num_new_tokens:]
+            elif embed_tokens_weight.shape[0] == num_new_tokens:
+                pretrained_rows = embed_tokens_weight
+            else:
+                raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
+            input_embeddings[-num_new_tokens:] = pretrained_rows
diff --git a/minigemini/model/multimodal_encoder/builder.py b/minigemini/model/multimodal_encoder/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d026b3b2f5197fad19401f79bcf2818a5e71efe
--- /dev/null
+++ b/minigemini/model/multimodal_encoder/builder.py
@@ -0,0 +1,34 @@
+import os
+from .clip_encoder import CLIPVisionTower
+from .openclip_encoder import OpenCLIPVisionTower
+
+
+def build_vision_tower(vision_tower_cfg, **kwargs):
+    """Build the main vision tower named by the config.
+
+    The name is read from ``mm_vision_tower`` (falling back to ``vision_tower``).
+    Names containing "openai" (case-insensitive) or "ShareGPT4V" select a
+    ``CLIPVisionTower`` loaded from ``openai/clip-vit-large-patch14-336``; the
+    configured name itself is not used as the load path.
+
+    Args:
+        vision_tower_cfg: Config object carrying the tower settings; it is also
+            forwarded to the tower as ``args``.
+        **kwargs: Extra keyword arguments forwarded to the tower constructor.
+
+    Raises:
+        ValueError: If no tower name is configured or the name is not recognised.
+    """
+    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+
+    # A missing name is reported like any other unknown tower instead of failing on ``None.lower()``.
+    if vision_tower is not None and ("openai" in vision_tower.lower() or "ShareGPT4V" in vision_tower):
+        vision_tower = 'openai/clip-vit-large-patch14-336'
+        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+
+    raise ValueError(f'Unknown vision tower: {vision_tower}')
+
+
+def build_vision_tower_aux(vision_tower_cfg, **kwargs):
+    """Build the auxiliary vision tower named by the config.
+
+    The name is read from ``mm_vision_tower_aux`` (falling back to
+    ``vision_tower_aux``) and matched case-insensitively:
+
+    * contains "openclip": an ``OpenCLIPVisionTower`` loaded from the fixed local
+      checkpoint directory below (the configured name is not used as the path);
+    * contains "openai": a ``CLIPVisionTower`` loaded from the configured name.
+
+    Args:
+        vision_tower_cfg: Config object carrying the tower settings; it is also
+            forwarded to the tower as ``args``.
+        **kwargs: Extra keyword arguments forwarded to the tower constructor.
+
+    Raises:
+        ValueError: If no tower name is configured or the name is not recognised.
+    """
+    vision_tower_aux = getattr(vision_tower_cfg, 'mm_vision_tower_aux', getattr(vision_tower_cfg, 'vision_tower_aux', None))
+
+    # A missing name is reported like any other unknown tower instead of failing on ``None.lower()``.
+    if vision_tower_aux is not None:
+        lowered = vision_tower_aux.lower()
+        if "openclip" in lowered:
+            vision_tower_aux = './checkpoints/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup'
+            return OpenCLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs)
+        if "openai" in lowered:
+            return CLIPVisionTower(vision_tower_aux, args=vision_tower_cfg, **kwargs)
+
+    raise ValueError(f'Unknown vision tower: {vision_tower_aux}')
\ No newline at end of file
diff --git a/minigemini/model/multimodal_encoder/clip_encoder.py b/minigemini/model/multimodal_encoder/clip_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb2578d11c1e46c48f18ee428650740d617dfa52
--- /dev/null
+++ b/minigemini/model/multimodal_encoder/clip_encoder.py
@@ -0,0 +1,89 @@
+import torch
+import torch.nn as nn
+
+from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+from ..processor.video_processor import VideoFramesProcessor
+
+class CLIPVisionTower(nn.Module):
+    """Wrapper around a CLIP vision model that returns hidden states of one selected layer.
+
+    Args:
+        vision_tower: Name or path handed to the ``from_pretrained`` loaders.
+        args: Config object; reads ``mm_vision_select_layer`` (required),
+            ``mm_vision_select_feature`` (default ``'patch'``),
+            ``optimize_vision_tower`` (default ``False``) and
+            ``unfreeze_mm_vision_tower`` (default ``False``).
+        delay_load: When true and ``unfreeze_mm_vision_tower`` is not set, only the
+            model config is loaded and the weights are left for a later
+            ``load_model()`` call.
+    """
+
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__()
+
+        self.is_loaded = False
+
+        self.vision_tower_name = vision_tower
+        self.select_layer = args.mm_vision_select_layer
+        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+        self.is_optimize = getattr(args, 'optimize_vision_tower', False)
+
+        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
+            self.load_model()
+        else:
+            # keep only the config so that hidden_size / num_patches work before loading
+            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+    def load_model(self):
+        """Load the image processor and the vision model, with all model parameters frozen."""
+        self.image_processor = VideoFramesProcessor.from_pretrained(self.vision_tower_name)
+        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+        self.vision_tower.requires_grad_(False)
+
+        self.is_loaded = True
+
+    def feature_select(self, image_forward_outs):
+        """Pick the configured hidden layer; ``'patch'`` drops the first token, ``'cls_patch'`` keeps all.
+
+        Raises:
+            ValueError: If ``select_feature`` is neither ``'patch'`` nor ``'cls_patch'``.
+        """
+        image_features = image_forward_outs.hidden_states[self.select_layer]
+        if self.select_feature == 'patch':
+            return image_features[:, 1:]
+        if self.select_feature == 'cls_patch':
+            return image_features
+        raise ValueError(f'Unexpected select feature: {self.select_feature}')
+
+    def image_forward(self, images):
+        """Run the vision model on a batch tensor, or one by one on a list of image tensors.
+
+        A list input (including list subclasses) yields a list of per-image
+        features, each with a leading batch dim of 1; a tensor input yields one
+        batched tensor. Features are cast back to the dtype of the input.
+        """
+        if isinstance(images, list):
+            image_features = []
+            for image in images:
+                image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+                image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                image_features.append(image_feature)
+        else:
+            image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+            image_features = self.feature_select(image_forward_outs).to(images.dtype)
+
+        return image_features
+
+    def forward(self, images):
+        """Encode ``images``; gradients are tracked only when ``optimize_vision_tower`` was set."""
+        if not self.is_optimize:
+            with torch.no_grad():
+                image_features = self.image_forward(images)
+        else:
+            image_features = self.image_forward(images)
+
+        return image_features
+
+    @property
+    def dummy_feature(self):
+        """A zero tensor of shape ``(1, hidden_size)`` on the tower's device and dtype."""
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        """Dtype of the wrapped vision model (requires the model to be loaded)."""
+        return self.vision_tower.dtype
+
+    @property
+    def device(self):
+        """Device of the wrapped vision model (requires the model to be loaded)."""
+        return self.vision_tower.device
+
+    @property
+    def config(self):
+        """Config of the loaded model, or the config-only object when loading was delayed."""
+        if self.is_loaded:
+            return self.vision_tower.config
+        else:
+            return self.cfg_only
+
+    @property
+    def hidden_size(self):
+        """``hidden_size`` taken from the config."""
+        return self.config.hidden_size
+
+    @property
+    def num_patches(self):
+        """Number of patches per image: ``(image_size // patch_size) ** 2`` from the config."""
+        return (self.config.image_size // self.config.patch_size) ** 2
\ No newline at end of file
diff --git a/minigemini/model/multimodal_encoder/eva_encoder.py b/minigemini/model/multimodal_encoder/eva_encoder.py
new file mode 100755
index 0000000000000000000000000000000000000000..50443b2a0fbbaa21a808a7185a7b0df273eed34b
--- /dev/null
+++ b/minigemini/model/multimodal_encoder/eva_encoder.py
@@ -0,0 +1,551 @@
+# Based on EVA, BEIT, timm and DeiT code bases
+# https://github.com/baaivision/EVA
+# https://github.com/rwightman/pytorch-image-models/tree/master/timm
+# https://github.com/microsoft/unilm/tree/master/beit
+# https://github.com/facebookresearch/deit/
+# https://github.com/facebookresearch/dino
+# --------------------------------------------------------'
+import math
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as checkpoint
+from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+from timm.models.registry import register_model
+from transformers import CLIPImageProcessor, CLIPVisionConfig
+from ..processor.video_processor import VideoFramesProcessor
+
+def _cfg(url='', **kwargs):
+    """Return the default model config dict for ``url``, with ``kwargs`` overriding or extending it."""
+    config = dict(
+        url=url,
+        num_classes=1000,
+        input_size=(3, 224, 224),
+        pool_size=None,
+        crop_pct=.9,
+        interpolation='bicubic',
+        mean=(0.5, 0.5, 0.5),
+        std=(0.5, 0.5, 0.5),
+    )
+    config.update(kwargs)
+    return config
+
+class DropPath(nn.Module):
+    """Stochastic depth: drops whole residual paths per sample via ``drop_path``."""
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        # drop_path receives the module's training flag along with the probability
+        return drop_path(x, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f'p={self.drop_prob}'
+
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward block: linear, activation, linear, dropout."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+        super().__init__()
+        # unset (falsy) widths fall back to the input width
+        hidden_width = hidden_features or in_features
+        output_width = out_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_width)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_width, output_width)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        # dropout is applied only after the second linear layer, not after the activation
+        hidden = self.act(self.fc1(x))
+        return self.drop(self.fc2(hidden))
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with optional relative position bias.
+
+    The query and value projections can carry a learned bias (``qkv_bias``); the key
+    projection always uses a zero bias. When ``window_size`` is given, the module
+    owns a bias table indexed by relative position, with three extra entries
+    for the additional first token (cls to token, token to cls, cls to cls).
+    """
+
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., window_size=None, attn_head_dim=None):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+
+        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+        if not qkv_bias:
+            self.q_bias = None
+            self.v_bias = None
+        else:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+
+        if not window_size:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+        else:
+            win_h, win_w = window_size[0], window_size[1]
+            self.window_size = window_size
+            self.num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
+            # one bias per head for every relative offset, plus the three cls entries
+            self.relative_position_bias_table = nn.Parameter(
+                torch.zeros(self.num_relative_distance, num_heads))
+
+            # pairwise relative offsets between all positions inside the window
+            grid = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)]))  # 2, Wh, Ww
+            flat = torch.flatten(grid, 1)  # 2, Wh*Ww
+            offsets = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
+            offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+            offsets[:, :, 0] += win_h - 1  # shift both axes so they start from 0
+            offsets[:, :, 1] += win_w - 1
+            offsets[:, :, 0] *= 2 * win_w - 1  # row stride, so the sum below is a flat index
+
+            token_count = win_h * win_w + 1
+            index = torch.zeros(size=(token_count, ) * 2, dtype=offsets.dtype)
+            index[1:, 1:] = offsets.sum(-1)
+            index[0, 0:] = self.num_relative_distance - 3
+            index[0:, 0] = self.num_relative_distance - 2
+            index[0, 0] = self.num_relative_distance - 1
+
+            self.register_buffer("relative_position_index", index)
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, rel_pos_bias=None):
+        batch, tokens, _ = x.shape
+
+        if self.q_bias is None:
+            bias = None
+        else:
+            # the key slice of the fused bias is fixed at zero
+            bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+        fused = F.linear(input=x, weight=self.qkv.weight, bias=bias)
+        fused = fused.reshape(batch, tokens, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+        query, key, value = fused[0], fused[1], fused[2]
+
+        scores = (query * self.scale) @ key.transpose(-2, -1)
+
+        if self.relative_position_bias_table is not None:
+            side = self.window_size[0] * self.window_size[1] + 1
+            own_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                side, side, -1)  # Wh*Ww+1, Wh*Ww+1, nH
+            own_bias = own_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww+1, Wh*Ww+1
+            scores = scores + own_bias.unsqueeze(0)
+
+        if rel_pos_bias is not None:
+            scores = scores + rel_pos_bias
+
+        weights = self.attn_drop(scores.softmax(dim=-1))
+
+        out = (weights @ value).transpose(1, 2).reshape(batch, tokens, -1)
+        return self.proj_drop(self.proj(out))
+
+
+class Block(nn.Module):
+    """Transformer block: pre-norm attention and pre-norm MLP, each on a residual path.
+
+    When ``init_values`` is a positive number, each branch output is multiplied by
+    a learnable per-channel scale (``gamma_1`` / ``gamma_2``) initialised to that value.
+    """
+
+    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+                 window_size=None, attn_head_dim=None):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+            attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
+        # stochastic depth on the residual branches; a no-op when the rate is zero
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)
+
+        use_branch_scale = init_values is not None and init_values > 0
+        if use_branch_scale:
+            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x, rel_pos_bias=None):
+        # presence of gamma_1 decides whether both branches are scaled
+        scaled = self.gamma_1 is not None
+
+        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+        if scaled:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+
+        mlp_out = self.mlp(self.norm2(x))
+        if scaled:
+            mlp_out = self.gamma_2 * mlp_out
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class PatchEmbed(nn.Module):
+    """Image to patch embedding: a strided convolution followed by flattening to tokens."""
+
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+        super().__init__()
+        img_size = to_2tuple(img_size)
+        patch_size = to_2tuple(patch_size)
+        rows = img_size[0] // patch_size[0]
+        cols = img_size[1] // patch_size[1]
+        self.patch_shape = (rows, cols)
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.num_patches = cols * rows
+
+        # kernel size equal to stride: every patch is projected independently
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, x, **kwargs):
+        _, _, height, width = x.shape
+        # FIXME look at relaxing size constraints
+        assert height == self.img_size[0] and width == self.img_size[1], \
+            f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+        patches = self.proj(x)
+        return patches.flatten(2).transpose(1, 2)
+
+
+class RelativePositionBias(nn.Module):
+    """Learned per-head bias looked up by relative (dh, dw) offset; ``forward()`` returns it as (nH, L, L), L = Wh*Ww + 1."""
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros(self.num_relative_distance, num_heads))  # (2*Wh-1)*(2*Ww-1)+3, nH
+        # the 3 extra rows are for: cls to token & token 2 cls & cls to cls
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = torch.arange(window_size[0])
+        coords_w = torch.arange(window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1  # row stride, so that the sum below is a unique flat index
+        relative_position_index = \
+            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        relative_position_index[0, 0:] = self.num_relative_distance - 3  # first row -> first extra entry
+        relative_position_index[0:, 0] = self.num_relative_distance - 2  # first column -> second extra entry
+        relative_position_index[0, 0] = self.num_relative_distance - 1  # top-left cell -> third extra entry
+
+        self.register_buffer("relative_position_index", relative_position_index)
+        # A buffer, not a parameter: the index is fixed and only the table above is learned.
+        # The table keeps its zero initialisation (the trunc_normal_ init is disabled).
+
+    def forward(self):
+        relative_position_bias = \
+            self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                self.window_size[0] * self.window_size[1] + 1,
+                self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww+1, Wh*Ww+1, nH
+        return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww+1, Wh*Ww+1
+
+
+class VisionTransformer(nn.Module):
+    """ViT backbone used as a feature extractor.
+
+    No final norm, pooling or classification head is built here: ``forward`` returns
+    the whole token sequence (cls token first) produced by the last block.
+    ``num_classes`` is only stored, and ``use_mean_pooling`` / ``init_scale`` are
+    accepted for signature compatibility but never read.
+
+    Position information comes from ``pos_embed`` (``use_abs_pos_emb``), from one
+    shared ``RelativePositionBias`` (``use_shared_rel_pos_bias``) and/or from a
+    per-block window passed to ``Block`` (``use_rel_pos_bias``).
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+                 drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
+                 use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
+                 use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
+        super().__init__()
+        self.image_size = img_size
+        self.num_classes = num_classes
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        if use_abs_pos_emb:
+            # One learned position per patch plus one for the cls token.
+            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+        else:
+            self.pos_embed = None
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # Shared bias: a single RelativePositionBias whose output is passed to every block.
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+        self.use_checkpoint = use_checkpoint
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
+            for i in range(depth)])
+
+        # Initialise the embeddings, then every Linear/LayerNorm, then rescale per depth.
+        if self.pos_embed is not None:
+            trunc_normal_(self.pos_embed, std=.02)
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+        self.fix_init_weight()
+
+    def fix_init_weight(self):
+        """Divide each block's attn.proj and mlp.fc2 weights by sqrt(2 * (block index + 1))."""
+        for layer_id, layer in enumerate(self.blocks, start=1):
+            scale = math.sqrt(2.0 * layer_id)
+            layer.attn.proj.weight.data.div_(scale)
+            layer.mlp.fc2.weight.data.div_(scale)
+
+    def _init_weights(self, m):
+        """Trunc-normal (std=.02) Linear weights with zero bias; LayerNorm reset to weight 1, bias 0."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    def get_classifier(self):
+        """Return the head installed by ``reset_classifier``, or ``None`` when there is none."""
+        # __init__ builds no head, so reading ``self.head`` directly raised AttributeError.
+        return getattr(self, 'head', None)
+
+    def reset_classifier(self, num_classes, global_pool=''):
+        """Install a fresh ``head``; ``forward`` does not apply it, callers must do so themselves."""
+        self.num_classes = num_classes
+        self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+    def _embed(self, x):
+        """Patchify ``x``, prepend the cls token, add absolute positions (if any), apply dropout."""
+        x = self.patch_embed(x)
+        cls_tokens = self.cls_token.expand(x.size(0), -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+        if self.pos_embed is not None:
+            x = x + self.pos_embed
+        return self.pos_drop(x)
+
+    def forward_features(self, x):
+        """Return the last block's token sequence (cls token first), without norm or pooling."""
+        x = self._embed(x)
+        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+        for blk in self.blocks:
+            if self.use_checkpoint:
+                x = checkpoint.checkpoint(blk, x, rel_pos_bias)
+            else:
+                x = blk(x, rel_pos_bias)
+        return x
+
+    def forward(self, x):
+        """Same as ``forward_features``: no classification head is applied."""
+        return self.forward_features(x)
+
+    def get_intermediate_layers(self, x):
+        """Return a list holding the output of every block, in order (never checkpointed)."""
+        x = self._embed(x)
+        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+        features = []
+        for blk in self.blocks:
+            x = blk(x, rel_pos_bias)
+            features.append(x)
+        return features
+
+    @property
+    def dtype(self):
+        """dtype of the cls token, used as a proxy for the whole model."""
+        return self.cls_token.dtype
+
+    @property
+    def device(self):
+        """Device of the cls token, used as a proxy for the whole model."""
+        return self.cls_token.device
+
+    def get_num_layer(self, var_name=""):
+        """Map a parameter name to a layer index in ``[0, len(self.blocks)]``.
+
+        ``cls_token`` / ``mask_token`` / ``pos_embed`` / ``patch_embed*`` map to 0,
+        ``blocks.<i>.*`` to ``i + 1``, ``rel_pos_bias*`` to ``len(self.blocks) - 1`` and
+        anything else to ``len(self.blocks)``.
+        """
+        if var_name in ("cls_token", "mask_token", "pos_embed"):
+            return 0
+        elif var_name.startswith("patch_embed"):
+            return 0
+        elif var_name.startswith("rel_pos_bias"):
+            return len(self.blocks) - 1
+        elif var_name.startswith("blocks"):
+            layer_id = int(var_name.split('.')[1])
+            return layer_id + 1
+        else:
+            return len(self.blocks)
+        
+            
+def interpolate_pos_embed(model, checkpoint_model):
+    """Resize ``checkpoint_model['pos_embed']`` in place to ``model``'s (square) patch grid."""
+    # A model built with use_abs_pos_emb=False has pos_embed=None: nothing to resize against.
+    if 'pos_embed' not in checkpoint_model or getattr(model, 'pos_embed', None) is None:
+        return
+    pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
+    embedding_size = pos_embed_checkpoint.shape[-1]
+    num_patches = model.patch_embed.num_patches
+    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+    # height (== width) of the checkpoint grid and of the model grid
+    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+    new_size = int(num_patches ** 0.5)
+    if orig_size == new_size:
+        return
+    print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+    # the extra (class/dist) tokens are kept unchanged; only the position tokens are interpolated
+    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+    pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+    pos_tokens = torch.nn.functional.interpolate(pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+    checkpoint_model['pos_embed'] = torch.cat((extra_tokens, pos_tokens), dim=1)
+            
+            
+def convert_weights_to_fp16(model: nn.Module):
+    """Cast the weights and biases of every Conv1d/Conv2d/Linear inside ``model`` to fp16.
+
+    The cast is done in place on ``.data``; all other modules (e.g. LayerNorm) and
+    bare parameters keep their dtype. Returns ``None``.
+    """
+    castable = (nn.Conv1d, nn.Conv2d, nn.Linear)
+
+    def _to_half(module):
+        if not isinstance(module, castable):
+            return
+        module.weight.data = module.weight.data.half()
+        if module.bias is not None:
+            module.bias.data = module.bias.data.half()
+
+    model.apply(_to_half)
+
+class EVAVisionTower(nn.Module):
+    """EVA vision transformer tower with frozen weights; ``forward`` returns every token (cls first)."""
+    def __init__(self, vision_tower, image_processor, args, use_checkpoint=False, drop_path_rate=0.0, delay_load=False, dtype=torch.float32):
+        super().__init__()
+        self.is_loaded = False
+        self.use_checkpoint = use_checkpoint
+        self.vision_tower_name = vision_tower  # path handed to torch.load in load_model
+        self.image_processor_name = image_processor
+        self.drop_path_rate = drop_path_rate
+        self.patch_size = 14
+        self.out_channel = 1408
+        # Read by feature_select but never assigned before. NOTE(review): config key name assumed -- confirm.
+        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+        if not delay_load:
+            self.load_model()
+        self.vision_config = CLIPVisionConfig.from_pretrained(image_processor)
+
+    def load_model(self):
+        """Build the ViT, load (and if needed interpolate) its checkpoint, and freeze it."""
+        self.image_processor = VideoFramesProcessor.from_pretrained(self.image_processor_name)
+        self.vision_tower = VisionTransformer(
+            img_size=self.image_processor.size['shortest_edge'],
+            patch_size=self.patch_size,
+            use_mean_pooling=False,
+            embed_dim=self.out_channel,
+            depth=39,
+            num_heads=self.out_channel//88,
+            mlp_ratio=4.3637,
+            qkv_bias=True,
+            drop_path_rate=self.drop_path_rate,
+            norm_layer=partial(nn.LayerNorm, eps=1e-6),
+            use_checkpoint=self.use_checkpoint,
+        )
+
+        # NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
+        state_dict = torch.load(self.vision_tower_name, map_location="cpu")
+        interpolate_pos_embed(self.vision_tower, state_dict)
+        incompatible_keys = self.vision_tower.load_state_dict(state_dict, strict=False)
+        print(incompatible_keys)
+        self.vision_tower.requires_grad_(False)
+
+        self.is_loaded = True
+
+    @torch.no_grad()
+    def forward(self, images):
+        """Encode a batch tensor, or a list of single images (each encoded as a batch of one)."""
+        if isinstance(images, list):
+            return [self._encode(image.unsqueeze(0)) for image in images]
+        return self._encode(images)
+
+    def _encode(self, images):
+        """Run the tower in its own dtype/device and cast the features back to the input dtype."""
+        features = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
+        return features.to(images.dtype)
+
+    def feature_select(self, image_features):
+        """Drop the leading cls token for ``'patch'``; keep all tokens for ``'cls_patch'``."""
+        if self.select_feature == 'patch':
+            return image_features[:, 1:]
+        if self.select_feature == 'cls_patch':
+            return image_features
+        raise ValueError(f'Unexpected select feature: {self.select_feature}')
+
+    @property
+    def dummy_feature(self):
+        """All-zero feature of shape (1, hidden_size) on the tower's device and dtype."""
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        return self.vision_tower.dtype  # only available once load_model has run
+
+    @property
+    def device(self):
+        return self.vision_tower.device  # only available once load_model has run
+
+    @property
+    def config(self):
+        return self.vision_config
+
+    @property
+    def hidden_size(self):
+        return self.out_channel
+
+    @property
+    def num_patches(self):
+        """Patches per image: (shortest_edge // patch_size) squared; needs load_model to have run."""
+        return (self.image_processor.size['shortest_edge'] // self.patch_size) ** 2
+    
+
+    
+def create_eva_vit_g(img_size=224,drop_path_rate=0.4,use_checkpoint=False,model_path=None,precision="fp16"):
+    """Build the EVA ViT-g backbone, load ``model_path`` into it and optionally cast to fp16.
+
+    Raises ``ValueError`` when ``model_path`` is not given: there is no download fallback.
+    """
+    if model_path is None:
+        raise ValueError("create_eva_vit_g requires `model_path` (path to the EVA ViT-g checkpoint)")
+    model = VisionTransformer(
+        img_size=img_size,
+        patch_size=14,
+        use_mean_pooling=False,
+        embed_dim=1408,
+        depth=39,
+        num_heads=1408//88,
+        mlp_ratio=4.3637,
+        qkv_bias=True,
+        drop_path_rate=drop_path_rate,
+        norm_layer=partial(nn.LayerNorm, eps=1e-6),
+        use_checkpoint=use_checkpoint,
+    )
+    state_dict = torch.load(model_path, map_location="cpu")  # NOTE(review): unpickles -- trusted files only
+    interpolate_pos_embed(model, state_dict)  # adapt pos_embed to img_size if the grids differ
+    incompatible_keys = model.load_state_dict(state_dict, strict=False)
+    print(incompatible_keys)
+    if precision == "fp16":
+        convert_weights_to_fp16(model)
+    return model
\ No newline at end of file
diff --git a/minigemini/model/multimodal_encoder/openclip_encoder.py b/minigemini/model/multimodal_encoder/openclip_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcee829a17a966c7bf01bbc51ba4e6e0701f79a0
--- /dev/null
+++ b/minigemini/model/multimodal_encoder/openclip_encoder.py
@@ -0,0 +1,225 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+import json
+import logging
+import deepspeed
+from pathlib import Path
+from open_clip.factory import load_state_dict, get_model_config
+from open_clip.model import CLIPVisionCfg, CLIPTextCfg, _build_vision_tower, convert_to_custom_text_state_dict, resize_pos_embed
+from typing import Dict, Optional
+from transformers.deepspeed import deepspeed_config, is_deepspeed_zero3_enabled
+
+open_clip_config = {  # static config returned by OpenCLIPVisionTower.config; load_model does not read it
+  "model_cfg": {
+    "embed_dim": 768,
+    "vision_cfg": {
+      "timm_model_name": "convnext_large",
+      "timm_model_pretrained": False,
+      "timm_pool": "",
+      "timm_proj": "mlp",
+      "timm_drop": 0.0,
+      "timm_drop_path": 0.1,
+      "image_size": 320
+    },
+    "text_cfg": {
+      "context_length": 77,
+      "vocab_size": 49408,
+      "width": 768,
+      "heads": 12,
+      "layers": 16
+    }
+  },
+  "preprocess_cfg": {  # presumably per-channel image normalisation statistics -- TODO confirm usage
+    "mean": [
+      0.48145466,
+      0.4578275,
+      0.40821073
+    ],
+    "std": [
+      0.26862954,
+      0.26130258,
+      0.27577711
+    ]
+  }
+}
+
+# Vision tower built from the stem and stages of an OpenCLIP ConvNeXt trunk.
+class OpenCLIPVisionTower(nn.Module):
+    """ConvNeXt trunk (from OpenCLIP) whose stage outputs are upsampled and concatenated on channels."""
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__()
+
+        self.is_loaded = False
+        self.vision_tower_name = vision_tower
+        self.vision_config = open_clip_config
+        # When True, backbone() runs with gradients enabled; otherwise under torch.no_grad().
+        self.is_optimize = getattr(args, 'optimize_vision_tower_aux', False)
+
+        if not delay_load:
+            self.load_model()
+
+    def load_model(self):
+        """Pick the ConvNeXt variant from the tower name, load its checkpoint, keep stem + stages.
+
+        Raises ``ValueError`` when the name matches none of the supported variants.
+        """
+        ckpt_path = os.path.join(self.vision_tower_name, 'open_clip_pytorch_model.bin')
+        name = self.vision_tower_name
+        if 'convnext' in name and 'large' in name and 'd_320' in name:
+            self.model_type = 'convnext_large_d_320'
+            self.model_channel = [192, 384, 768, 1536] # stage 0-3
+        elif 'convnext' in name and 'base' in name and 'w_320' in name:
+            self.model_type = 'convnext_base_w_320'
+            self.model_channel = [128, 256, 512, 1024]
+        elif 'convnext' in name and 'xxlarge' in name:
+            self.model_type = 'convnext_xxlarge'
+            self.model_channel = [384, 768, 1536, 3072]
+        else:
+            # Previously this fell through and failed later with an AttributeError on model_type.
+            raise ValueError(f'Unsupported OpenCLIP vision tower: {name}')
+
+        clip_model = CLIP(**get_model_config(self.model_type))
+        clip_model.visual.trunk.norm_pre = None
+        clip_model.visual.trunk.head = None
+        clip_model.visual.head = None
+        print(f'Loading pretrained weights ({self.model_type}).')
+        load_checkpoint(clip_model, ckpt_path, strict=False)
+
+        # decompose stem and stages blocks in vision tower
+        self.vision_stem = clip_model.visual.trunk.stem
+        self.vision_stages = clip_model.visual.trunk.stages
+        self.vision_stem.requires_grad_(False)
+        self.vision_stages.requires_grad_(False)
+        self.is_loaded = True
+
+    def forward(self, images):
+        """Encode a batch tensor, or a list of single images (each encoded as a batch of one)."""
+        if isinstance(images, list):
+            return [self.backbone(image.to(device=self.device, dtype=self.dtype).unsqueeze(0)) for image in images]
+        return self.backbone(images.to(device=self.device, dtype=self.dtype))
+
+    def backbone(self, images):
+        """Return all stage outputs resized to stage 0's resolution and concatenated on channels."""
+        if self.is_optimize:
+            results = self.basic_forward(images)
+        else:
+            with torch.no_grad():
+                results = self.basic_forward(images)
+
+        target_size = tuple(results['stage_0'].shape[-2:])
+        result_cat = []
+        for stage_name, feature in results.items():
+            if stage_name != 'stage_0':
+                # interpolate in fp32, then cast back to the stage's own dtype
+                feature = F.interpolate(feature.float().contiguous(), size=target_size,
+                                        mode='bilinear', align_corners=False).to(dtype=feature.dtype)
+            result_cat.append(feature.contiguous())
+        return torch.cat(result_cat, dim=1).contiguous()
+
+    def basic_forward(self, images):
+        """Run the stem, then every stage; return {'stage_<i>': output of stage i} in stage order."""
+        results = {}
+        x = self.vision_stem(images)
+        for idx, stage in enumerate(self.vision_stages):
+            x = stage(x)
+            results[f'stage_{idx}'] = x
+        return results
+
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+    @property
+    def dtype(self):
+        return self.vision_stem[0].weight.dtype
+
+    @property
+    def device(self):
+        return self.vision_stem[0].weight.device
+
+    @property
+    def config(self):
+        return self.vision_config
+
+    @property
+    def hidden_size(self):
+        """Total channel count of the concatenated stage features (set by load_model)."""
+        return sum(self.model_channel)
+
+# modified function from open_clip to support zero3 stage
+def load_checkpoint(model, checkpoint_path, strict=True):
+    """Load an OpenCLIP checkpoint into ``model``, with a DeepSpeed ZeRO-3 aware code path.
+
+    Returns ``{}`` for big_vision ``.npz``/``.npy`` weights, ``[]`` under ZeRO-3 (missing and
+    unexpected keys are not tracked there and ``strict`` is ignored), otherwise the result of
+    ``model.load_state_dict(state_dict, strict=strict)``.
+    """
+    if Path(checkpoint_path).suffix in ('.npz', '.npy'):
+        from open_clip.big_vision import load_big_vision_weights
+        load_big_vision_weights(model, checkpoint_path)
+        return {}
+
+    state_dict = load_state_dict(checkpoint_path)
+    # detect old format and make compatible with new format
+    if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
+        state_dict = convert_to_custom_text_state_dict(state_dict)
+    # Certain text transformers no longer expect position_ids after transformers==4.31
+    position_id_key = 'text.transformer.embeddings.position_ids'
+    if position_id_key in state_dict and not hasattr(model, position_id_key):
+        del state_dict[position_id_key]
+    resize_pos_embed(state_dict, model)
+
+    # Regular path: let PyTorch load the whole state dict at once.
+    if not is_deepspeed_zero3_enabled():
+        incompatible_keys = model.load_state_dict(state_dict, strict=strict)
+        logging.info(f"incompatible_keys.missing_keys: {incompatible_keys.missing_keys}")
+        return incompatible_keys
+
+    # ZeRO-3 path: parameters are partitioned, so they are gathered and loaded module by module.
+    error_msgs = []
+
+    def load(module: nn.Module, prefix=""):
+        # Parameters of module and children will start with prefix. The module itself is skipped
+        # if there are none in this state_dict; its children are still visited.
+        if any(key.startswith(prefix) for key in state_dict):
+            # In sharded models, each shard has only part of the full state_dict, so only gather
+            # parameters that are in the current state_dict.
+            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
+            params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]
+            if params_to_gather:
+                # because zero3 puts placeholders in model params, this context
+                # manager gathers (unpartitions) the params of the current layer, then loads from
+                # the state dict and then re-partitions them again
+                with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
+                    if torch.distributed.get_rank() == 0:
+                        # args: state_dict, prefix, local_metadata, strict, missing, unexpected, errors
+                        module._load_from_state_dict(state_dict, prefix, {}, True, [], [], error_msgs)
+
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, prefix + name + ".")
+
+    load(model)
+    if error_msgs:
+        # Collected only on rank 0 (the rank that copies); previously dropped silently.
+        logging.warning("Errors while loading %s under ZeRO-3:\n\t%s", checkpoint_path, "\n\t".join(error_msgs))
+    return []
+
+class CLIP(nn.Module):
+    """Minimal CLIP container that builds only the vision tower (``self.visual``).
+
+    ``text_cfg`` is accepted so a full model config can be passed as keyword arguments,
+    but it is not used; ``output_dict`` is stored and not otherwise read in this class.
+    """
+    output_dict: torch.jit.Final[bool]
+
+    def __init__(self, embed_dim: int, vision_cfg: CLIPVisionCfg, text_cfg: CLIPTextCfg,
+                 quick_gelu: bool = False, cast_dtype: Optional[torch.dtype] = None,
+                 output_dict: bool = False):
+        super().__init__()
+        self.output_dict = output_dict
+
+        # Only the image tower is instantiated; no text tower is created.
+        self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
diff --git a/minigemini/model/multimodal_projector/builder.py b/minigemini/model/multimodal_projector/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..86fc1cb1824611fed5f502a96924a397c4c28610
--- /dev/null
+++ b/minigemini/model/multimodal_projector/builder.py
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+import re
+
+class IdentityMap(nn.Module):
+    """No-op projector: ``forward`` hands back ``x`` and ignores every other argument."""
+
+    def forward(self, x, *args, **kwargs):
+        return x
+
+    @property
+    def config(self):
+        """Projector config dict identifying this module as the ``'identity'`` type."""
+        return {"mm_projector_type": 'identity'}
+
+
+class SimpleResBlock(nn.Module):
+    """Pre-norm residual MLP: ``y = LN(x) + MLP(LN(x))`` with a Linear-GELU-Linear MLP."""
+    def __init__(self, channels):
+        super().__init__()
+        self.pre_norm = nn.LayerNorm(channels)
+
+        layers = [nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)]
+        self.proj = nn.Sequential(*layers)
+
+    def forward(self, x):
+        # The skip connection carries the *normalised* input, not the raw one.
+        normed = self.pre_norm(x)
+        return normed + self.proj(normed)
+
+
+def build_vision_projector(config, delay_load=False, **kwargs):
+    """Create the projector named by ``config.mm_projector_type`` (default ``'linear'``).
+
+    Supported: ``'linear'``, ``'mlp<N>x_gelu'`` (Linear layers separated by GELU, at least one
+    Linear) and ``'identity'``; anything else raises ``ValueError``. ``delay_load``/``kwargs`` are unused.
+    """
+    projector_type = getattr(config, 'mm_projector_type', 'linear')
+    if projector_type == 'identity':
+        return IdentityMap()
+    if projector_type == 'linear':
+        return nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+    depth_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+    if depth_match is None:
+        raise ValueError(f'Unknown projector type: {projector_type}')
+    layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+    for _ in range(int(depth_match.group(1)) - 1):
+        layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
+    return nn.Sequential(*layers)
\ No newline at end of file
diff --git a/minigemini/model/processor/video_processor.py b/minigemini/model/processor/video_processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..c87d09519b0e76ddcee5bf06cf7dbec3013e9091
--- /dev/null
+++ b/minigemini/model/processor/video_processor.py
@@ -0,0 +1,74 @@
+from transformers import CLIPImageProcessor
+from transformers.image_processing_utils import BatchFeature, get_size_dict
+from transformers.image_transforms import get_resize_output_image_size
+
+import torch
+import torch.nn.functional as F
+
+import numpy as np
+
+
+class VideoFramesProcessor(CLIPImageProcessor):
+    """CLIPImageProcessor with a batched torch path for frames given as one ``np.ndarray``."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def preprocess(self, images, **kwargs):
+        """Preprocess ``images``; non-ndarray inputs are delegated to ``CLIPImageProcessor``.
+
+        An ndarray is treated as a channels-last batch (N, H, W, C) and returned as
+        ``pixel_values`` of shape (N, C, H', W').
+        """
+        if not isinstance(images, np.ndarray):
+            return super().preprocess(images=images, **kwargs)
+
+        do_resize = kwargs.get('do_resize', self.do_resize)
+        size = kwargs.get('size', self.size)
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        do_center_crop = kwargs.get('do_center_crop', self.do_center_crop)
+        crop_size = kwargs.get('crop_size', self.crop_size)
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        do_rescale = kwargs.get('do_rescale', self.do_rescale)
+        rescale_factor = kwargs.get('rescale_factor', self.rescale_factor)
+        do_normalize = kwargs.get('do_normalize', self.do_normalize)
+        image_mean = kwargs.get('image_mean', self.image_mean)
+        image_std = kwargs.get('image_std', self.image_std)
+        return_tensors = kwargs.get('return_tensors', None)
+
+        def resize(images, output_size):
+            images = images.permute((0, 3, 1, 2))
+            images = F.interpolate(images, size=output_size, mode='bicubic')
+            images = images.permute((0, 2, 3, 1))
+            return images
+
+        def center_crop(images, crop_size):
+            # images is (N, H, W, C): axis 1 is height and axis 2 is width. The previous code
+            # unpacked them the other way round, so non-square crops came out transposed.
+            crop_height, crop_width = crop_size["height"], crop_size["width"]
+            img_height, img_width = images.shape[1:3]
+            top = (img_height - crop_height) // 2
+            left = (img_width - crop_width) // 2
+            return images[:, top:top + crop_height, left:left + crop_width]
+
+        def rescale(images, rescale_factor):
+            return images * rescale_factor
+
+        def normalize(images, mean, std):
+            # mean/std broadcast over the trailing channel axis
+            return (images - torch.tensor(mean)) / torch.tensor(std)
+
+        images = torch.from_numpy(images).float()
+
+        if do_resize:
+            output_size = get_resize_output_image_size(images[0], size=size["shortest_edge"], default_to_square=False)
+            images = resize(images, output_size)
+        if do_center_crop:
+            images = center_crop(images, crop_size)
+        if do_rescale:
+            images = rescale(images, rescale_factor)
+        if do_normalize:
+            images = normalize(images, image_mean, image_std)
+
+        images = images.permute((0, 3, 1, 2))
+        return BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
diff --git a/minigemini/serve/__init__.py b/minigemini/serve/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/minigemini/serve/cli.py b/minigemini/serve/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ff30cfec4d056c478020690760bafda9390ebae
--- /dev/null
+++ b/minigemini/serve/cli.py
@@ -0,0 +1,237 @@
+import argparse
+import torch
+
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from minigemini.conversation import conv_templates, SeparatorStyle
+from minigemini.model.builder import load_pretrained_model
+from minigemini.utils import disable_torch_init
+from minigemini.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
+
+from PIL import Image
+
+import requests
+from PIL import Image
+from io import BytesIO
+from transformers import TextStreamer
+try:
+    from diffusers import StableDiffusionXLPipeline
+except:
+    print('please install diffusers==0.26.3')
+
+try:
+    from paddleocr import PaddleOCR
+except:
+    print('please install paddleocr following https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/README_en.md')
+
+
+def load_image(image_file):
+    """Load an image from a local path or an HTTP(S) URL as an RGB PIL image.
+
+    Args:
+        image_file: filesystem path, or a URL starting with http:// or
+            https://.
+
+    Returns:
+        The decoded image converted to RGB mode.
+
+    Raises:
+        requests.exceptions.RequestException: if the download fails, times
+            out, or the server answers with an HTTP error status.
+    """
+    if image_file.startswith(('http://', 'https://')):
+        # Bound the wait and surface HTTP errors here; otherwise an error page
+        # would be handed to PIL and fail with an unrelated decoding error.
+        response = requests.get(image_file, timeout=30)
+        response.raise_for_status()
+        source = BytesIO(response.content)
+    else:
+        source = image_file
+    return Image.open(source).convert('RGB')
+
+
+def main(args):
+    """Run an interactive command-line chat with a Mini-Gemini model.
+
+    Loads the model named by ``args.model_path``, optionally preprocesses the
+    image(s) given by ``args.image_file``, then reads user turns from stdin
+    until an empty line or EOF. With ``args.ocr`` the OCR text found in the
+    image is appended to every user turn. With ``args.gen`` a ``<GEN>`` tag is
+    appended and, when the reply contains an ``<h>...</h>`` span, that span is
+    rendered with Stable Diffusion XL and saved to ``args.output_file``.
+
+    Args:
+        args: parsed command-line namespace built in the ``__main__`` block.
+    """
+    # Model
+    disable_torch_init()
+
+    # Defined up front so the chat loop can test it even when --ocr is given
+    # without --image-file.
+    str_in_image = ''
+    if args.ocr and args.image_file is not None:
+        ocr = PaddleOCR(use_angle_cls=True, use_gpu=True, lang="ch")
+        result = ocr.ocr(args.image_file)
+        if result[0] is not None:
+            # Keep res[1][0] (the text) of entries whose res[1][1]
+            # (presumably the recognition confidence) exceeds 0.1.
+            result = [res[1][0] for res in result[0] if res[1][1] > 0.1]
+            if len(result) > 0:
+                str_in_image = ', '.join(result)
+                print('OCR Token: ' + str_in_image)
+
+    if args.gen:
+        pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
+        ).to("cuda")
+
+    model_name = get_model_name_from_path(args.model_path)
+    tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)
+
+    # Infer the conversation template from the model name.
+    if '8x7b' in model_name.lower():
+        conv_mode = "mistral_instruct"
+    elif '34b' in model_name.lower():
+        conv_mode = "chatml_direct"
+    elif '2b' in model_name.lower():
+        conv_mode = "gemma"
+    else:
+        conv_mode = "vicuna_v1"
+
+    # An explicit --conv-mode wins over the inferred one (with a warning).
+    if args.conv_mode is not None and conv_mode != args.conv_mode:
+        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
+    else:
+        args.conv_mode = conv_mode
+
+    conv = conv_templates[args.conv_mode].copy()
+    if "mpt" in model_name.lower():
+        roles = ('user', 'assistant')
+    else:
+        roles = conv.roles
+
+    if args.image_file is not None:
+        # --image-file may hold several comma-separated paths or URLs.
+        images = []
+        if ',' in args.image_file:
+            images = args.image_file.split(',')
+        else:
+            images = [args.image_file]
+
+        image_convert = []
+        for _image in images:
+            image_convert.append(load_image(_image))
+
+        if hasattr(model.config, 'image_size_aux'):
+            # Remember the processor's original crop size once, then make the
+            # processor produce images at the auxiliary resolution.
+            if not hasattr(image_processor, 'image_size_raw'):
+                image_processor.image_size_raw = image_processor.crop_size.copy()
+            image_processor.crop_size['height'] = model.config.image_size_aux
+            image_processor.crop_size['width'] = model.config.image_size_aux
+            image_processor.size['shortest_edge'] = model.config.image_size_aux
+
+        # Similar operation in model_worker.py
+        image_tensor = process_images(image_convert, image_processor, model.config)
+
+        image_grid = getattr(model.config, 'image_grid', 1)
+        if hasattr(model.config, 'image_size_aux'):
+            # Keep the auxiliary-resolution tensor and derive the main input
+            # by resizing it to the raw size times the grid factor.
+            raw_shape = [image_processor.image_size_raw['height'] * image_grid,
+                        image_processor.image_size_raw['width'] * image_grid]
+            image_tensor_aux = image_tensor
+            image_tensor = torch.nn.functional.interpolate(image_tensor,
+                                                        size=raw_shape,
+                                                        mode='bilinear',
+                                                        align_corners=False)
+        else:
+            image_tensor_aux = []
+
+        if image_grid >= 2:
+            # Split the enlarged image into image_grid x image_grid tiles of
+            # the raw size.
+            raw_image = image_tensor.reshape(3,
+                                            image_grid,
+                                            image_processor.image_size_raw['height'],
+                                            image_grid,
+                                            image_processor.image_size_raw['width'])
+            raw_image = raw_image.permute(1, 3, 0, 2, 4)
+            raw_image = raw_image.reshape(-1, 3,
+                                        image_processor.image_size_raw['height'],
+                                        image_processor.image_size_raw['width'])
+
+            if getattr(model.config, 'image_global', False):
+                global_image = image_tensor
+                if len(global_image.shape) == 3:
+                    global_image = global_image[None]
+                global_image = torch.nn.functional.interpolate(global_image,
+                                                            size=[image_processor.image_size_raw['height'],
+                                                                    image_processor.image_size_raw['width']],
+                                                            mode='bilinear',
+                                                            align_corners=False)
+                # [image_crops, image_global]
+                raw_image = torch.cat([raw_image, global_image], dim=0)
+            image_tensor = raw_image.contiguous()
+            image_tensor = image_tensor.unsqueeze(0)
+
+        if type(image_tensor) is list:
+            image_tensor = [image.to(model.device, dtype=torch.float16) for image in image_tensor]
+            image_tensor_aux = [image.to(model.device, dtype=torch.float16) for image in image_tensor_aux]
+        else:
+            image_tensor = image_tensor.to(model.device, dtype=torch.float16)
+            # image_tensor_aux stays an empty list when the model has no
+            # image_size_aux; only a real tensor can be moved to the device.
+            if torch.is_tensor(image_tensor_aux):
+                image_tensor_aux = image_tensor_aux.to(model.device, dtype=torch.float16)
+    else:
+        images = None
+        image_tensor = None
+        image_tensor_aux = []
+
+    while True:
+        try:
+            inp = input(f"{roles[0]}: ")
+        except EOFError:
+            inp = ""
+        if not inp:
+            print("exit...")
+            break
+
+        print(f"{roles[1]}: ", end="")
+
+        if args.ocr and len(str_in_image) > 0:
+            inp = inp + '\nReference OCR Token: ' + str_in_image + '\n'
+        if args.gen:
+            inp = inp + ' <GEN>'
+        # print(inp, '====')
+
+        if images is not None:
+            # first message
+            if model.config.mm_use_im_start_end:
+                inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
+            else:
+                inp = (DEFAULT_IMAGE_TOKEN + '\n')*len(images) + inp
+            conv.append_message(conv.roles[0], inp)
+            # Image tokens are only inserted into the first user turn.
+            images = None
+        else:
+            # later messages
+            conv.append_message(conv.roles[0], inp)
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        # add image split string
+        if prompt.count(DEFAULT_IMAGE_TOKEN) >= 2:
+            final_str = ''
+            sent_split = prompt.split(DEFAULT_IMAGE_TOKEN)
+            for _idx, _sub_sent in enumerate(sent_split):
+                if _idx == len(sent_split) - 1:
+                    final_str = final_str + _sub_sent
+                else:
+                    final_str = final_str + _sub_sent + f'Image {_idx+1}:' + DEFAULT_IMAGE_TOKEN
+            prompt = final_str
+
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
+        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+        with torch.inference_mode():
+            output_ids = model.generate(
+                input_ids,
+                images=image_tensor,
+                images_aux=image_tensor_aux if len(image_tensor_aux)>0 else None,
+                do_sample=True if args.temperature > 0 else False,
+                temperature=args.temperature,
+                max_new_tokens=args.max_new_tokens,
+                bos_token_id=tokenizer.bos_token_id,  # Begin of sequence token
+                eos_token_id=tokenizer.eos_token_id,  # End of sequence token
+                pad_token_id=tokenizer.pad_token_id,  # Pad token
+                streamer=streamer,
+                use_cache=True)
+
+        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+        conv.messages[-1][-1] = outputs
+
+        if args.gen and '<h>' in outputs and '</h>' in outputs:
+            common_neg_prompt = "out of frame, lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
+            # Text of the last <h>...</h> span; kept in its own variable so the
+            # chat prompt printed by --debug is not overwritten.
+            gen_prompt = outputs.split("</h>")[-2].split("<h>")[-1]
+            output_img = pipe(gen_prompt, negative_prompt=common_neg_prompt).images[0]
+            output_img.save(args.output_file)
+            print(f'Generate an image, save at {args.output_file}')
+
+        if args.debug:
+            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the options, parse them, start the chat.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--model-path", default="facebook/opt-350m", type=str)
+    cli.add_argument("--model-base", default=None, type=str)
+    # file_0.jpg,file_1.jpg for multi image
+    cli.add_argument("--image-file", default=None, type=str)
+    cli.add_argument("--device", default="cuda", type=str)
+    cli.add_argument("--conv-mode", default=None, type=str)
+    cli.add_argument("--temperature", default=0.2, type=float)
+    cli.add_argument("--max-new-tokens", default=512, type=int)
+    # Boolean switches, all off unless given.
+    for switch in ("--load-8bit", "--load-4bit", "--ocr", "--gen"):
+        cli.add_argument(switch, action="store_true")
+    cli.add_argument("--output-file", default='generate.png', type=str)
+    cli.add_argument("--debug", action="store_true")
+    args = cli.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/minigemini/serve/controller.py b/minigemini/serve/controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d39b4dbd2cf2dc3bfc144e0aad96b458447f26c
--- /dev/null
+++ b/minigemini/serve/controller.py
@@ -0,0 +1,298 @@
+"""
+A controller manages distributed workers.
+It sends worker addresses to clients.
+"""
+import argparse
+import asyncio
+import dataclasses
+from enum import Enum, auto
+import json
+import logging
+import time
+from typing import List, Union
+import threading
+
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse
+import numpy as np
+import requests
+import uvicorn
+
+from minigemini.constants import CONTROLLER_HEART_BEAT_EXPIRATION
+from minigemini.utils import build_logger, server_error_msg
+
+
+# Module-wide logger, built with the name "controller" and the file name
+# "controller.log".
+logger = build_logger("controller", "controller.log")
+
+
+class DispatchMethod(Enum):
+    """Strategies the controller can use to pick a worker for a request."""
+
+    LOTTERY = auto()
+    SHORTEST_QUEUE = auto()
+
+    @classmethod
+    def from_str(cls, name):
+        """Map a command-line name to its enum member.
+
+        Args:
+            name: "lottery" or "shortest_queue".
+
+        Returns:
+            The matching ``DispatchMethod`` member.
+
+        Raises:
+            ValueError: if ``name`` is neither of the two; the message
+                includes the rejected value.
+        """
+        if name == "lottery":
+            return cls.LOTTERY
+        elif name == "shortest_queue":
+            return cls.SHORTEST_QUEUE
+        else:
+            raise ValueError(f"Invalid dispatch method: {name}")
+
+
+@dataclasses.dataclass
+class WorkerInfo:
+    """Per-worker record the controller keeps, keyed by worker name."""
+    # Names of the models the worker reported in its status.
+    model_names: List[str]
+    # Speed reported by the worker; used as a weight when dispatching.
+    speed: int
+    # Queue length from the worker's status or latest heart beat.
+    queue_length: int
+    # Whether the worker is subject to heart-beat expiration.
+    check_heart_beat: bool
+    # time.time() of registration or of the most recent heart beat.
+    last_heart_beat: float
+
+
+def heart_beat_controller(controller):
+    """Evict expired workers from ``controller`` forever; never returns.
+
+    Sleeps ``CONTROLLER_HEART_BEAT_EXPIRATION`` seconds before each sweep, so
+    it is intended to run on a thread of its own.
+    """
+    sweep = controller.remove_stable_workers_by_expiration
+    while True:
+        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
+        sweep()
+
+
+class Controller:
+    """Registry of model workers plus the logic that picks one per request.
+
+    Workers are registered by name and refreshed through heart beats; a
+    background thread evicts those whose heart beat has expired.
+    """
+
+    def __init__(self, dispatch_method: str):
+        """Create an empty registry and start the heart-beat sweeper thread.
+
+        Args:
+            dispatch_method: "lottery" or "shortest_queue".
+        """
+        # Dict[str -> WorkerInfo]
+        self.worker_info = {}
+        self.dispatch_method = DispatchMethod.from_str(dispatch_method)
+
+        # Daemon thread: the sweeper loops forever, so a non-daemon thread
+        # would keep the process alive after the server has stopped.
+        self.heart_beat_thread = threading.Thread(
+            target=heart_beat_controller, args=(self,), daemon=True)
+        self.heart_beat_thread.start()
+
+        logger.info("Init controller")
+
+    def register_worker(self, worker_name: str, check_heart_beat: bool,
+                        worker_status: dict):
+        """Add or overwrite the record for ``worker_name``.
+
+        When ``worker_status`` is empty the status is fetched from the worker
+        itself. Returns True on success, False when no status is available.
+        """
+        if worker_name not in self.worker_info:
+            logger.info(f"Register a new worker: {worker_name}")
+        else:
+            logger.info(f"Register an existing worker: {worker_name}")
+
+        if not worker_status:
+            worker_status = self.get_worker_status(worker_name)
+        if not worker_status:
+            return False
+
+        self.worker_info[worker_name] = WorkerInfo(
+            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
+            check_heart_beat, time.time())
+
+        logger.info(f"Register done: {worker_name}, {worker_status}")
+        return True
+
+    def get_worker_status(self, worker_name: str):
+        """POST to the worker's /worker_get_status and return the parsed JSON.
+
+        Returns None (after logging) on a request error or a non-200 reply.
+        """
+        try:
+            r = requests.post(worker_name + "/worker_get_status", timeout=5)
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Get status fails: {worker_name}, {e}")
+            return None
+
+        if r.status_code != 200:
+            logger.error(f"Get status fails: {worker_name}, {r}")
+            return None
+
+        return r.json()
+
+    def remove_worker(self, worker_name: str):
+        """Forget ``worker_name``; a worker that is already gone is ignored."""
+        # pop() rather than del: the registry can be replaced or modified by
+        # a request handler between the sweeper's scan and this removal.
+        self.worker_info.pop(worker_name, None)
+
+    def refresh_all_workers(self):
+        """Re-register every known worker, dropping those that do not answer."""
+        old_info = dict(self.worker_info)
+        self.worker_info = {}
+
+        for w_name, w_info in old_info.items():
+            if not self.register_worker(w_name, w_info.check_heart_beat, None):
+                logger.info(f"Remove stale worker: {w_name}")
+
+    def list_models(self):
+        """Return the de-duplicated model names served by registered workers."""
+        model_names = set()
+
+        for w_info in self.worker_info.values():
+            model_names.update(w_info.model_names)
+
+        return list(model_names)
+
+    def get_worker_address(self, model_name: str):
+        """Pick a worker serving ``model_name`` and return its address.
+
+        LOTTERY draws a worker at random, weighted by speed. SHORTEST_QUEUE
+        takes the worker with the smallest queue_length / speed and bumps its
+        queue length by one. Returns "" when no suitable worker exists.
+
+        Raises:
+            ValueError: if the configured dispatch method is unknown.
+        """
+        if self.dispatch_method == DispatchMethod.LOTTERY:
+            worker_names = []
+            worker_speeds = []
+            for w_name, w_info in self.worker_info.items():
+                if model_name in w_info.model_names:
+                    worker_names.append(w_name)
+                    worker_speeds.append(w_info.speed)
+            worker_speeds = np.array(worker_speeds, dtype=np.float32)
+            norm = np.sum(worker_speeds)
+            if norm < 1e-4:
+                return ""
+            worker_speeds = worker_speeds / norm
+            # The address is returned directly, without probing the worker.
+            pt = np.random.choice(np.arange(len(worker_names)),
+                p=worker_speeds)
+            return worker_names[pt]
+        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
+            worker_names = []
+            worker_qlen = []
+            for w_name, w_info in self.worker_info.items():
+                if model_name in w_info.model_names:
+                    worker_names.append(w_name)
+                    worker_qlen.append(w_info.queue_length / w_info.speed)
+            if len(worker_names) == 0:
+                return ""
+            min_index = np.argmin(worker_qlen)
+            w_name = worker_names[min_index]
+            # Count the request about to be sent so the next pick sees it.
+            self.worker_info[w_name].queue_length += 1
+            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
+            return w_name
+        else:
+            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")
+
+    def receive_heart_beat(self, worker_name: str, queue_length: int):
+        """Record a heart beat; return False when the worker is not registered."""
+        if worker_name not in self.worker_info:
+            logger.info(f"Receive unknown heart beat. {worker_name}")
+            return False
+
+        self.worker_info[worker_name].queue_length = queue_length
+        self.worker_info[worker_name].last_heart_beat = time.time()
+        logger.info(f"Receive heart beat. {worker_name}")
+        return True
+
+    def remove_stable_workers_by_expiration(self):
+        """Remove heart-beat-checked workers silent for longer than the limit."""
+        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
+        # Iterate over a snapshot: this runs on the sweeper thread while
+        # request handlers may add workers to the registry.
+        to_delete = [
+            worker_name
+            for worker_name, w_info in list(self.worker_info.items())
+            if w_info.check_heart_beat and w_info.last_heart_beat < expire
+        ]
+
+        for worker_name in to_delete:
+            self.remove_worker(worker_name)
+
+    def worker_api_generate_stream(self, params):
+        """Relay a streaming generation request to a worker for params["model"].
+
+        Yields the worker's chunks, each terminated by a NUL byte. When no
+        worker is available (error_code 2) or the request to the worker fails
+        (error_code 3), a single JSON error chunk is yielded instead.
+        """
+        worker_addr = self.get_worker_address(params["model"])
+        if not worker_addr:
+            logger.info(f"no worker: {params['model']}")
+            ret = {
+                "text": server_error_msg,
+                "error_code": 2,
+            }
+            yield json.dumps(ret).encode() + b"\0"
+            # Stop here: posting to an empty address would only fail and
+            # emit a second, misleading error chunk.
+            return
+
+        try:
+            response = requests.post(worker_addr + "/worker_generate_stream",
+                json=params, stream=True, timeout=5)
+            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
+                if chunk:
+                    yield chunk + b"\0"
+        except requests.exceptions.RequestException:
+            logger.info(f"worker timeout: {worker_addr}")
+            ret = {
+                "text": server_error_msg,
+                "error_code": 3,
+            }
+            yield json.dumps(ret).encode() + b"\0"
+
+
+    # Let the controller act as a worker to achieve hierarchical
+    # management. This can be used to connect isolated sub networks.
+    def worker_api_get_status(self):
+        """Return the combined status of all workers that answer a status query.
+
+        Model names are merged; speed and queue length are summed.
+        """
+        model_names = set()
+        speed = 0
+        queue_length = 0
+
+        # Snapshot the names: each query is a network call during which the
+        # registry may change.
+        for w_name in list(self.worker_info):
+            worker_status = self.get_worker_status(w_name)
+            if worker_status is not None:
+                model_names.update(worker_status["model_names"])
+                speed += worker_status["speed"]
+                queue_length += worker_status["queue_length"]
+
+        return {
+            "model_names": list(model_names),
+            "speed": speed,
+            "queue_length": queue_length,
+        }
+
+
+# FastAPI application on which the route handlers below are registered.
+app = FastAPI()
+
+
+@app.post("/register_worker")
+async def register_worker(request: Request):
+    """Register the worker described by the request's JSON body.
+
+    Reads "worker_name", "check_heart_beat" and the optional "worker_status".
+    """
+    payload = await request.json()
+    name = payload["worker_name"]
+    check = payload["check_heart_beat"]
+    status = payload.get("worker_status", None)
+    controller.register_worker(name, check, status)
+
+
+@app.post("/refresh_all_workers")
+async def refresh_all_workers():
+    """Make the controller re-register every worker it knows about."""
+    # Controller.refresh_all_workers returns nothing, so no result is kept.
+    controller.refresh_all_workers()
+
+
+@app.post("/list_models")
+async def list_models():
+    """Return the controller's model names under the "models" key."""
+    return {"models": controller.list_models()}
+
+
+@app.post("/get_worker_address")
+async def get_worker_address(request: Request):
+    """Return, under "address", a worker for the body's "model" field."""
+    payload = await request.json()
+    return {"address": controller.get_worker_address(payload["model"])}
+
+
+@app.post("/receive_heart_beat")
+async def receive_heart_beat(request: Request):
+    """Record a worker heart beat; "exist" tells whether the worker is known."""
+    payload = await request.json()
+    known = controller.receive_heart_beat(
+        payload["worker_name"], payload["queue_length"])
+    return {"exist": known}
+
+
+@app.post("/worker_generate_stream")
+async def worker_api_generate_stream(request: Request):
+    """Stream the controller's relayed generation output for the JSON body."""
+    body = await request.json()
+    return StreamingResponse(controller.worker_api_generate_stream(body))
+
+
+@app.post("/worker_get_status")
+async def worker_api_get_status(request: Request):
+    """Return the controller's aggregated worker status; the body is unused."""
+    status = controller.worker_api_get_status()
+    return status
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the bind address and dispatch policy, build the
+    # module-level ``controller`` the route handlers use, then serve the app.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--host", default="localhost", type=str)
+    cli.add_argument("--port", default=21001, type=int)
+    cli.add_argument(
+        "--dispatch-method",
+        default="shortest_queue",
+        choices=["lottery", "shortest_queue"],
+        type=str,
+    )
+    args = cli.parse_args()
+    logger.info(f"args: {args}")
+
+    controller = Controller(args.dispatch_method)
+    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
diff --git a/minigemini/serve/examples/extreme_ironing.jpg b/minigemini/serve/examples/extreme_ironing.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..cf1071a1fbfa904309335e3521cecbcec341b37f
--- /dev/null
+++ b/minigemini/serve/examples/extreme_ironing.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a54caa21bc513ed25c8ca7f5747555c05dfd4e33f6a3cf5c08b3d9138a4da1d9
+size 62587
diff --git a/minigemini/serve/examples/monday.jpg b/minigemini/serve/examples/monday.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dea47cdf51294b26d684d53e07290ba0bddc8db1
--- /dev/null
+++ b/minigemini/serve/examples/monday.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f516b74860919074ea7bd855c2073a565283cf0f888139841f60512655996066
+size 7143
diff --git a/minigemini/serve/examples/waterview.jpg b/minigemini/serve/examples/waterview.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5ea03ee6fa60f4025999012b817e674984c706cd
--- /dev/null
+++ b/minigemini/serve/examples/waterview.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d092764cc9f21b9bc535ff5284b5add4d8256148bab1bc2f5b5ab3fd32759a36
+size 95499
diff --git a/minigemini/serve/examples/woolen.png b/minigemini/serve/examples/woolen.png
new file mode 100644
index 0000000000000000000000000000000000000000..38483ba00bfb56e85509c45fce80d601b5686ae6
--- /dev/null
+++ b/minigemini/serve/examples/woolen.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb303bfbac3cdb104972daf87e4d0515d08f370897d361f63a1407f302f98a9f
+size 1410296
diff --git a/minigemini/serve/gradio_web_server.py b/minigemini/serve/gradio_web_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..c57794ba0c1f39c781d8efb2231df95cac5a8153
--- /dev/null
+++ b/minigemini/serve/gradio_web_server.py
@@ -0,0 +1,486 @@
+import argparse
+import datetime
+import json
+import os
+import time
+
+import gradio as gr
+import requests
+
+from minigemini.conversation import (default_conversation, conv_templates,
+                                        SeparatorStyle)
+from minigemini.constants import LOGDIR
+from minigemini.utils import (build_logger, server_error_msg,
+    violates_moderation, moderation_msg)
+import hashlib
+
+
+# Module-wide logger, built with the name "gradio_web_server" and the file
+# name "gradio_web_server.log".
+logger = build_logger("gradio_web_server", "gradio_web_server.log")
+
+# HTTP headers sent with the streaming request to the worker in http_bot.
+headers = {"User-Agent": "Mini-Gemini Client"}
+
+# Shared button objects that the event handlers below return as outputs.
+no_change_btn = gr.Button()
+enable_btn = gr.Button(interactive=True)
+disable_btn = gr.Button(interactive=False)
+
+# Sort keys used by get_model_list: a listed model sorts by the mapped string,
+# any other model by its own name.
+priority = {
+    "vicuna-13b": "aaaaaaa",
+    "koala-13b": "aaaaaab",
+}
+
+
+def get_conv_log_filename():
+    """Return the path of today's conversation log inside ``LOGDIR``.
+
+    The file is named ``<year>-<month>-<day>-conv.json`` with month and day
+    zero-padded to two digits, taken from ``datetime.datetime.now()``.
+    """
+    today = datetime.datetime.now()
+    basename = "{}-{:02d}-{:02d}-conv.json".format(today.year, today.month, today.day)
+    return os.path.join(LOGDIR, basename)
+
+
+def get_model_list():
+    """Ask the controller to refresh its workers, then return the model names.
+
+    The refresh call must answer with HTTP 200 (checked with ``assert``). The
+    names are sorted by their ``priority`` entry, falling back to the name.
+    """
+    base_url = args.controller_url
+    refresh = requests.post(base_url + "/refresh_all_workers")
+    assert refresh.status_code == 200
+    listing = requests.post(base_url + "/list_models")
+    models = sorted(listing.json()["models"], key=lambda name: priority.get(name, name))
+    logger.info(f"Models: {models}")
+    return models
+
+
+# JavaScript snippet that returns the page URL's query parameters as an object
+# and logs them to the browser console.
+get_window_url_params = """
+function() {
+    const params = new URLSearchParams(window.location.search);
+    url_params = Object.fromEntries(params);
+    console.log(url_params);
+    return url_params;
+    }
+"""
+
+
+def load_demo(url_params, request: gr.Request):
+    """Build the initial page state, honouring a ``model`` URL parameter.
+
+    Returns a fresh copy of the default conversation and a visible dropdown;
+    the dropdown is preselected when ``url_params["model"]`` names an entry of
+    the module-level ``models``.
+    """
+    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")
+
+    dropdown_kwargs = {"visible": True}
+    if "model" in url_params and url_params["model"] in models:
+        dropdown_kwargs["value"] = url_params["model"]
+    dropdown_update = gr.Dropdown(**dropdown_kwargs)
+
+    return default_conversation.copy(), dropdown_update
+
+
+def load_demo_refresh_model_list(request: gr.Request):
+    """Build the initial page state from a freshly fetched model list.
+
+    Returns a fresh copy of the default conversation and a dropdown offering
+    the fetched models, preselecting the first one ("" when there are none).
+    """
+    logger.info(f"load_demo. ip: {request.client.host}")
+    available = get_model_list()
+    fresh_state = default_conversation.copy()
+    first_choice = available[0] if len(available) > 0 else ""
+    return fresh_state, gr.Dropdown(choices=available, value=first_choice)
+
+
+def vote_last_response(state, vote_type, model_selector, request: gr.Request):
+    """Append one JSON line describing a vote to today's conversation log."""
+    with open(get_conv_log_filename(), "a") as fout:
+        record = dict(
+            tstamp=round(time.time(), 4),
+            type=vote_type,
+            model=model_selector,
+            state=state.dict(),
+            ip=request.client.host,
+        )
+        fout.write(f"{json.dumps(record)}\n")
+
+
+def upvote_last_response(state, model_selector, request: gr.Request):
+    """Log an "upvote"; return an empty string and three disabled buttons."""
+    logger.info(f"upvote. ip: {request.client.host}")
+    vote_last_response(state, "upvote", model_selector, request)
+    return "", disable_btn, disable_btn, disable_btn
+
+
+def downvote_last_response(state, model_selector, request: gr.Request):
+    """Log a "downvote"; return an empty string and three disabled buttons."""
+    logger.info(f"downvote. ip: {request.client.host}")
+    vote_last_response(state, "downvote", model_selector, request)
+    return "", disable_btn, disable_btn, disable_btn
+
+
+def flag_last_response(state, model_selector, request: gr.Request):
+    """Log a "flag"; return an empty string and three disabled buttons."""
+    logger.info(f"flag. ip: {request.client.host}")
+    vote_last_response(state, "flag", model_selector, request)
+    return "", disable_btn, disable_btn, disable_btn
+
+
+def regenerate(state, image_process_mode, request: gr.Request):
+    """Clear the last reply so it can be generated again.
+
+    When the preceding message's content is a tuple or list, its first two
+    items are kept and the third is replaced by ``image_process_mode``.
+    """
+    logger.info(f"regenerate. ip: {request.client.host}")
+    state.messages[-1][-1] = None
+    human_turn = state.messages[-2]
+    content = human_turn[1]
+    if type(content) is tuple or type(content) is list:
+        human_turn[1] = tuple(content[:2]) + (image_process_mode,)
+    state.skip_next = False
+    buttons = (disable_btn,) * 5
+    return (state, state.to_gradio_chatbot(), "", None) + buttons
+
+
+def clear_history(request: gr.Request):
+    """Start over with a fresh copy of the default conversation.
+
+    Returns the new state, its chatbot view, an empty string, ``None`` and
+    five disabled buttons.
+    """
+    logger.info(f"clear_history. ip: {request.client.host}")
+    fresh = default_conversation.copy()
+    buttons = (disable_btn,) * 5
+    return (fresh, fresh.to_gradio_chatbot(), "", None) + buttons
+
+
+def add_text(state, text, image, image_process_mode, request: gr.Request):
+    """Append the user's turn (text, optionally with an image) to the state.
+
+    Empty input and, when ``args.moderate`` is set, text flagged by
+    ``violates_moderation`` mark the state with ``skip_next`` and leave the
+    buttons unchanged. Otherwise the text is truncated (1536 characters, or
+    1200 with an image), an ``<image>`` tag is appended if missing, and a user
+    message plus an empty reply slot are added. A new image replaces a
+    conversation that already contains images with a fresh one.
+    """
+    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
+
+    nothing_to_send = len(text) <= 0 and image is None
+    if nothing_to_send:
+        state.skip_next = True
+        return (state, state.to_gradio_chatbot(), "", None) + (no_change_btn,) * 5
+
+    if args.moderate and violates_moderation(text):
+        state.skip_next = True
+        return (state, state.to_gradio_chatbot(), moderation_msg, None) + (
+            no_change_btn,) * 5
+
+    # Hard cut-off; tighter when an image accompanies the text.
+    limit = 1536 if image is None else 1200
+    text = text[:limit]
+    if image is not None:
+        if '<image>' not in text:
+            # text = '<Image><image></Image>' + text
+            text = text + '\n<image>'
+        text = (text, image, image_process_mode)
+        already_has_images = len(state.get_images(return_pil=True)) > 0
+        if already_has_images:
+            state = default_conversation.copy()
+
+    user_role, bot_role = state.roles[0], state.roles[1]
+    state.append_message(user_role, text)
+    state.append_message(bot_role, None)
+    state.skip_next = False
+    return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
+
+
+def http_bot(state, model_selector, temperature, top_p, max_new_tokens, gen_image, use_ocr, request: gr.Request):
+    logger.info(f"http_bot. ip: {request.client.host}")
+    start_tstamp = time.time()
+    model_name = model_selector
+
+    if state.skip_next:
+        # This generate call is skipped due to invalid inputs
+        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
+        return
+
+    if len(state.messages) == state.offset + 2:
+        # First round of conversation
+        if "mini-gemini" in model_name.lower():
+            if '8x7b' in model_name.lower():
+                template_name = "mistral_instruct"
+            elif '34b' in model_name.lower():
+                template_name = "chatml_direct"
+            elif '2b' in model_name.lower():
+                template_name = "gemma"
+            else:
+                template_name = "vicuna_v1"
+        else:
+            template_name = "vicuna_v1"
+
+        new_state = conv_templates[template_name].copy()
+        new_state.append_message(new_state.roles[0], state.messages[-2][1])
+        new_state.append_message(new_state.roles[1], None)
+        state = new_state
+
+    # Query worker address
+    controller_url = args.controller_url
+    ret = requests.post(controller_url + "/get_worker_address",
+            json={"model": model_name})
+    worker_addr = ret.json()["address"]
+    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")
+
+    # No available worker
+    if worker_addr == "":
+        state.messages[-1][-1] = server_error_msg
+        yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+        return
+
+    # Construct prompt
+    prompt = state.get_prompt()
+
+    all_images = state.get_images(return_pil=True)
+    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
+    for image, hash in zip(all_images, all_image_hash):
+        t = datetime.datetime.now()
+        filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
+        if not os.path.isfile(filename):
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+            image.save(filename)
+    
+    # Generate Image
+    if 'generate' in prompt.lower():
+        gen_image = 'Yes'
+    elif 'show me one idea of what i could make with this?' in prompt.lower() and len(all_images) == 1:
+        h, w = all_images[0].size
+        if h == 922 and w == 672:
+            gen_image = 'Yes'
+
+    # Make requests
+    pload = {
+        "model": model_name,
+        "prompt": prompt,
+        "temperature": float(temperature),
+        "top_p": float(top_p),
+        "max_new_tokens": min(int(max_new_tokens), 1536),
+        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
+        "images": f'List of {len(state.get_images())} images: {all_image_hash}',
+        "gen_image": bool(gen_image == 'Yes'),
+        "use_ocr": bool(use_ocr == 'Yes'),
+    }
+    logger.info(f"==== request ====\n{pload}")
+
+    pload['images'] = state.get_images()
+
+    state.messages[-1][-1] = "▌"
+    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+
+    try:
+        # Stream output
+        response = requests.post(worker_addr + "/worker_generate_stream",
+            headers=headers, json=pload, stream=True, timeout=30)
+        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
+            if chunk:
+                data = json.loads(chunk.decode())
+                if data["error_code"] == 0:
+                    if 'image' not in data.keys():
+                        output = data["text"][len(prompt):].strip()
+                        state.messages[-1][-1] = output + "▌"
+                    else:
+                        output = (data["text"][len(prompt):].strip(), data["image"])
+                        state.messages[-1][-1] = output
+                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
+                else:
+                    output = data["text"] + f" (error_code: {data['error_code']})"
+                    state.messages[-1][-1] = output
+                    yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+                    return
+                time.sleep(0.03)
+    except requests.exceptions.RequestException as e:
+        state.messages[-1][-1] = server_error_msg
+        yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
+        return
+
+    if type(state.messages[-1][-1]) is not tuple:
+        state.messages[-1][-1] = state.messages[-1][-1][:-1]
+    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
+
+    finish_tstamp = time.time()
+    logger.info(f"{output}")
+
+    with open(get_conv_log_filename(), "a") as fout:
+        data = {
+            "tstamp": round(finish_tstamp, 4),
+            "type": "chat",
+            "model": model_name,
+            "start": round(start_tstamp, 4),
+            "finish": round(finish_tstamp, 4),
+            "state": state.dict(),
+            "images": all_image_hash,
+            "ip": request.client.host,
+        }
+        fout.write(json.dumps(data) + "\n")
+
+# Page header: project title and links. build_demo() renders it only when
+# embed_mode is false.
+# NOTE(review): the 34B-HD link below points at a 10.x.x.x address, which is in
+# a private range and will not be reachable for outside visitors — confirm
+# whether a public URL was intended.
+title_markdown = ("""
+# Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models
+[[Project Page]](https://mini-gemini.github.io/) [[Paper]](https://arxiv.org/abs/2403.18814) [[Code]](https://github.com/dvlab-research/MiniGemini) [[Model]](https://huggingface.co/collections/YanweiLi/mini-gemini-6603c50b9b43d044171d0854) [[Data]](https://huggingface.co/collections/YanweiLi/mini-gemini-data-660463ea895a01d8f367624e) <br>
+This is Mini-Gemini-13B-HD version. The Mini-Gemini-34B-HD is deployed on [[here]](http://10.81.134.110:7860/)
+""")
+
+# Short hint about the "Generate Image" toggle; rendered below the chat area
+# when embed_mode is false.
+function_markdown = ("""
+### Function
+If you want to generate an image in conversation, please turn on 'Generate Image'. <br>
+""")
+
+# Terms-of-use footer; rendered when embed_mode is false.
+tos_markdown = ("""
+### Terms of use
+By using this service, users are required to agree to the following terms:
+The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
+Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
+For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
+""")
+
+
+# License footer; rendered when embed_mode is false.
+learn_more_markdown = ("""
+### License
+The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
+""")
+
+# Extra CSS handed to gr.Blocks(css=...); it sizes the buttons inside the row
+# whose elem_id is "buttons".
+block_css = """
+
+#buttons button {
+    min-width: min(120px,100%);
+}
+
+"""
+
+def build_demo(embed_mode, cur_dir=None, concurrency_count=10):
+    """Build the Gradio Blocks app for the Mini-Gemini chat demo.
+
+    Reads the module-level ``models`` list and ``args`` namespace, so both
+    must be set before this function is called.
+
+    Args:
+        embed_mode: When truthy, the title, function, terms-of-use and
+            license markdown sections are not rendered.
+        cur_dir: Directory that contains the ``examples/`` images. Defaults
+            to the directory of this file.
+        concurrency_count: Passed as ``concurrency_limit`` to every
+            ``http_bot`` event.
+
+    Returns:
+        The assembled ``gr.Blocks`` instance (not yet queued or launched).
+
+    Raises:
+        ValueError: If ``args.model_list_mode`` is neither "once" nor
+            "reload".
+    """
+    # Created before the Blocks context so gr.Examples can reference it; it
+    # is placed into the layout later with textbox.render().
+    textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
+    with gr.Blocks(title="Mini-Gemini", theme=gr.themes.Default(), css=block_css) as demo:
+        state = gr.State()
+
+        if not embed_mode:
+            gr.Markdown(title_markdown)
+
+        with gr.Row():
+            # Left column: model choice, image input, examples and options.
+            with gr.Column(scale=3):
+                with gr.Row(elem_id="model_selector_row"):
+                    model_selector = gr.Dropdown(
+                        choices=models,
+                        value=models[0] if len(models) > 0 else "",
+                        interactive=True,
+                        show_label=False,
+                        container=False)
+
+                imagebox = gr.Image(type="pil")
+                image_process_mode = gr.Radio(
+                    ["Crop", "Resize", "Pad", "Default"],
+                    value="Default",
+                    label="Preprocess for non-square image", visible=False)
+
+                if cur_dir is None:
+                    cur_dir = os.path.dirname(os.path.abspath(__file__))
+                gr.Examples(examples=[
+                    [f"{cur_dir}/examples/monday.jpg", "Explain why this meme is funny, and generate a picture when the weekend coming."],
+                    [f"{cur_dir}/examples/woolen.png", "Show me one idea of what I could make with this?"],
+                    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
+                    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
+                ], inputs=[imagebox, textbox])
+
+                with gr.Accordion("Function", open=True):
+                    gen_image = gr.Radio(choices=['Yes', 'No'], value='No', interactive=True, label="Generate Image")
+                    use_ocr = gr.Radio(choices=['Yes', 'No'], value='Yes', interactive=True, label="Use OCR")
+
+                with gr.Accordion("Parameters", open=False):
+                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
+                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
+                    max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)
+
+            # Right column: conversation, text entry and feedback buttons.
+            with gr.Column(scale=7):
+                chatbot = gr.Chatbot(
+                    elem_id="chatbot",
+                    label="Mini-Gemini Chatbot",
+                    height=940,
+                    layout="panel",
+                )
+                with gr.Row():
+                    with gr.Column(scale=7):
+                        textbox.render()
+                    with gr.Column(scale=1, min_width=50):
+                        submit_btn = gr.Button(value="Send", variant="primary")
+                with gr.Row(elem_id="buttons"):
+                    upvote_btn = gr.Button(value="👍  Upvote", interactive=False)
+                    downvote_btn = gr.Button(value="👎  Downvote", interactive=False)
+                    flag_btn = gr.Button(value="⚠️  Flag", interactive=False)
+                    regenerate_btn = gr.Button(value="🔄  Regenerate", interactive=False)
+                    clear_btn = gr.Button(value="🗑️  Clear", interactive=False)
+
+        if not embed_mode:
+            gr.Markdown(function_markdown)
+            gr.Markdown(tos_markdown)
+            gr.Markdown(learn_more_markdown)
+        url_params = gr.JSON(visible=False)
+
+        # Register listeners. The component lists are shared by several
+        # events, so they are named once here.
+        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
+        vote_inputs = [state, model_selector]
+        vote_outputs = [textbox, upvote_btn, downvote_btn, flag_btn]
+        reset_outputs = [state, chatbot, textbox, imagebox] + btn_list
+        add_text_inputs = [state, textbox, imagebox, image_process_mode]
+        bot_inputs = [state, model_selector, temperature, top_p, max_output_tokens, gen_image, use_ocr]
+        bot_outputs = [state, chatbot] + btn_list
+
+        for button, handler in (
+            (upvote_btn, upvote_last_response),
+            (downvote_btn, downvote_last_response),
+            (flag_btn, flag_last_response),
+        ):
+            button.click(handler, vote_inputs, vote_outputs)
+
+        regenerate_btn.click(
+            regenerate,
+            [state, image_process_mode],
+            reset_outputs
+        ).then(
+            http_bot,
+            bot_inputs,
+            bot_outputs,
+            concurrency_limit=concurrency_count
+        )
+
+        clear_btn.click(
+            clear_history,
+            None,
+            reset_outputs,
+            queue=False
+        )
+
+        textbox.submit(
+            add_text,
+            add_text_inputs,
+            reset_outputs,
+            queue=False
+        ).then(
+            http_bot,
+            bot_inputs,
+            bot_outputs,
+            concurrency_limit=concurrency_count
+        )
+
+        submit_btn.click(
+            add_text,
+            add_text_inputs,
+            reset_outputs
+        ).then(
+            http_bot,
+            bot_inputs,
+            bot_outputs,
+            concurrency_limit=concurrency_count
+        )
+
+        if args.model_list_mode == "once":
+            # The keyword is `js`, not `_js`: this function already relies on
+            # `concurrency_limit`, which only exists in Gradio 4, and Gradio 4
+            # renamed `_js` to `js`. The old spelling raises TypeError there.
+            demo.load(
+                load_demo,
+                [url_params],
+                [state, model_selector],
+                js=get_window_url_params
+            )
+        elif args.model_list_mode == "reload":
+            demo.load(
+                load_demo_refresh_model_list,
+                None,
+                [state, model_selector],
+                queue=False
+            )
+        else:
+            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")
+
+    return demo
+
+
+if __name__ == "__main__":
+    # Command-line options of the web server.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="0.0.0.0", type=str)
+    parser.add_argument("--port", type=int)
+    parser.add_argument("--controller-url", default="http://localhost:21001", type=str)
+    parser.add_argument("--concurrency-count", default=16, type=int)
+    parser.add_argument("--model-list-mode", default="once", type=str,
+        choices=["once", "reload"])
+    for switch in ("--share", "--moderate", "--embed"):
+        parser.add_argument(switch, action="store_true")
+    args = parser.parse_args()
+    logger.info(f"args: {args}")
+
+    # `args` and `models` stay module-level names: build_demo() reads both as
+    # globals.
+    models = get_model_list()
+
+    logger.info(args)
+    demo = build_demo(args.embed, concurrency_count=args.concurrency_count)
+    queued_demo = demo.queue(api_open=False)
+    queued_demo.launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+    )
\ No newline at end of file
diff --git a/minigemini/serve/model_worker.py b/minigemini/serve/model_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f73c5dca767ef25b773609fad6a558ba95831dfe
--- /dev/null
+++ b/minigemini/serve/model_worker.py
@@ -0,0 +1,399 @@
+"""
+A model worker executes the model.
+"""
+import argparse
+import asyncio
+import json
+import time
+import threading
+import uuid
+
+from fastapi import FastAPI, Request, BackgroundTasks
+from fastapi.responses import StreamingResponse
+import requests
+import torch
+import uvicorn
+from functools import partial
+
+from minigemini.constants import WORKER_HEART_BEAT_INTERVAL
+from minigemini.utils import (build_logger, server_error_msg,
+    pretty_print_semaphore)
+from minigemini.model.builder import load_pretrained_model
+from minigemini.mm_utils import process_images, load_image_from_base64, tokenizer_image_token
+from minigemini.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from transformers import TextIteratorStreamer
+from threading import Thread
+try:
+    from diffusers import StableDiffusionXLPipeline
+except:
+    print('please install diffusers==0.26.3')
+
+try:
+    from paddleocr import PaddleOCR
+except:
+    print('please install paddleocr following https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/README_en.md')
+
+import io
+import base64
+
+# 2**30, i.e. the number of bytes in one GiB. Not referenced elsewhere in
+# this module.
+GB = 1 << 30
+
+# Short random id (first 6 characters of a UUID4); it names the log file and
+# is passed to ModelWorker in the __main__ block.
+worker_id = str(uuid.uuid4())[:6]
+logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
+# Number of /worker_generate_stream requests received so far; reported in the
+# heart-beat log line.
+global_counter = 0
+
+# asyncio.Semaphore limiting concurrent generations. It stays None until the
+# first /worker_generate_stream request creates it.
+model_semaphore = None
+
+
+def heart_beat_worker(controller):
+    """Call ``controller.send_heart_beat()`` once per heart-beat interval.
+
+    The loop sleeps first, so the first beat is sent one interval after the
+    function starts. It only ends if ``send_heart_beat`` raises.
+    """
+    pause = WORKER_HEART_BEAT_INTERVAL
+    while True:
+        time.sleep(pause)
+        controller.send_heart_beat()
+
+
+class ModelWorker:
+    """Worker that hosts one model and streams generations for the controller.
+
+    On construction it loads the language model, a PaddleOCR model and a
+    Stable Diffusion XL pipeline, then (unless ``no_register`` is set)
+    registers itself with the controller and starts a heart-beat thread.
+    """
+
+    def __init__(self, controller_addr, worker_addr,
+                 worker_id, no_register,
+                 model_path, model_base, model_name,
+                 load_8bit, load_4bit, device, use_flash_attn=False):
+        """Load all models and optionally register with the controller.
+
+        Args:
+            controller_addr: Base URL of the controller.
+            worker_addr: URL this worker reports as its ``worker_name``.
+            worker_id: Identifier used in log messages.
+            no_register: If truthy, skip registration and the heart-beat
+                thread.
+            model_path: Model path; when ``model_name`` is None the name is
+                derived from its last component (or the last two for a
+                ``checkpoint-*`` directory).
+            model_base: Forwarded to ``load_pretrained_model``.
+            model_name: Explicit model name, or None to derive it.
+            load_8bit: Forwarded to ``load_pretrained_model``.
+            load_4bit: Forwarded to ``load_pretrained_model``.
+            device: Device passed to ``load_pretrained_model``; input ids are
+                moved to it before generation.
+            use_flash_attn: Forwarded to ``load_pretrained_model``.
+        """
+        self.controller_addr = controller_addr
+        self.worker_addr = worker_addr
+        self.worker_id = worker_id
+        if model_path.endswith("/"):
+            model_path = model_path[:-1]
+        if model_name is None:
+            model_paths = model_path.split("/")
+            if model_paths[-1].startswith('checkpoint-'):
+                self.model_name = model_paths[-2] + "_" + model_paths[-1]
+            else:
+                self.model_name = model_paths[-1]
+        else:
+            self.model_name = model_name
+
+        self.device = device
+        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
+        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
+            model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device, use_flash_attn=use_flash_attn)
+        self.is_multimodal = True
+
+        if hasattr(self.model.config, 'image_size_aux'):
+            # Remember the processor's original crop size, then make the
+            # processor emit images at the auxiliary resolution.
+            if not hasattr(self.image_processor, 'image_size_raw'):
+                self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
+            self.image_processor.crop_size['height'] = self.model.config.image_size_aux
+            self.image_processor.crop_size['width'] = self.model.config.image_size_aux
+            self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
+
+        # ocr model
+        self.ocr_model = PaddleOCR(use_angle_cls=True, use_gpu=True, lang="ch")
+
+        # diffusion model, placed on the highest-numbered CUDA device
+        max_gpu_index = torch.cuda.device_count() - 1
+        device_last = torch.device(f'cuda:{max_gpu_index}')
+        print(torch.cuda.device_count(), '++++++', device_last)
+        self.pipe = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
+        ).to(device=device_last)
+
+        if not no_register:
+            self.register_to_controller()
+            # Daemon thread: heart_beat_worker loops forever, so a non-daemon
+            # thread would keep the process alive after the server stops.
+            self.heart_beat_thread = threading.Thread(
+                target=heart_beat_worker, args=(self,), daemon=True)
+            self.heart_beat_thread.start()
+
+    def register_to_controller(self):
+        """POST this worker's address and status to the controller.
+
+        Raises:
+            RuntimeError: If the controller does not answer with HTTP 200.
+        """
+        logger.info("Register to controller")
+
+        url = self.controller_addr + "/register_worker"
+        data = {
+            "worker_name": self.worker_addr,
+            "check_heart_beat": True,
+            "worker_status": self.get_status()
+        }
+        r = requests.post(url, json=data)
+        # Explicit check instead of `assert`, which is stripped under
+        # `python -O`.
+        if r.status_code != 200:
+            raise RuntimeError(
+                f"Controller registration failed with HTTP status {r.status_code}")
+
+    def send_heart_beat(self):
+        """Report the queue length to the controller, retrying until it answers.
+
+        Re-registers the worker when the controller replies that it does not
+        know this worker.
+        """
+        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
+                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
+                    f"global_counter: {global_counter}")
+
+        url = self.controller_addr + "/receive_heart_beat"
+
+        while True:
+            try:
+                ret = requests.post(url, json={
+                    "worker_name": self.worker_addr,
+                    "queue_length": self.get_queue_length()}, timeout=30)
+                exist = ret.json()["exist"]
+                break
+            except (requests.exceptions.RequestException, KeyError) as e:
+                # KeyError covers a reply without "exist"; it is retried like
+                # a network error instead of ending the heart-beat thread.
+                logger.error(f"heart beat error: {e}")
+            time.sleep(5)
+
+        if not exist:
+            self.register_to_controller()
+
+    def get_queue_length(self):
+        """Return the number of requests running or waiting on the semaphore.
+
+        Reads the module-level ``args`` and ``model_semaphore``; returns 0
+        until the semaphore has been created.
+        """
+        if model_semaphore is None:
+            return 0
+        waiters = model_semaphore._waiters
+        return (args.limit_model_concurrency - model_semaphore._value
+                + (len(waiters) if waiters is not None else 0))
+
+    def get_status(self):
+        """Return the status dict sent to the controller."""
+        return {
+            "model_names": [self.model_name],
+            "speed": 1,
+            "queue_length": self.get_queue_length(),
+        }
+
+    def add_content(self, prompt, new_content):
+        """Insert ``new_content`` in front of the last turn-closing marker.
+
+        The marker is `` [/INST]`` when the prompt contains ``[INST]``,
+        ``<|im_end|>`` when the prompt contains that, and ``###Assistant:``
+        otherwise. If the chosen marker does not occur, the content is
+        appended to the end of the prompt.
+
+        Args:
+            prompt: Full conversation prompt.
+            new_content: Text to insert.
+
+        Returns:
+            The prompt with ``new_content`` inserted.
+        """
+        if '[INST]' in prompt:
+            marker = ' [/INST]'
+        elif '<|im_end|>' in prompt:
+            marker = '<|im_end|>'
+        else:
+            marker = '###Assistant:'
+        split_index = prompt.rfind(marker)
+        if split_index < 0:
+            # rfind() returns -1 on a miss; slicing with -1 would insert the
+            # content in front of the prompt's final character.
+            return prompt + new_content
+        return prompt[:split_index] + new_content + prompt[split_index:]
+
+    @torch.inference_mode()
+    def generate_stream(self, params):
+        """Stream a completion for ``params["prompt"]`` as JSON chunks.
+
+        Args:
+            params: Request dict. ``prompt`` is required. Optional keys are
+                ``images`` (base64 strings, one per ``<image>`` token),
+                ``gen_image``, ``use_ocr``, ``temperature``, ``top_p``,
+                ``max_new_tokens`` and ``stop``.
+
+        Yields:
+            bytes: A JSON object followed by a NUL byte. ``text`` holds the
+            original prompt plus everything generated so far. When an image
+            was generated, a final chunk also carries it as a base64 JPEG
+            under ``image``.
+
+        Raises:
+            ValueError: If the number of images differs from the number of
+                ``<image>`` tokens in the prompt.
+        """
+        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor
+        prompt = params["prompt"]
+        ori_prompt = prompt
+        images = params.get("images", None)
+        gen_image = params.get("gen_image", False)
+        use_ocr = params.get("use_ocr", False)
+        num_image_tokens = 0
+
+        if gen_image:
+            prompt = self.add_content(prompt, ' <GEN>')
+        print(prompt)
+
+        if images is not None and len(images) > 0 and self.is_multimodal:
+            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
+                raise ValueError("Number of images does not match number of <image> tokens in prompt")
+
+            images = [load_image_from_base64(image) for image in images]
+
+            # add OCR tokens
+            if use_ocr:
+                ocr_tokens = []
+                for image in images:
+                    img_byte_arr = io.BytesIO()
+                    # image.format can be None; fall back to PNG so save()
+                    # always has an explicit format.
+                    image.save(img_byte_arr, format=image.format or 'PNG')
+                    result = self.ocr_model.ocr(img_byte_arr.getvalue(), cls=True)
+
+                    if result[0] is not None:
+                        # Keep recognised strings whose confidence exceeds 0.1.
+                        ocr_tokens.extend(res[1][0] for res in result[0] if res[1][1] > 0.1)
+                # One join over all images, so tokens from different images
+                # are separated as well.
+                str_in_image = ', '.join(ocr_tokens)
+                if len(str_in_image) > 0:
+                    prompt = self.add_content(prompt, '\nReference OCR Token: ' + str_in_image + '\n')
+
+            image_tensor = process_images(images, image_processor, model.config)
+
+            image_grid = getattr(model.config, 'image_grid', 1)
+            if hasattr(model.config, 'image_size_aux'):
+                # The processor output is at the auxiliary resolution: keep it
+                # as the auxiliary input and downsample a copy to the raw size.
+                raw_shape = [image_processor.image_size_raw['height'] * image_grid,
+                             image_processor.image_size_raw['width'] * image_grid]
+                image_tensor_aux = image_tensor
+                image_tensor = torch.nn.functional.interpolate(image_tensor,
+                                                               size=raw_shape,
+                                                               mode='bilinear',
+                                                               align_corners=False)
+            else:
+                # No auxiliary input. None instead of a list, so nothing calls
+                # .to() on a non-tensor below.
+                # NOTE(review): assumes model.generate accepts images_aux=None
+                # — confirm.
+                image_tensor_aux = None
+
+            if image_grid >= 2:
+                # Split the image into image_grid x image_grid crops.
+                # NOTE(review): the reshape looks like it assumes exactly one
+                # input image — confirm.
+                raw_image = image_tensor.reshape(3,
+                                                 image_grid,
+                                                 image_processor.image_size_raw['height'],
+                                                 image_grid,
+                                                 image_processor.image_size_raw['width'])
+                raw_image = raw_image.permute(1, 3, 0, 2, 4)
+                raw_image = raw_image.reshape(-1, 3,
+                                              image_processor.image_size_raw['height'],
+                                              image_processor.image_size_raw['width'])
+
+                if getattr(model.config, 'image_global', False):
+                    global_image = image_tensor
+                    if len(global_image.shape) == 3:
+                        global_image = global_image[None]
+                    global_image = torch.nn.functional.interpolate(global_image,
+                                                                   size=[image_processor.image_size_raw['height'],
+                                                                         image_processor.image_size_raw['width']],
+                                                                   mode='bilinear',
+                                                                   align_corners=False)
+                    # [image_crops, image_global]
+                    raw_image = torch.cat([raw_image, global_image], dim=0)
+                image_tensor = raw_image.contiguous()
+
+            image_tensor = image_tensor.to(self.model.device, dtype=torch.float16).unsqueeze(0)
+            if image_tensor_aux is not None:
+                image_tensor_aux = image_tensor_aux.to(self.model.device, dtype=torch.float16)
+
+            replace_token = DEFAULT_IMAGE_TOKEN
+            if getattr(self.model.config, 'mm_use_im_start_end', False):
+                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
+            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
+
+            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
+            image_args = {"images": image_tensor, "images_aux": image_tensor_aux}
+        else:
+            image_args = {}
+
+        temperature = float(params.get("temperature", 1.0))
+        top_p = float(params.get("top_p", 1.0))
+        max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
+        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
+        stop_str = params.get("stop", None)
+        do_sample = temperature > 0.001
+
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
+        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=30)
+
+        # Leave room in the context window for the prompt and image tokens.
+        max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)
+
+        if max_new_tokens < 1:
+            yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
+            return
+
+        # generate() runs in a background thread and feeds the streamer that
+        # is consumed below.
+        thread = Thread(target=model.generate, kwargs=dict(
+            inputs=input_ids,
+            do_sample=do_sample,
+            temperature=temperature,
+            top_p=top_p,
+            max_new_tokens=max_new_tokens,
+            streamer=streamer,
+            use_cache=True,
+            **image_args
+        ))
+        thread.start()
+
+        generated_text = ori_prompt
+        for new_text in streamer:
+            generated_text += new_text
+            # `stop` is optional: None would make endswith() raise, and an
+            # empty string would slice the whole text away.
+            if stop_str and generated_text.endswith(stop_str):
+                generated_text = generated_text[:-len(stop_str)]
+            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"
+        torch.cuda.empty_cache()
+
+        if gen_image and "<h>" in generated_text and "</h>" in generated_text:
+            common_neg_prompt = "out of frame, lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
+            # The text between the first <h> and the following </h> is the
+            # prompt for the diffusion pipeline.
+            image_prompt = generated_text.split("<h>")[1].split("</h>")[0]
+
+            output_img = self.pipe(image_prompt, negative_prompt=common_neg_prompt).images[0]
+            buffered = io.BytesIO()
+            output_img.save(buffered, format='JPEG')
+            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+            torch.cuda.empty_cache()
+
+            generated_text = generated_text.split("<h>")[0] + '\n' + 'Prompt: ' + image_prompt + '\n'
+            yield json.dumps({"text": generated_text, "image": img_b64_str, "error_code": 0}).encode() + b"\0"
+
+    @staticmethod
+    def _error_chunk():
+        """Return the NUL-terminated JSON chunk that reports a server error."""
+        ret = {
+            "text": server_error_msg,
+            "error_code": 1,
+        }
+        return json.dumps(ret).encode() + b"\0"
+
+    def generate_stream_gate(self, params):
+        """Wrap ``generate_stream`` so that a failure becomes one error chunk.
+
+        Yields the chunks of ``generate_stream`` unchanged. If it raises, the
+        exception is printed and a single chunk carrying ``server_error_msg``
+        and ``error_code`` 1 is yielded instead.
+        """
+        try:
+            yield from self.generate_stream(params)
+        except ValueError as e:
+            print("Caught ValueError:", e)
+            yield self._error_chunk()
+        except torch.cuda.CudaError as e:
+            print("Caught torch.cuda.CudaError:", e)
+            yield self._error_chunk()
+        except Exception as e:
+            print("Caught Unknown Error", e)
+            yield self._error_chunk()
+
+
+# FastAPI application that exposes the worker endpoints defined below; it is
+# served by uvicorn in the __main__ block.
+app = FastAPI()
+
+
+def release_model_semaphore(fn=None):
+    """Release the module-level ``model_semaphore``, then call ``fn`` if given.
+
+    Args:
+        fn: Optional zero-argument callable run after the release.
+    """
+    model_semaphore.release()
+    if fn is None:
+        return
+    fn()
+
+
+@app.post("/worker_generate_stream")
+async def generate_stream(request: Request):
+    """Stream a generation for the JSON body of ``request``.
+
+    Takes one slot of the concurrency semaphore (creating the semaphore on
+    first use) before generation starts. The slot is released, and a heart
+    beat is sent, by a background task attached to the response.
+    """
+    global model_semaphore, global_counter
+    global_counter += 1
+    params = await request.json()
+    if model_semaphore is None:
+        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
+    await model_semaphore.acquire()
+    worker.send_heart_beat()
+    stream = worker.generate_stream_gate(params)
+    cleanup = BackgroundTasks()
+    cleanup.add_task(release_model_semaphore, fn=worker.send_heart_beat)
+    return StreamingResponse(stream, background=cleanup)
+
+
+@app.post("/worker_get_status")
+async def get_status(request: Request):
+    """Return the worker's status dict; ``request`` is not inspected."""
+    status = worker.get_status()
+    return status
+
+
+if __name__ == "__main__":
+    # The same notice is used as the --multi-modal help text and as the
+    # warning logged when the flag is passed.
+    multimodal_notice = "Multimodal mode is automatically detected with model name, please make sure `llava` is included in the model path."
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--host", default="localhost", type=str)
+    parser.add_argument("--port", default=21002, type=int)
+    parser.add_argument("--worker-address", default="http://localhost:21002", type=str)
+    parser.add_argument("--controller-address", default="http://localhost:21001", type=str)
+    parser.add_argument("--model-path", default="facebook/opt-350m", type=str)
+    parser.add_argument("--model-base", default=None, type=str)
+    parser.add_argument("--model-name", type=str)
+    parser.add_argument("--device", default="cuda", type=str)
+    parser.add_argument("--multi-modal", action="store_true", help=multimodal_notice)
+    parser.add_argument("--limit-model-concurrency", default=5, type=int)
+    parser.add_argument("--stream-interval", default=1, type=int)
+    for switch in ("--no-register", "--load-8bit", "--load-4bit", "--use-flash-attn"):
+        parser.add_argument(switch, action="store_true")
+    args = parser.parse_args()
+    logger.info(f"args: {args}")
+
+    if args.multi_modal:
+        logger.warning(multimodal_notice)
+
+    # `args` and `worker` stay module-level names: the endpoints and
+    # ModelWorker.get_queue_length read them as globals.
+    worker = ModelWorker(
+        controller_addr=args.controller_address,
+        worker_addr=args.worker_address,
+        worker_id=worker_id,
+        no_register=args.no_register,
+        model_path=args.model_path,
+        model_base=args.model_base,
+        model_name=args.model_name,
+        load_8bit=args.load_8bit,
+        load_4bit=args.load_4bit,
+        device=args.device,
+        use_flash_attn=args.use_flash_attn,
+    )
+    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
\ No newline at end of file
diff --git a/minigemini/serve/register_worker.py b/minigemini/serve/register_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c2c40295e0351f25709ba25554c9329f15bf0d2
--- /dev/null
+++ b/minigemini/serve/register_worker.py
@@ -0,0 +1,26 @@
+"""
+Manually register workers.
+
+Usage:
+python3 -m minigemini.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
+"""
+
+import argparse
+
+import requests
+
+if __name__ == "__main__":
+    # Both addresses are required. Without --controller-address the URL
+    # concatenation below fails on None, and without --worker-name a null
+    # name would be registered.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--controller-address", type=str, required=True)
+    parser.add_argument("--worker-name", type=str, required=True)
+    parser.add_argument("--check-heart-beat", action="store_true")
+    args = parser.parse_args()
+
+    url = args.controller_address + "/register_worker"
+    data = {
+        "worker_name": args.worker_name,
+        "check_heart_beat": args.check_heart_beat,
+        "worker_status": None,
+    }
+    r = requests.post(url, json=data)
+    # Explicit check instead of `assert`, which is stripped under `python -O`.
+    if r.status_code != 200:
+        raise SystemExit(
+            f"Registering {args.worker_name} at {url} failed with HTTP status {r.status_code}")
diff --git a/minigemini/serve/sglang_worker.py b/minigemini/serve/sglang_worker.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea9146dd0906a6a4237552e19e4a1d332bfbe0f2
--- /dev/null
+++ b/minigemini/serve/sglang_worker.py
@@ -0,0 +1,244 @@
+"""
+A model worker executes the model.
+"""
+import argparse
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
+import json
+import time
+import threading
+import uuid
+
+from fastapi import FastAPI, Request, BackgroundTasks
+from fastapi.responses import StreamingResponse
+import requests
+import re
+import uvicorn
+from functools import partial
+
+from minigemini.constants import WORKER_HEART_BEAT_INTERVAL
+from minigemini.utils import (build_logger, server_error_msg,
+    pretty_print_semaphore)
+from minigemini.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, expand2square
+from minigemini.constants import DEFAULT_IMAGE_TOKEN
+
+import sglang as sgl
+from sglang.backend.runtime_endpoint import RuntimeEndpoint
+
+
+# Number of bytes in one gibibyte (2**30).
+GB = 1 << 30
+
+# Short identifier for this process: the first six characters of a random UUID.
+worker_id = str(uuid.uuid4())[:6]
+# Module logger; the second argument is presumably the log file name (see build_logger).
+logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
+# Number of /worker_generate_stream requests received; incremented by the route handler.
+global_counter = 0
+
+# Concurrency limiter; stays None until the first generation request creates
+# an asyncio.Semaphore for it.
+model_semaphore = None
+
+
+def heart_beat_worker(controller):
+    """Call ``controller.send_heart_beat()`` forever, sleeping between calls.
+
+    Never returns; it is meant to be the target of a background thread.
+    The pause between calls is ``WORKER_HEART_BEAT_INTERVAL`` seconds.
+    """
+    while True:
+        # Sleep first, so the first heartbeat is sent one interval after start.
+        time.sleep(WORKER_HEART_BEAT_INTERVAL)
+        controller.send_heart_beat()
+
+
+@sgl.function
+def pipeline(s, prompt, max_tokens):
+    """SGLang program that replays an interleaved prompt and generates a reply.
+
+    Args:
+        s: SGLang program state that prompt pieces are appended to.
+        prompt: iterable of pieces; strings are appended as text, anything
+            else is wrapped with ``sgl.image`` and appended as an image.
+        max_tokens: generation limit passed to ``sgl.gen``.
+
+    The generated text is stored under the variable name ``"response"``.
+    """
+    for p in prompt:
+        # isinstance rather than an exact type comparison, so str subclasses
+        # are treated as text instead of being handed to sgl.image.
+        if isinstance(p, str):
+            s += p
+        else:
+            s += sgl.image(p)
+    s += sgl.gen("response", max_tokens=max_tokens)
+
+
+class ModelWorker:
+    """Worker that serves generation requests through an SGLang runtime endpoint.
+
+    On construction it connects to the SGLang endpoint, derives the served
+    model name and, unless ``no_register`` is set, registers itself with the
+    controller and starts a background heartbeat thread.
+    """
+
+    def __init__(self, controller_addr, worker_addr, sgl_endpoint,
+                 worker_id, no_register, model_name):
+        """Connect to the SGLang backend and optionally register with the controller.
+
+        Args:
+            controller_addr: base URL of the controller.
+            worker_addr: URL under which this worker is reported to the controller.
+            sgl_endpoint: address passed to ``RuntimeEndpoint``.
+            worker_id: identifier used in log messages.
+            no_register: when true, skip registration and the heartbeat thread.
+            model_name: name to advertise; when ``None`` it is derived from the
+                backend's ``model_path``.
+        """
+        self.controller_addr = controller_addr
+        self.worker_addr = worker_addr
+        self.worker_id = worker_id
+
+        # Select backend
+        backend = RuntimeEndpoint(sgl_endpoint)
+        sgl.set_default_backend(backend)
+        model_path = backend.model_info["model_path"]
+
+        if model_path.endswith("/"):
+            model_path = model_path[:-1]
+        if model_name is None:
+            model_paths = model_path.split("/")
+            # For ".../<run>/checkpoint-N" keep the run directory in the name
+            # so different runs stay distinguishable.
+            if model_paths[-1].startswith('checkpoint-'):
+                self.model_name = model_paths[-2] + "_" + model_paths[-1]
+            else:
+                self.model_name = model_paths[-1]
+        else:
+            self.model_name = model_name
+
+        logger.info(f"Loading the SGLANG model {self.model_name} on worker {worker_id} ...")
+
+        if not no_register:
+            self.register_to_controller()
+            # Daemon thread: heart_beat_worker loops forever, so a non-daemon
+            # thread would keep the process alive after the server stops.
+            self.heart_beat_thread = threading.Thread(
+                target=heart_beat_worker, args=(self,), daemon=True)
+            self.heart_beat_thread.start()
+
+    def register_to_controller(self):
+        """POST this worker's address and status to the controller.
+
+        Raises:
+            RuntimeError: if the controller does not answer with HTTP 200.
+        """
+        logger.info("Register to controller")
+
+        url = self.controller_addr + "/register_worker"
+        data = {
+            "worker_name": self.worker_addr,
+            "check_heart_beat": True,
+            "worker_status": self.get_status()
+        }
+        r = requests.post(url, json=data)
+        # Explicit check instead of `assert`, which is removed under `python -O`.
+        if r.status_code != 200:
+            raise RuntimeError(
+                f"Registering worker at {url} failed with HTTP status {r.status_code}")
+
+    def send_heart_beat(self):
+        """Report the queue length to the controller, retrying until it answers.
+
+        Re-registers the worker when the controller replies that it does not
+        know this worker (``"exist"`` is false).
+        """
+        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
+                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
+                    f"global_counter: {global_counter}")
+
+        url = self.controller_addr + "/receive_heart_beat"
+
+        # Retry every 5 seconds until one request succeeds.
+        while True:
+            try:
+                ret = requests.post(url, json={
+                    "worker_name": self.worker_addr,
+                    "queue_length": self.get_queue_length()}, timeout=5)
+                exist = ret.json()["exist"]
+                break
+            except requests.exceptions.RequestException as e:
+                logger.error(f"heart beat error: {e}")
+            time.sleep(5)
+
+        if not exist:
+            self.register_to_controller()
+
+    def get_queue_length(self):
+        """Return the number of requests running or waiting on the semaphore.
+
+        Returns 0 while the semaphore has not been created yet. Reads the
+        module-level ``args`` set by the ``__main__`` block.
+        """
+        if model_semaphore is None:
+            return 0
+        else:
+            # (limit - free permits) = running requests; waiters are queued ones.
+            return args.limit_model_concurrency - model_semaphore._value + (len(
+                model_semaphore._waiters) if model_semaphore._waiters is not None else 0)
+
+    def get_status(self):
+        """Return the status dict sent to the controller."""
+        return {
+            "model_names": [self.model_name],
+            "speed": 1,
+            "queue_length": self.get_queue_length(),
+        }
+
+    async def generate_stream(self, params):
+        """Stream generation results for one request.
+
+        Args:
+            params: request dict. Reads ``"prompt"`` (required) and the optional
+                ``"images"``, ``"temperature"``, ``"top_p"``, ``"max_new_tokens"``
+                and ``"stop"`` entries.
+
+        Yields:
+            NUL-terminated JSON byte strings whose ``"text"`` is the original
+            prompt followed by everything generated so far.
+
+        Raises:
+            ValueError: if the number of images differs from the number of
+                image tokens in the prompt.
+        """
+        ori_prompt = prompt = params["prompt"]
+        images = params.get("images", None)
+        if images is not None and len(images) > 0:
+            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
+                raise ValueError("Number of images does not match number of <image> tokens in prompt")
+
+            images = [load_image_from_base64(image) for image in images]
+
+            # FIXME: image start/end tokens (mm_use_im_start_end) are not handled here.
+            prompt = prompt.replace(' ' + DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN)
+            prompt_split = prompt.split(DEFAULT_IMAGE_TOKEN)
+            # Interleave text pieces with the decoded images, in order.
+            prompt = []
+            for i, text_piece in enumerate(prompt_split):
+                prompt.append(text_piece)
+                if i < len(images):
+                    prompt.append(images[i])
+        else:
+            prompt = [prompt]
+
+        temperature = float(params.get("temperature", 1.0))
+        top_p = float(params.get("top_p", 1.0))
+        # max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
+        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
+        # NOTE(review): stop_str is computed but never forwarded to pipeline.run,
+        # so the request's "stop" value currently has no effect - confirm intent.
+        stop_str = params.get("stop", None)
+        stop_str = [stop_str] if stop_str is not None else None
+
+        print({'prompt': prompt, 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p})
+        state = pipeline.run(prompt, max_new_tokens, temperature=temperature, top_p=top_p, stream=True)
+
+        generated_text = ori_prompt
+        async for text_outputs in state.text_async_iter(var_name="response"):
+            generated_text += text_outputs
+            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"
+
+    @staticmethod
+    def _error_chunk():
+        """Return the NUL-terminated JSON error payload sent to clients."""
+        ret = {
+            "text": server_error_msg,
+            "error_code": 1,
+        }
+        return json.dumps(ret).encode() + b"\0"
+
+    async def generate_stream_gate(self, params):
+        """Wrap ``generate_stream`` so failures become an error chunk.
+
+        Any exception raised while streaming is printed and replaced by a
+        single chunk carrying ``server_error_msg`` and ``error_code`` 1.
+        """
+        try:
+            async for x in self.generate_stream(params):
+                yield x
+        except ValueError as e:
+            print("Caught ValueError:", e)
+            yield self._error_chunk()
+        except Exception as e:
+            # Top-level boundary of the stream: report instead of crashing it.
+            print("Caught Unknown Error", e)
+            yield self._error_chunk()
+
+
+# FastAPI application object on which this module's HTTP routes are registered.
+app = FastAPI()
+
+
+def release_model_semaphore(fn=None):
+    """Release one permit of the module-level semaphore, then run ``fn`` if given.
+
+    Args:
+        fn: optional zero-argument callable invoked after the release.
+    """
+    model_semaphore.release()
+    if fn is None:
+        return
+    fn()
+
+
+@app.post("/worker_generate_stream")
+async def generate_stream(request: Request):
+    """Handle a streaming generation request.
+
+    Acquires a permit from the concurrency semaphore (created on first use),
+    sends a heartbeat, and returns a streaming response. The permit is
+    released, and another heartbeat sent, by a background task attached to
+    the response.
+    """
+    global model_semaphore, global_counter
+    global_counter = global_counter + 1
+    payload = await request.json()
+
+    # Create the semaphore lazily, on the first request.
+    if model_semaphore is None:
+        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
+    await model_semaphore.acquire()
+    worker.send_heart_beat()
+
+    stream = worker.generate_stream_gate(payload)
+    on_finish = partial(release_model_semaphore, fn=worker.send_heart_beat)
+    cleanup = BackgroundTasks()
+    cleanup.add_task(on_finish)
+    return StreamingResponse(stream, background=cleanup)
+
+
+@app.post("/worker_get_status")
+async def get_status(request: Request):
+    """Return the worker's status dict; the request body is not read."""
+    return worker.get_status()
+
+
+if __name__ == "__main__":
+    # Script entry point: parse options, build the worker, start the HTTP server.
+    # `args` and `worker` are module globals read by the route handlers.
+    cli_options = (
+        ("--host", {"type": str, "default": "localhost"}),
+        ("--port", {"type": int, "default": 21002}),
+        ("--worker-address", {"type": str, "default": "http://localhost:21002"}),
+        ("--controller-address", {"type": str, "default": "http://localhost:21001"}),
+        ("--model-name", {"type": str}),
+        ("--sgl-endpoint", {"type": str}),
+        ("--limit-model-concurrency", {"type": int, "default": 5}),
+        ("--stream-interval", {"type": int, "default": 1}),
+        ("--no-register", {"action": "store_true"}),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, spec in cli_options:
+        parser.add_argument(flag, **spec)
+    args = parser.parse_args()
+    logger.info(f"args: {args}")
+
+    worker = ModelWorker(
+        controller_addr=args.controller_address,
+        worker_addr=args.worker_address,
+        sgl_endpoint=args.sgl_endpoint,
+        worker_id=worker_id,
+        no_register=args.no_register,
+        model_name=args.model_name,
+    )
+    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
\ No newline at end of file
diff --git a/minigemini/serve/test_message.py b/minigemini/serve/test_message.py
new file mode 100644
index 0000000000000000000000000000000000000000..af203f9ab1cc68397edfd4a57f2b7baa7d756859
--- /dev/null
+++ b/minigemini/serve/test_message.py
@@ -0,0 +1,62 @@
+import argparse
+import json
+
+import requests
+
+from minigemini.conversation import default_conversation
+
+
+def main():
+    """Send one test message to a worker and print the streamed reply.
+
+    Uses ``args.worker_address`` when given; otherwise asks the controller to
+    refresh its workers, lists the models, and looks up a worker for
+    ``args.model_name``. Returns without sending anything when the resolved
+    worker address is an empty string.
+    """
+    worker_addr = args.worker_address
+    if not worker_addr:
+        controller_addr = args.controller_address
+        requests.post(controller_addr + "/refresh_all_workers")
+        listing = requests.post(controller_addr + "/list_models")
+        models = listing.json()["models"]
+        models.sort()
+        print(f"Models: {models}")
+
+        lookup = requests.post(controller_addr + "/get_worker_address",
+            json={"model": args.model_name})
+        worker_addr = lookup.json()["address"]
+        print(f"worker_addr: {worker_addr}")
+
+    if worker_addr == "":
+        return
+
+    # Build a single-turn prompt from the default conversation template.
+    conv = default_conversation.copy()
+    conv.append_message(conv.roles[0], args.message)
+    prompt = conv.get_prompt()
+
+    payload = {
+        "model": args.model_name,
+        "prompt": prompt,
+        "max_new_tokens": args.max_new_tokens,
+        "temperature": 0.7,
+        "stop": conv.sep,
+    }
+    response = requests.post(
+        worker_addr + "/worker_generate_stream",
+        headers={"User-Agent": "LLaVA Client"},
+        json=payload,
+        stream=True,
+    )
+
+    print(prompt.replace(conv.sep, "\n"), end="")
+    # The worker separates JSON chunks with NUL bytes.
+    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
+        if not chunk:
+            continue
+        data = json.loads(chunk.decode("utf-8"))
+        output = data["text"].split(conv.sep)[-1]
+        # Carriage return so each update overwrites the previous one.
+        print(output, end="\r")
+    print("")
+
+
+if __name__ == "__main__":
+    # Script entry point: `args` is a module global read by main().
+    cli_options = (
+        ("--controller-address", {"type": str, "default": "http://localhost:21001"}),
+        ("--worker-address", {"type": str}),
+        ("--model-name", {"type": str, "default": "facebook/opt-350m"}),
+        ("--max-new-tokens", {"type": int, "default": 32}),
+        ("--message", {"type": str, "default": "Tell me a story with more than 1000 words."}),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, spec in cli_options:
+        parser.add_argument(flag, **spec)
+    args = parser.parse_args()
+
+    main()
diff --git a/minigemini/train/llama_flash_attn_monkey_patch.py b/minigemini/train/llama_flash_attn_monkey_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..4156fa8b739fb4be4e2c9dae9b5e19aca588f1ee
--- /dev/null
+++ b/minigemini/train/llama_flash_attn_monkey_patch.py
@@ -0,0 +1,233 @@
+from typing import Optional, Tuple
+import warnings
+
+import torch
+
+import transformers
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv, rotate_half
+
+try:
+    from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+except ImportError:
+    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+from flash_attn.bert_padding import unpad_input, pad_input
+from flash_attn import __version__ as flash_attn_version
+from flash_attn.flash_attn_interface import (
+    flash_attn_func,
+    flash_attn_varlen_kvpacked_func,
+)
+
+
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Replacement for ``LlamaAttention.forward`` that uses flash attention.
+
+    Args:
+        hidden_states: input of shape (bsz, q_len, hidden).
+        attention_mask: used directly as the key padding mask passed to
+            ``unpad_input``; ``None`` means no padding.
+        position_ids: positions forwarded to ``apply_rotary_pos_emb``.
+        past_key_value: optional cached (key, value) pair, concatenated
+            along the sequence dimension (dim 2).
+        output_attentions: not supported; a warning is issued and ``None``
+            is returned in place of the attention weights.
+        use_cache: when true, the (key, value) pair is returned for reuse.
+
+    Returns:
+        ``(output, None, past_key_value)`` where ``output`` is the result of
+        ``self.o_proj`` applied to the attention output.
+    """
+    if output_attentions:
+        warnings.warn(
+            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+
+    # Project and split into heads: (bsz, q_len, heads, head_dim) -> (bsz, heads, q_len, head_dim).
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        .transpose(1, 2)
+    )  # shape: (b, num_heads, s, head_dim)
+
+    # Total key/value length includes any cached positions.
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+
+    if past_key_value is not None:
+        # reuse k, v
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    # The cache is captured before the k/v heads are repeated below.
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    # Transform the data into the format required by flash attention
+    # NOTE(review): torch.stack needs q, k and v to have identical shapes, so
+    # this path appears to assume no cached keys were concatenated - confirm.
+    qkv = torch.stack([query_states, key_states, value_states], dim=2)
+    qkv = qkv.transpose(1, 3)  # shape: [b, s, 3, num_heads, head_dim]
+    key_padding_mask = attention_mask
+
+    if key_padding_mask is None:
+        # No padding: flatten the batch and describe each sequence by
+        # cumulative offsets 0, q_len, 2*q_len, ...
+        qkv = qkv.reshape(-1, 3, self.num_heads, self.head_dim)
+        cu_q_lens = torch.arange(
+            0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
+        )
+        max_s = q_len
+        output = flash_attn_unpadded_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = output.view(bsz, q_len, -1)
+    else:
+        # Padding present: drop padded positions, attend, then pad back.
+        qkv = qkv.reshape(bsz, q_len, -1)
+        qkv, indices, cu_q_lens, max_s = unpad_input(qkv, key_padding_mask)
+        qkv = qkv.view(-1, 3, self.num_heads, self.head_dim)
+        output_unpad = flash_attn_unpadded_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
+        output = pad_input(output_unpad, indices, bsz, q_len)
+
+    return self.o_proj(output), None, past_key_value
+
+def apply_rotary_pos_emb_inference(q, k, cos_sin, position_ids):
+    """Apply rotary position embeddings to ``q`` and ``k``.
+
+    Args:
+        q, k: query and key tensors to rotate.
+        cos_sin: pair of (cos, sin) tables; each is transposed on dims 1 and 2
+            and repeated over the batch before being gathered.
+        position_ids: per-token positions selecting the rows of the tables.
+
+    Returns:
+        The rotated ``(q, k)`` pair.
+    """
+    gather_indices = position_ids[:, :, None, None]  # [bsz, seq_len, 1, 1]
+    # Expand the index so it covers the remaining two table dimensions.
+    gather_indices = gather_indices.repeat(
+        1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
+    )
+    bsz = gather_indices.shape[0]
+    # Select, for every token, the cos/sin row at its position (gather on dim 1).
+    cos, sin = (
+        torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
+        for x in cos_sin
+    )
+    q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k))
+    return q, k
+
+
+def _flash_attn_version_at_least(version, minimum):
+    """Return True when the leading numeric parts of ``version`` are >= ``minimum``.
+
+    ``version`` is a dotted string such as ``"2.5.9.post1"``; parsing stops at
+    the first non-numeric component. ``minimum`` is a tuple of ints.
+    """
+    parts = []
+    for piece in version.split("."):
+        if not piece.isdigit():
+            break
+        parts.append(int(piece))
+    return tuple(parts) >= tuple(minimum)
+
+
+def forward_inference(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    padding_mask: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Replacement for ``LlamaAttention.forward`` used at inference time.
+
+    Unlike the training variant, queries and keys/values may have different
+    lengths, so cached keys/values are supported.
+
+    Args:
+        hidden_states: input of shape (bsz, q_len, hidden).
+        attention_mask: padding mask over all key positions, or ``None``
+            when nothing is padded.
+        position_ids: positions used to look up the rotary embeddings.
+        past_key_value: optional cached (key, value) pair whose sequence
+            dimension is dim 2.
+        output_attentions: not supported; a warning is issued and ``None``
+            is returned in place of the attention weights.
+        use_cache: when true, the updated (key, value) pair is returned.
+        padding_mask: accepted for signature compatibility; not used.
+
+    Returns:
+        ``(output, None, past_key_value)`` where ``output`` is the result of
+        ``self.o_proj`` applied to the attention output.
+    """
+    if output_attentions:
+        warnings.warn(
+            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+    # Fall back to num_heads when the module has no separate k/v head count.
+    kv_heads = getattr(self, "num_key_value_heads", self.num_heads)
+
+    q, k, v = (
+        op(hidden_states).view(bsz, q_len, nh, self.head_dim)
+        for op, nh in (
+            (self.q_proj, self.num_heads),
+            (self.k_proj, kv_heads),
+            (self.v_proj, kv_heads),
+        )
+    )
+    # shape: (b, s, num_heads, head_dim)
+
+    kv_seq_len = k.shape[1]
+    past_kv_len = 0
+    if past_key_value is not None:
+        past_kv_len = past_key_value[0].shape[2]
+        kv_seq_len += past_kv_len
+
+    cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
+    q, k = apply_rotary_pos_emb_inference(q, k, cos_sin, position_ids)
+
+    if past_key_value is not None:
+        # Compare parsed version numbers: comparing the raw strings is
+        # lexicographic and would, for example, rank "10.0.0" below "2.1.0".
+        assert _flash_attn_version_at_least(
+            flash_attn_version, (2, 1)
+        ), "past_key_value support requires flash-attn >= 2.1.0"
+        # reuse k, v
+        # The cache is stored as (b, heads, s, d); transpose to (b, s, heads, d).
+        k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
+        v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)
+
+    past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None
+
+    if attention_mask is None:
+        output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
+            bsz, q_len, -1
+        )
+    else:
+        # Queries only cover the last q_len positions of the mask.
+        q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
+        # We can skip concat and call unpad twice but seems better to call unpad only once.
+        kv, _, cu_k_lens, max_k = unpad_input(
+            torch.stack((k, v), dim=2), attention_mask
+        )
+        output_unpad = flash_attn_varlen_kvpacked_func(
+            q,
+            kv,
+            cu_q_lens,
+            cu_k_lens,
+            max_s,
+            max_k,
+            0.0,
+            softmax_scale=None,
+            causal=True,
+        )
+        output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
+        output = pad_input(output_unpad, indices, bsz, q_len)
+
+    return self.o_proj(output), None, past_key_value
+
+
+# Disable the transformation of the attention mask in LlamaModel as the flash attention
+# requires the attention mask to be the same as the key_padding_mask
+def _prepare_decoder_attention_mask(
+    self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+):
+    """Return ``attention_mask`` unchanged; the other arguments are ignored."""
+    # [bsz, seq_len]
+    return attention_mask
+
+
+def _prepare_decoder_attention_mask_inference(
+    self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+):
+    """Build the key padding mask used by the flash-attention inference path.
+
+    Args:
+        attention_mask: mask of shape [bsz, seq_len], or ``None``.
+        input_shape: shape of the current input; only ``input_shape[0]``
+            (the batch size) is read.
+        inputs_embeds: unused.
+        past_key_values_length: number of cached positions; when positive the
+            mask is extended on the left with that many ``True`` columns.
+
+    Returns:
+        ``None`` when there is no mask or every position is unmasked, which
+        selects the faster unpadded attention call; otherwise the (possibly
+        extended) mask.
+    """
+    # [bsz, seq_len]
+    if past_key_values_length > 0 and attention_mask is not None:
+        # Cached positions are always attended to.
+        attention_mask = torch.cat(
+            (
+                torch.full(
+                    (input_shape[0], past_key_values_length),
+                    True,
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                ),
+                attention_mask,
+            ),
+            dim=-1,
+        )
+
+    if attention_mask is not None and torch.all(attention_mask):
+        return None  # This uses the faster call when training with full samples
+
+    # Without this return a mask containing padding would be dropped and the
+    # padded positions attended to.
+    return attention_mask
+
+
+def replace_llama_attn_with_flash_attn(inference=False):
+    """Monkey-patch the transformers Llama classes to use flash attention.
+
+    Args:
+        inference: when true install the inference variants of the attention
+            forward and mask-preparation functions, otherwise the training
+            variants.
+
+    Warns when the CUDA device's major compute capability is below 8.
+    """
+    major, _minor = torch.cuda.get_device_capability()
+    if major < 8:
+        warnings.warn(
+            "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
+            "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
+        )
+
+    if inference:
+        mask_fn, attn_fn = _prepare_decoder_attention_mask_inference, forward_inference
+    else:
+        mask_fn, attn_fn = _prepare_decoder_attention_mask, forward
+
+    llama = transformers.models.llama.modeling_llama
+    llama.LlamaModel._prepare_decoder_attention_mask = mask_fn
+    llama.LlamaAttention.forward = attn_fn
diff --git a/minigemini/train/llama_xformers_attn_monkey_patch.py b/minigemini/train/llama_xformers_attn_monkey_patch.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8351e41ccd4a64dca237bd8f8be0702b23989dc
--- /dev/null
+++ b/minigemini/train/llama_xformers_attn_monkey_patch.py
@@ -0,0 +1,129 @@
+"""
+Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
+"""
+
+import logging
+import math
+from typing import Optional, Tuple
+
+import torch
+import transformers.models.llama.modeling_llama
+from torch import nn
+
+try:
+    import xformers.ops
+except ImportError:
+    logging.error("xformers not found! Please install it before trying to use it.")
+
+
+def replace_llama_attn_with_xformers_attn():
+    """Monkey-patch ``LlamaAttention.forward`` with the xformers implementation."""
+    llama = transformers.models.llama.modeling_llama
+    llama.LlamaAttention.forward = xformers_forward
+
+
+def xformers_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Replacement for ``LlamaAttention.forward`` backed by xformers.
+
+    Uses ``xformers.ops.memory_efficient_attention`` unless
+    ``output_attentions`` is set, in which case attention is computed with
+    explicit matmul/softmax so the weights can be returned.
+
+    Args:
+        hidden_states: input of shape (bsz, q_len, hidden).
+        attention_mask: additive mask of shape (bsz, 1, q_len, kv_seq_len),
+            or ``None``.
+        position_ids: positions forwarded to ``apply_rotary_pos_emb``.
+        past_key_value: optional cached (key, value) pair, concatenated
+            along dim 2.
+        output_attentions: return the attention weights (disables xformers).
+        use_cache: when true, the (key, value) pair is returned for reuse.
+
+    Returns:
+        ``(attn_output, attn_weights, past_key_value)``; ``attn_weights`` is
+        ``None`` on the xformers path.
+
+    Raises:
+        ValueError: on the explicit path, if the attention weights, mask or
+            output do not have the expected shape.
+    """
+    # pylint: disable=duplicate-code
+    bsz, q_len, _ = hidden_states.size()
+
+    # Project and split into heads: -> (bsz, num_heads, q_len, head_dim).
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    (
+        query_states,
+        key_states,
+    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    # We only apply xformers optimizations if we don't need to output the whole attention matrix
+    if not output_attentions:
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
+        # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
+        if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
+            # input and output should be of form (bsz, q_len, num_heads, head_dim)
+            attn_output = xformers.ops.memory_efficient_attention(
+                query_states, key_states, value_states, attn_bias=None
+            )
+        else:
+            # input and output should be of form (bsz, q_len, num_heads, head_dim)
+            attn_output = xformers.ops.memory_efficient_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_bias=xformers.ops.LowerTriangularMask(),
+            )
+        attn_weights = None
+    else:
+        attn_weights = torch.matmul(
+            query_states, key_states.transpose(2, 3)
+        ) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            # The message reports the same 4-D shape that the check compares against.
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+            # Clamp from below at the dtype's minimum finite value.
+            attn_weights = torch.max(
+                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
+            )
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+    return attn_output, attn_weights, past_key_value
diff --git a/minigemini/train/llava_trainer.py b/minigemini/train/llava_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe87ce64f55a77e09e07db11547abde23f903671
--- /dev/null
+++ b/minigemini/train/llava_trainer.py
@@ -0,0 +1,303 @@
+import os
+import torch
+import torch.nn as nn
+
+from torch.utils.data import Sampler
+
+from transformers import Trainer
+from transformers.trainer import (
+    is_sagemaker_mp_enabled,
+    get_parameter_names,
+    has_length,
+    ALL_LAYERNORM_LAYERS,
+    logger,
+)
+from typing import List, Optional
+
+
+def maybe_zero_3(param, ignore_status=False, name=None):
+    """Return a detached CPU clone of `param`; a param carrying a DeepSpeed `ds_id` is gathered first."""
+    from deepspeed import zero
+    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+    if not hasattr(param, "ds_id"):
+        # No `ds_id` attribute: clone the tensor directly.
+        return param.detach().cpu().clone()
+    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
+        print(name, 'no ignore status')
+    with zero.GatheredParameters([param]):
+        gathered = param.data.detach().cpu().clone()
+    return gathered
+
+
+def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
+    # Select parameters whose name contains any entry of `keys_to_match` and return CPU copies of them.
+    matched = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
+    return {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in matched.items()}
+
+
+def split_to_even_chunks(indices, lengths, num_chunks):
+    """
+    Split `indices` into `num_chunks` chunks holding roughly equal total length.
+    """
+
+    if len(indices) % num_chunks != 0:
+        # Uneven split: fall back to a strided round-robin partition.
+        return [indices[offset::num_chunks] for offset in range(num_chunks)]
+
+    capacity = len(indices) // num_chunks
+    chunks = [[] for _ in range(num_chunks)]
+    totals = [0] * num_chunks
+    for index in indices:
+        target = totals.index(min(totals))
+        chunks[target].append(index)
+        totals[target] += lengths[index]
+        if len(chunks[target]) == capacity:
+            # A full chunk must never be picked as the shortest again.
+            totals[target] = float("inf")
+    return chunks
+
+
+def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
+    # Order sample indices so megabatches hold a single modality; torch supplies the randomness because a distributed sampler seeds torch.
+    assert all(l != 0 for l in lengths), "Should not have zero length."
+    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
+        # all samples are in the same modality
+        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
+    mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])  # positive length -> "mm" sample
+    lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])  # negative length -> "lang" sample, sign flipped
+    # NOTE(review): the per-modality shuffles below pass generator=None, so `generator` only drives the megabatch order.
+    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
+    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
+    megabatch_size = world_size * batch_size
+    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
+    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]
+    # The final megabatch of each modality is held back, merged, and appended last in sorted index order.
+    last_mm = mm_megabatches[-1]
+    last_lang = lang_megabatches[-1]
+    additional_batch = last_mm + last_lang
+    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
+    megabatch_indices = torch.randperm(len(megabatches), generator=generator)  # shuffle whole megabatches only
+    megabatches = [megabatches[i] for i in megabatch_indices]
+
+    if len(additional_batch) > 0:
+        megabatches.append(sorted(additional_batch))
+
+    return [i for megabatch in megabatches for i in megabatch]
+
+
+def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
+    # Torch drives the shuffle because a distributed sampler seeds torch's RNG; `merge` is accepted but unused.
+    megabatch_size = world_size * batch_size
+    shuffled = torch.randperm(len(lengths), generator=generator)
+    grouped = []
+    for start in range(0, len(lengths), megabatch_size):
+        megabatch = sorted(shuffled[start : start + megabatch_size].tolist(), key=lambda i: lengths[i], reverse=True)
+        grouped.append(split_to_even_chunks(megabatch, lengths, world_size))
+    return [i for megabatch in grouped for batch in megabatch for i in batch]
+
+
+class LengthGroupedSampler(Sampler):
+    r"""
+    Sampler yielding dataset indices so that samples of roughly similar length end up grouped together,
+    while the resulting order still keeps some randomness.
+    """
+
+    def __init__(
+        self,
+        batch_size: int,
+        world_size: int,
+        lengths: Optional[List[int]] = None,
+        generator=None,
+        group_by_modality: bool = False,
+    ):
+        if lengths is None:
+            raise ValueError("Lengths must be provided.")
+
+        self.lengths = lengths
+        self.generator = generator
+        self.batch_size = batch_size
+        self.world_size = world_size
+        self.group_by_modality = group_by_modality
+
+    def __len__(self):
+        return len(self.lengths)
+
+    def __iter__(self):
+        # Modality-aware grouping when requested ...
+        if self.group_by_modality:
+            return iter(get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator))
+        # ... otherwise plain length grouping.
+        return iter(get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator))
+
+
+class LLaVATrainer(Trainer):
+    # Trainer subclass: optional modality-grouped sampling, per-group learning rates, adapter-only checkpoints.
+    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
+        if self.train_dataset is None or not has_length(self.train_dataset):
+            return None
+
+        if self.args.group_by_modality_length:
+            lengths = self.train_dataset.modality_lengths
+            return LengthGroupedSampler(
+                self.args.train_batch_size,
+                world_size=self.args.world_size * self.args.gradient_accumulation_steps,  # accumulation steps are folded into the grouping "world size"
+                lengths=lengths,
+                group_by_modality=True,
+            )
+        else:
+            return super()._get_train_sampler()
+
+    def create_optimizer(self):
+        """
+        Setup the optimizer.
+
+        Builds weight-decay / no-decay parameter groups. `mm_projector_lr` gives "mm_projector" params their own lr;
+        otherwise `lr_multi` ("key:mult,key:mult") scales the base lr of params whose name contains a key.
+        """
+        if is_sagemaker_mp_enabled():
+            return super().create_optimizer()
+
+        opt_model = self.model
+
+        if self.args.lr_multi is not None:
+            lr_multi_dict = {}
+            for _dict in self.args.lr_multi.split(','):
+                _key_val = _dict.split(':')
+                print("_key_val:", _key_val)
+                lr_multi_dict[_key_val[0]] = float(_key_val[1])
+
+        if self.optimizer is None:
+            decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
+            decay_parameters = [name for name in decay_parameters if "bias" not in name]  # names containing "bias" get no weight decay
+            if self.args.mm_projector_lr is not None:
+                projector_parameters = [name for name, _ in opt_model.named_parameters() if "mm_projector" in name]
+                optimizer_grouped_parameters = [
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in projector_parameters and p.requires_grad)
+                        ],
+                        "weight_decay": self.args.weight_decay,
+                    },
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in projector_parameters and p.requires_grad)
+                        ],
+                        "weight_decay": 0.0,
+                    },
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in projector_parameters and p.requires_grad)
+                        ],
+                        "weight_decay": self.args.weight_decay,
+                        "lr": self.args.mm_projector_lr,
+                    },
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in projector_parameters and p.requires_grad)
+                        ],
+                        "weight_decay": 0.0,
+                        "lr": self.args.mm_projector_lr,
+                    },
+                ]
+            elif self.args.lr_multi is not None:
+                optimizer_grouped_parameters = [
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad and not any([_key in n for _key in lr_multi_dict.keys()]))
+                        ],
+                        "weight_decay": self.args.weight_decay,
+                    },
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad and not any([_key in n for _key in lr_multi_dict.keys()]))
+                        ],
+                        "weight_decay": 0.0,
+                    },
+                ]
+                for _key in lr_multi_dict:  # NOTE(review): a param whose name contains several keys lands in several groups
+                    _key_decay = [
+                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad and _key in n)
+                        ]
+                    _key_no_decay = [
+                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad and _key in n)
+                        ]
+                    print("Params LR Change:", _key, "NUM:", len(_key_decay), len(_key_no_decay))
+                    if len(_key_decay) > 0:
+                        optimizer_grouped_parameters.append(
+                            {
+                                "params": _key_decay,
+                                "lr": self.args.learning_rate * lr_multi_dict[_key],
+                                "weight_decay": self.args.weight_decay,
+                            },
+                        )
+                    if len(_key_no_decay) > 0:
+                        optimizer_grouped_parameters.append(
+                            {
+                                "params": _key_no_decay,
+                                "lr": self.args.learning_rate * lr_multi_dict[_key],
+                                "weight_decay": 0.0,
+                            },
+                        )
+            else:
+                optimizer_grouped_parameters = [
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
+                        ],
+                        "weight_decay": self.args.weight_decay,
+                    },
+                    {
+                        "params": [
+                            p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
+                        ],
+                        "weight_decay": 0.0,
+                    },
+                ]
+
+            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
+
+            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
+            if optimizer_cls.__name__ == "Adam8bit":
+                import bitsandbytes
+
+                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
+
+                skipped = 0
+                for module in opt_model.modules():
+                    if isinstance(module, nn.Embedding):
+                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())  # keyed by data_ptr so shared storage counts once
+                        logger.info(f"skipped {module}: {skipped/2**20}M params")
+                        manager.register_module_override(module, "weight", {"optim_bits": 32})
+                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
+                logger.info(f"skipped: {skipped/2**20}M params")
+
+        return self.optimizer
+
+    def _save_checkpoint(self, model, trial, metrics=None):
+        if getattr(self.args, 'tune_mm_mlp_adapter', False):  # adapter tuning: write only the matched weights plus the config
+            from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+
+            run_dir = self._get_output_dir(trial=trial)
+            output_dir = os.path.join(run_dir, checkpoint_folder)
+
+            # Only save Adapter
+            keys_to_match = ['mm_projector', 'vision_resampler']
+            keys_to_match.extend(['vlm_att', 'vlm_uni'])
+            keys_to_match.extend(['vision_fpn', 'vision_stages', 'vision_tower'])
+            if getattr(self.args, "use_im_start_end", False):
+                keys_to_match.extend(['embed_tokens', 'embed_in'])
+
+            weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)
+
+            if self.args.local_rank == 0 or self.args.local_rank == -1:  # only local rank 0 (or -1) writes files
+                self.model.config.save_pretrained(output_dir)
+                torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
+        else:
+            super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics)
+
+    def _save(self, output_dir: Optional[str] = None, state_dict=None):
+        if getattr(self.args, 'tune_mm_mlp_adapter', False):
+            pass  # nothing is written by _save while tune_mm_mlp_adapter is set
+        else:
+            super(LLaVATrainer, self)._save(output_dir, state_dict)
\ No newline at end of file
diff --git a/minigemini/train/train.py b/minigemini/train/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f4ac927e0fef390910e7a79762db1000756d999
--- /dev/null
+++ b/minigemini/train/train.py
@@ -0,0 +1,1281 @@
+# Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
+# Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+#    Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+# ------------------------------------------------------------------------
+# Modified from LLaVA (https://github.com/haotian-liu/LLaVA)
+# Copyright 2024 Yanwei Li
+# ------------------------------------------------------------------------
+import os
+import copy
+import random
+from dataclasses import dataclass, field
+import json
+import logging
+import pathlib
+from typing import Dict, Optional, Sequence, List
+
+import torch
+import numpy as np
+
+import transformers
+import tokenizers
+
+from minigemini.constants import (IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, 
+                             DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
+from torch.utils.data import Dataset
+from minigemini.train.llava_trainer import LLaVATrainer
+
+from minigemini import conversation as conversation_lib
+from minigemini.model import *
+from minigemini.mm_utils import tokenizer_image_token
+
+from PIL import Image
+from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+local_rank = None  # module-level rank read by rank0_print; presumably assigned during training setup (not shown here)
+
+
+def rank0_print(*args):  # print() that is silent unless the module-level `local_rank` equals 0
+    if local_rank == 0:  # also silent while local_rank is still None
+        print(*args)
+
+
+from packaging import version
+IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14')  # True for tokenizers 0.14 and newer (>=, despite the name)
+
+
+@dataclass
+class ModelArguments:  # model / vision-tower options; most consumers are outside this chunk
+    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
+    version: Optional[str] = field(default="v0")  # presumably selects the conversation template — TODO confirm
+    freeze_backbone: bool = field(default=False)
+    tune_mm_mlp_adapter: bool = field(default=False)  # when visible on trainer.args, only adapter weights are saved
+    vision_tower: Optional[str] = field(default=None)
+    vision_tower_aux: Optional[str] = field(default=None) # auxiliary vision tower
+    optimize_vision_tower: bool = field(default=False) # whether to optimize vision tower
+    optimize_vision_tower_aux: bool = field(default=False) # whether to optimize auxiliary vision tower
+    image_processor: Optional[str] = field(default=None)
+    mm_vision_select_layer: Optional[int] = field(default=-1)   # default to the last layer
+    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
+    mm_projector_type: Optional[str] = field(default='linear')
+    mm_use_im_start_end: bool = field(default=False)  # presumably copied onto data/training args by the caller — confirm
+    mm_use_im_patch_token: bool = field(default=True)
+    mm_vision_select_feature: Optional[str] = field(default="patch")
+
+@dataclass
+class DataArguments:  # dataset / image preprocessing options
+    data_path: str = field(default=None,  # NOTE(review): annotated str but defaults to None
+                           metadata={"help": "Path to the training data."})
+    lazy_preprocess: bool = False
+    is_multimodal: bool = False  # when False, preprocess_multimodal returns its input untouched
+    image_folder: Optional[str] = field(default=None)
+    image_aspect_ratio: str = 'square'
+    image_grid_pinpoints: Optional[str] = field(default=None)
+    image_size_aux: Optional[int] = field(default=320)  # presumably the input size for the auxiliary vision tower — confirm
+    image_grid: Optional[int] = field(default=1)
+    image_global: Optional[bool] = field(default=False)
+
+@dataclass
+class TrainingArguments(transformers.TrainingArguments):  # HF training arguments plus project-specific switches
+    cache_dir: Optional[str] = field(default=None)
+    optim: str = field(default="adamw_torch")
+    remove_unused_columns: bool = field(default=False)
+    freeze_mm_mlp_adapter: bool = field(default=False)
+    mpt_attn_impl: Optional[str] = field(default="triton")
+    model_max_length: int = field(
+        default=512,
+        metadata={
+            "help":
+            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+        },
+    )
+    double_quant: bool = field(
+        default=True,
+        metadata={"help": "Compress the quantization statistics through double quantization."}
+    )
+    quant_type: str = field(
+        default="nf4",
+        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
+    )
+    bits: int = field(
+        default=16,
+        metadata={"help": "How many bits to use."}
+    )
+    lora_enable: bool = False
+    lora_r: int = 64
+    lora_alpha: int = 16
+    lora_dropout: float = 0.05
+    lora_weight_path: str = ""
+    lora_bias: str = "none"  # presumably the `bias` mode given to get_peft_state_maybe_zero_3 ("none"/"all"/"lora_only") — confirm
+    mm_projector_lr: Optional[float] = None  # lr for params whose name contains "mm_projector" (LLaVATrainer.create_optimizer)
+    group_by_modality_length: bool = field(default=False)  # makes LLaVATrainer use LengthGroupedSampler
+    lr_multi: Optional[str] = field(default=None)  # "key:mult,key:mult" lr multipliers matched against param names
+
+
+def maybe_zero_3(param, ignore_status=False, name=None):
+    """Return a detached CPU clone of `param`; a param carrying a DeepSpeed `ds_id` is gathered first."""
+    from deepspeed import zero
+    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+    if hasattr(param, "ds_id"):
+        if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
+            logging.warning(f"{name}: param.ds_status == ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
+        with zero.GatheredParameters([param]):
+            param = param.data.detach().cpu().clone()
+    else:
+        param = param.detach().cpu().clone()
+    return param
+
+
+# Borrowed from peft.utils.get_peft_model_state_dict
+def get_peft_state_maybe_zero_3(named_params, bias):
+    # Collect LoRA weights (plus the biases selected by `bias`: "none", "all" or "lora_only") as CPU copies.
+    if bias == "none":
+        to_return = {k: t for k, t in named_params if "lora_" in k}
+    elif bias == "all":
+        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+    elif bias == "lora_only":
+        to_return = {}
+        maybe_lora_bias = {}
+        lora_bias_names = set()
+        for k, t in named_params:
+            if "lora_" in k:
+                to_return[k] = t
+                bias_name = k.split("lora_")[0] + "bias"
+                lora_bias_names.add(bias_name)
+            elif "bias" in k:
+                maybe_lora_bias[k] = t
+        for k, t in maybe_lora_bias.items():  # keep only biases that belong to a LoRA-wrapped module
+            if k in lora_bias_names:
+                to_return[k] = t
+    else:
+        raise NotImplementedError
+    return {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
+
+
+def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
+    # Gather every non-LoRA parameter (optionally only those requiring grad) as a CPU copy.
+    selected = {k: t for k, t in named_params if "lora_" not in k}
+    if require_grad_only:
+        selected = {k: t for k, t in selected.items() if t.requires_grad}
+    return {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in selected.items()}
+
+
+def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
+    # Select parameters whose name contains any entry of `keys_to_match` and return CPU copies of them.
+    matched = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
+    return {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in matched.items()}
+
+def find_all_linear_names(model):
+    # Collect the last name component of every nn.Linear module outside the multimodal parts, minus lm_head.
+    multimodal_keywords = ['mm_projector', 'vision_tower', 'vision_resampler', 'vlm_uni']
+    linear_cls = torch.nn.Linear
+    lora_module_names = set()
+    for name, module in model.named_modules():
+        if not isinstance(module, linear_cls):
+            continue
+        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
+            continue
+        lora_module_names.add(name.split('.')[-1])
+    # needed for 16-bit
+    lora_module_names.discard('lm_head')
+    return list(lora_module_names)
+
+
+def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
+                                   output_dir: str):
+    """Save the model: adapter-only weights, a DeepSpeed `save_model`, or a CPU state dict via `trainer._save`."""
+
+    if getattr(trainer.args, "tune_mm_mlp_adapter", False):
+        # Only save Adapter
+        keys_to_match = ['mm_projector', 'vision_resampler', 'vlm_uni']
+        # add vision tower
+        keys_to_match.extend(['vision_tower'])
+        # add vision tower aux
+        keys_to_match.extend(['vision_fpn', 'vision_stages'])
+        if getattr(trainer.args, "use_im_start_end", False):
+            keys_to_match.extend(['embed_tokens', 'embed_in'])
+
+        weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
+        trainer.model.config.save_pretrained(output_dir)  # NOTE(review): runs on every rank, unlike the torch.save below
+
+        current_folder = output_dir.split('/')[-1]  # NOTE(review): '' when output_dir ends with '/'
+        parent_folder = os.path.dirname(output_dir)
+        if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
+            if current_folder.startswith('checkpoint-'):  # checkpoint dirs go to <parent>/mm_projector/<checkpoint-N>.bin
+                mm_projector_folder = os.path.join(parent_folder, "mm_projector")
+                os.makedirs(mm_projector_folder, exist_ok=True)
+                torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
+            else:
+                torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
+        return
+
+    if trainer.deepspeed:
+        torch.cuda.synchronize()
+        trainer.save_model(output_dir)
+        return
+    # Plain path: move the state dict to CPU before handing it to the trainer.
+    state_dict = trainer.model.state_dict()
+    if trainer.args.should_save:
+        cpu_state_dict = {
+            key: value.cpu()
+            for key, value in state_dict.items()
+        }
+        del state_dict
+        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+def smart_tokenizer_and_embedding_resize(
+    special_tokens_dict: Dict,
+    tokenizer: transformers.PreTrainedTokenizer,
+    model: transformers.PreTrainedModel,
+):
+    """Add special tokens, resize the embeddings, and mean-initialise the newly added rows.
+
+    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+    """
+    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    model.resize_token_embeddings(len(tokenizer))
+
+    if num_new_tokens <= 0:
+        return
+
+    # New rows sit at the end of each matrix; fill them with the mean of the pre-existing rows.
+    old_rows, new_rows = slice(None, -num_new_tokens), slice(-num_new_tokens, None)
+    input_embeddings = model.get_input_embeddings().weight.data
+    output_embeddings = model.get_output_embeddings().weight.data
+    input_embeddings_avg = input_embeddings[old_rows].mean(dim=0, keepdim=True)
+    output_embeddings_avg = output_embeddings[old_rows].mean(dim=0, keepdim=True)
+    input_embeddings[new_rows] = input_embeddings_avg
+    output_embeddings[new_rows] = output_embeddings_avg
+
+
+def _tokenize_fn(strings: Sequence[str],
+                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
+    """Tokenize each string on its own and report the ids together with their non-pad lengths."""
+    encodings = []
+    for text in strings:
+        encodings.append(
+            tokenizer(
+                text,
+                return_tensors="pt",
+                padding="longest",
+                max_length=tokenizer.model_max_length,
+                truncation=True,
+            )
+        )
+
+    # `labels` is the very same list object as `input_ids`, not a copy.
+    token_ids = [encoding.input_ids[0] for encoding in encodings]
+    # Length = number of ids that differ from the pad token.
+    lengths = [encoding.input_ids.ne(tokenizer.pad_token_id).sum().item() for encoding in encodings]
+    return dict(
+        input_ids=token_ids,
+        labels=token_ids,
+        input_ids_lens=lengths,
+        labels_lens=lengths,
+    )
+
+
+def _mask_targets(target, tokenized_lens, speakers):
+    # Overwrite, in place, the header span and every "human" turn of `target` with IGNORE_INDEX.
+    # The first two positions of a human turn are left untouched.
+    start = tokenized_lens[0]
+    target[:start] = IGNORE_INDEX
+    for turn_len, speaker in zip(tokenized_lens[1:], speakers):
+        if speaker == "human":
+            target[start + 2:start + turn_len] = IGNORE_INDEX
+        start += turn_len
+
+
+def _add_speaker_and_signal(header, source, get_conversation=True):
+    """Prefix each sentence with its speaker and wrap it in begin/end signals (mutates `source`)."""
+    BEGIN_SIGNAL = "### "
+    END_SIGNAL = "\n"
+    conversation = header
+    for sentence in source:
+        speaker = sentence["from"].lower()
+        if speaker == "human":
+            from_str = conversation_lib.default_conversation.roles[0]
+        elif speaker == "gpt":
+            from_str = conversation_lib.default_conversation.roles[1]
+        else:
+            from_str = 'unknown'
+        # The wrapped text replaces the sentence value even when it is not appended to the conversation.
+        wrapped = BEGIN_SIGNAL + from_str + ": " + sentence["value"] + END_SIGNAL
+        sentence["value"] = wrapped
+        if get_conversation:
+            conversation += wrapped
+    return conversation + BEGIN_SIGNAL
+
+
+def preprocess_multimodal(
+    sources: Sequence[str],
+    data_args: DataArguments,
+) -> Dict:  # NOTE(review): the code returns `sources` itself (sentences mutated in place), not a dict
+    is_multimodal = data_args.is_multimodal
+    if not is_multimodal:
+        return sources
+    # Normalise image placeholders: move the image token to the front of a sentence, then optionally wrap it.
+    for source in sources:
+        for sentence in source:
+            if DEFAULT_IMAGE_TOKEN in sentence['value']:
+                sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
+                sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value']
+                sentence['value'] = sentence['value'].strip()
+                if "mmtag" in conversation_lib.default_conversation.version:
+                    sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN, '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
+            replace_token = DEFAULT_IMAGE_TOKEN
+            if data_args.mm_use_im_start_end:  # NOTE(review): not declared on DataArguments; presumably set by the caller — confirm
+                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
+            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
+
+    return sources
+
+
+def preprocess_llama_2(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    has_image: bool = False
+) -> Dict:  # returns dict(input_ids=..., labels=...); labels have the instruction parts set to IGNORE_INDEX
+    conv = conversation_lib.default_conversation.copy()
+    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+    # Apply prompt templates
+    conversations = []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != conv.roles[0]:
+            # Skip the first one if it is not from human
+            source = source[1:]
+
+        conv.messages = []
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            assert role == conv.roles[j % 2], f"{i}"  # turns must strictly alternate between the two roles
+            conv.append_message(role, sentence["value"])
+        conversations.append(conv.get_prompt())
+
+    # Tokenize conversations
+    # With images, each prompt goes through tokenizer_image_token and the results are stacked.
+    if has_image:
+        input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+    else:
+        input_ids = tokenizer(
+            conversations,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        ).input_ids
+
+    targets = input_ids.clone()
+
+    assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
+
+    # Mask targets
+    sep = "[/INST] "
+    for conversation, target in zip(conversations, targets):
+        total_len = int(target.ne(tokenizer.pad_token_id).sum())  # number of non-pad tokens
+
+        rounds = conversation.split(conv.sep2)
+        cur_len = 1  # presumably skips a leading BOS token — TODO confirm
+        target[:cur_len] = IGNORE_INDEX
+        for i, rou in enumerate(rounds):
+            if rou == "":
+                break
+
+            parts = rou.split(sep)
+            if len(parts) != 2:
+                break
+            parts[0] += sep
+
+            if has_image:
+                round_len = len(tokenizer_image_token(rou, tokenizer))
+                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2  # NOTE(review): -2 presumably offsets tokenizer-added tokens — confirm
+            else:
+                round_len = len(tokenizer(rou).input_ids)
+                instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+
+            cur_len += round_len
+        target[cur_len:] = IGNORE_INDEX  # mask everything after the last processed round
+
+        if cur_len < tokenizer.model_max_length:
+            if cur_len != total_len:
+                target[:] = IGNORE_INDEX  # length mismatch: the whole sample is masked out
+                print(
+                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                    f" (ignored)"
+                )
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+    )
+
+def preprocess_v1(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    has_image: bool = False
+) -> Dict:  # returns dict(input_ids=..., labels=...); labels have the instruction parts set to IGNORE_INDEX
+    conv = conversation_lib.default_conversation.copy()
+    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+    # Apply prompt templates
+    conversations = []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != conv.roles[0]:
+            # Skip the first one if it is not from human
+            source = source[1:]
+
+        conv.messages = []
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            assert role == conv.roles[j % 2], f"{i}"  # turns must strictly alternate between the two roles
+            conv.append_message(role, sentence["value"])
+        conversations.append(conv.get_prompt())
+
+    # Tokenize conversations
+    # With images, each prompt goes through tokenizer_image_token and the results are stacked.
+    if has_image:
+        input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+    else:
+        input_ids = tokenizer(
+            conversations,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        ).input_ids
+
+    targets = input_ids.clone()
+    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
+
+    # Mask targets
+    sep = conv.sep + conv.roles[1] + ": "
+    for conversation, target in zip(conversations, targets):
+        total_len = int(target.ne(tokenizer.pad_token_id).sum())  # number of non-pad tokens
+
+        rounds = conversation.split(conv.sep2)
+        cur_len = 1  # presumably skips a leading BOS token — TODO confirm
+        target[:cur_len] = IGNORE_INDEX
+        for i, rou in enumerate(rounds):
+            if rou == "":
+                break
+
+            parts = rou.split(sep)
+            if len(parts) != 2:
+                print(f"WARNING: parts!=: {parts}")
+                break
+            parts[0] += sep
+
+            if has_image:
+                round_len = len(tokenizer_image_token(rou, tokenizer))
+                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2  # NOTE(review): -2 presumably offsets tokenizer-added tokens — confirm
+            else:
+                round_len = len(tokenizer(rou).input_ids)
+                instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+            if i != 0 and not getattr(tokenizer, "legacy", False) and IS_TOKENIZER_GREATER_THAN_0_14:  # non-legacy tokenizer, tokenizers>=0.14: one token fewer after round 0
+                round_len -= 1
+                instruction_len -= 1
+
+            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+
+            cur_len += round_len
+        target[cur_len:] = IGNORE_INDEX  # mask everything after the last processed round
+
+        if cur_len < tokenizer.model_max_length:
+            if cur_len != total_len:
+                target[:] = IGNORE_INDEX  # length mismatch: the whole sample is masked out
+                print(
+                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                    f" (ignored)"
+                )
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+    )
+
+def preprocess_gemma(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    has_image: bool = False
+) -> Dict:
+    """Render sources with the default (SeparatorStyle.GEMMA) template, tokenize them, and mask instruction tokens."""
+    conv = conversation_lib.default_conversation.copy()
+    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+    # Apply prompt templates
+    conversations = []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != conv.roles[0]:
+            # Skip the first one if it is not from human
+            source = source[1:]
+
+        conv.messages = []
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            assert role == conv.roles[j % 2], f"{i}"
+            conv.append_message(role, sentence["value"])
+        conversations.append(conv.get_prompt())
+
+    # Tokenize conversations; the image path stacks per-prompt tensors, so they must share one length.
+    if has_image:
+        input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+    else:
+        input_ids = tokenizer(
+            conversations,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        ).input_ids
+
+    targets = input_ids.clone()
+    assert conv.sep_style == conversation_lib.SeparatorStyle.GEMMA
+
+    # Mask targets: the first `instruction_len` tokens of every round are set to IGNORE_INDEX.
+    sep = "<start_of_turn>" + conv.sep + conv.roles[1] + "\n"
+    for conversation, target in zip(conversations, targets):
+        total_len = int(target.ne(tokenizer.pad_token_id).sum())  # number of non-pad tokens
+
+        rounds = conversation.split(conv.sep2)
+        cur_len = 1
+        target[:cur_len] = IGNORE_INDEX  # presumably the <bos> token -- TODO confirm
+        for i, rou in enumerate(rounds):
+            if rou == "":
+                break
+            # Split the round into its instruction (up to and including `sep`) and the reply.
+            parts = rou.split(sep)
+            if len(parts) != 2:
+                print(f"WARNING: parts!=: {parts}")
+                break
+            parts[0] += sep
+            # Token counts of the whole round and of its instruction part.
+            if has_image:
+                round_len = len(tokenizer_image_token(rou, tokenizer))
+                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1 # exclude <bos>
+            else:
+                round_len = len(tokenizer(rou).input_ids)
+                instruction_len = len(tokenizer(parts[0]).input_ids) - 1 # exclude <bos>
+
+            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+
+            cur_len += round_len
+        target[cur_len:] = IGNORE_INDEX  # mask whatever follows the last processed round
+        # Below the length limit, a count that differs from the non-pad length masks the whole sample.
+        if cur_len < tokenizer.model_max_length:
+            if cur_len != total_len:
+                target[:] = IGNORE_INDEX
+                print(
+                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                    f" (ignored)"
+                )
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+    )
+
+def preprocess_mpt(
+    sources,
+    tokenizer: transformers.PreTrainedTokenizer,
+    has_image: bool = False
+) -> Dict:
+    """Render sources with the default (SeparatorStyle.MPT) template, tokenize them, and mask instruction tokens."""
+    conv = conversation_lib.default_conversation.copy()
+    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+    # Apply prompt templates
+    conversations = []
+    for i, source in enumerate(sources):
+        if roles[source[0]["from"]] != conv.roles[0]:
+            # Skip the first one if it is not from human
+            source = source[1:]
+
+        conv.messages = []
+        for j, sentence in enumerate(source):
+            role = roles[sentence["from"]]
+            assert role == conv.roles[j % 2], f"{i}"
+            conv.append_message(role, sentence["value"])
+        conversations.append(conv.get_prompt())
+
+    # Tokenize conversations; the image path stacks per-prompt tensors, so they must share one length.
+    if has_image:
+        input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+    else:
+        input_ids = tokenizer(
+            conversations,
+            return_tensors="pt",
+            padding="longest",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+        ).input_ids
+
+    targets = input_ids.clone()
+    assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
+
+    # Mask targets: the first `instruction_len` tokens of every round are set to IGNORE_INDEX.
+    sep = conv.sep + conv.roles[1]
+    for conversation, target in zip(conversations, targets):
+        total_len = int(target.ne(tokenizer.pad_token_id).sum())  # number of non-pad tokens
+
+        rounds = conversation.split(conv.sep)
+        re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
+        for conv_idx in range(3, len(rounds), 2):
+            re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2]))    # user + gpt
+        cur_len = 0
+        target[:cur_len] = IGNORE_INDEX  # no-op: cur_len is 0 at this point
+        for i, rou in enumerate(re_rounds):
+            if rou == "":
+                break
+            # Split the round into its instruction (up to and including `sep`) and the reply.
+            parts = rou.split(sep)
+            if len(parts) != 2:
+                break
+            parts[0] += sep
+            # not included <|im_end|>
+            if has_image: 
+                round_len = len(tokenizer_image_token(rou, tokenizer))
+                instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 1
+            else:
+                round_len = len(tokenizer(rou).input_ids)
+                instruction_len = len(tokenizer(parts[0]).input_ids) - 1
+
+            # include <|im_end|> for all rounds
+            # if i != 0 and getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14:
+            if getattr(tokenizer, 'legacy', False) and IS_TOKENIZER_GREATER_THAN_0_14:
+                round_len += 1
+                instruction_len += 1
+
+            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
+
+            cur_len += round_len
+        target[cur_len:] = IGNORE_INDEX  # mask whatever follows the last processed round
+        # Below the length limit, a count that differs from the non-pad length masks the whole sample.
+        if cur_len < tokenizer.model_max_length:
+            if cur_len != total_len:
+                target[:] = IGNORE_INDEX
+                print(
+                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                    f" (ignored)"
+                )
+
+    return dict(
+        input_ids=input_ids,
+        labels=targets,
+    )
+
+
+def preprocess_plain(
+    sources: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+) -> Dict:
+    """Build "<image> + reply" samples whose labels ignore the leading image placeholder."""
+    conversations = []
+    for pair in sources:
+        assert len(pair) == 2
+        assert DEFAULT_IMAGE_TOKEN in pair[0]['value']
+        pair[0]['value'] = DEFAULT_IMAGE_TOKEN  # in place: the first turn keeps only the image token
+        conversations.append(
+            pair[0]['value'] + pair[1]['value'] + conversation_lib.default_conversation.sep
+        )
+    input_ids = []
+    for text in conversations:
+        input_ids.append(tokenizer_image_token(text, tokenizer, return_tensors='pt'))
+    targets = copy.deepcopy(input_ids)
+    for labels, pair in zip(targets, sources):
+        labels[:len(tokenizer_image_token(pair[0]['value'], tokenizer))] = IGNORE_INDEX
+    return {"input_ids": input_ids, "labels": targets}
+
+def preprocess_plain_guided(
+    sources: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+    prompt: str = None,
+) -> Dict:
+    """Build "<image> + reply" samples and also return each first turn's text under "prompt" (argument `prompt` is unused)."""
+    guided_prompt, conversations = [], []
+    for pair in sources:
+        assert len(pair) == 2
+        question = pair[0]['value']
+        assert DEFAULT_IMAGE_TOKEN in question
+        guided_prompt.append(question.replace(DEFAULT_IMAGE_TOKEN, '').replace('\n', ''))
+        pair[0]['value'] = DEFAULT_IMAGE_TOKEN  # in place: the first turn keeps only the image token
+        conversations.append(
+            pair[0]['value'] + pair[1]['value'] + conversation_lib.default_conversation.sep
+        )
+    input_ids = []
+    for text in conversations:
+        input_ids.append(tokenizer_image_token(text, tokenizer, return_tensors='pt'))
+    targets = copy.deepcopy(input_ids)
+    for labels, pair in zip(targets, sources):
+        labels[:len(tokenizer_image_token(pair[0]['value'], tokenizer))] = IGNORE_INDEX
+    return {"input_ids": input_ids, "labels": targets, "prompt": guided_prompt}
+
+def preprocess(
+    sources: Sequence[str],
+    tokenizer: transformers.PreTrainedTokenizer,
+    has_image: bool = False,
+    prompt: str = None,
+    refine_prompt: bool = False,
+) -> Dict:
+    """
+    Dispatch to the preprocessing routine matching the default conversation.
+
+    The checks run in this order: version prefix "plain_guided", separator
+    style PLAIN, separator style LLAMA_2, version prefix "v1", version prefix
+    "gemma", version equal to "mpt". If none matches, the generic path runs:
+    1. Build each conversation with `_add_speaker_and_signal` and the system header;
+    2. Tokenize the concatenated conversation;
+    3. Deep-copy the ids as labels and mask them with `_mask_targets`.
+    `prompt` is only forwarded to the "plain_guided" routine; `refine_prompt` is unused here.
+    """
+    template = conversation_lib.default_conversation
+    styles = conversation_lib.SeparatorStyle
+    if template.version.startswith("plain_guided"):
+        return preprocess_plain_guided(sources, tokenizer, prompt=prompt)
+    if template.sep_style == styles.PLAIN:
+        return preprocess_plain(sources, tokenizer)
+    if template.sep_style == styles.LLAMA_2:
+        return preprocess_llama_2(sources, tokenizer, has_image=has_image)
+    if template.version.startswith("v1"):
+        return preprocess_v1(sources, tokenizer, has_image=has_image)
+    if template.version.startswith("gemma"):
+        return preprocess_gemma(sources, tokenizer, has_image=has_image)
+    if template.version == "mpt":
+        return preprocess_mpt(sources, tokenizer, has_image=has_image)
+
+    # Generic path: every conversation starts with the same system header.
+    header = f"{template.system}\n\n"
+    conversations = [_add_speaker_and_signal(header, source) for source in sources]
+
+    if has_image:
+        input_ids = [tokenizer_image_token(text, tokenizer, return_tensors='pt') for text in conversations]
+    else:
+        input_ids = _tokenize_fn(conversations, tokenizer)["input_ids"]
+
+    targets = copy.deepcopy(input_ids)
+    for target, source in zip(targets, sources):
+        # Token lengths of the header and of each message drive the masking.
+        pieces = [header] + [s["value"] for s in source]
+        if has_image:
+            tokenized_lens = [len(tokenizer_image_token(piece, tokenizer)) for piece in pieces]
+        else:
+            tokenized_lens = _tokenize_fn(pieces, tokenizer)["input_ids_lens"]
+        speakers = [sentence["from"] for sentence in source]
+        _mask_targets(target, tokenized_lens, speakers)
+
+    return {"input_ids": input_ids, "labels": targets}
+
+
+class LazySupervisedDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+
+    def __init__(self, data_path: str,
+                 tokenizer: transformers.PreTrainedTokenizer,
+                 data_args: DataArguments):
+        super(LazySupervisedDataset, self).__init__()
+        # Use a context manager so the annotation file is closed right after parsing.
+        with open(data_path, "r") as f:
+            list_data_dict = json.load(f)
+
+        rank0_print("Formatting inputs...Skip in lazy mode")
+        self.tokenizer = tokenizer
+        self.list_data_dict = list_data_dict
+        self.data_args = data_args
+
+    def __len__(self):
+        return len(self.list_data_dict)
+
+    @property
+    def lengths(self):
+        """Whitespace word count of each sample, plus a flat 128 when it has an 'image' key."""
+        length_list = []
+        for sample in self.list_data_dict:
+            img_tokens = 128 if 'image' in sample else 0
+            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
+        return length_list
+
+    @property
+    def modality_lengths(self):
+        """Whitespace word count of each sample, negated when it has no 'image' key."""
+        length_list = []
+        for sample in self.list_data_dict:
+            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
+            cur_len = cur_len if ('image' in sample) else -cur_len
+            length_list.append(cur_len)
+        return length_list
+
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        """Return sample `i`; if it fails to load, retry with random indices.
+
+        Raises:
+            RuntimeError: when all 10 attempts fail (chained to the last error).
+        """
+        max_attempt, last_error = 10, None
+        for _ in range(max_attempt):
+            try:
+                # sample an item
+                return self._sample_item(i)
+            except Exception as exc:  # not a bare except: let KeyboardInterrupt/SystemExit through
+                last_error = exc
+                print(f"Error in loading {i}, retrying...")
+                i = random.randint(0, len(self.list_data_dict)-1)
+        # Every attempt failed: surface the last error instead of returning an unbound result.
+        raise RuntimeError(f"Failed to load a sample after {max_attempt} attempts") from last_error
+
+    @staticmethod
+    def _expand2square(pil_img, background_color):
+        """Center `pil_img` on a square canvas of `background_color`; square inputs are returned as-is."""
+        width, height = pil_img.size
+        if width == height:
+            return pil_img
+        side = max(width, height)
+        result = Image.new(pil_img.mode, (side, side), background_color)
+        result.paste(pil_img, ((side - width) // 2, (side - height) // 2))
+        return result
+
+    def _sample_item(self, i) -> Dict[str, torch.Tensor]:
+        """Load the images of sample `i` (if any) and tokenize its conversation.
+
+        The result holds `input_ids` and `labels`, plus `image` (and `image_aux`
+        when `data_args.image_size_raw` exists) for image samples or multimodal runs.
+        """
+        image = None
+        sources = self.list_data_dict[i]
+        if isinstance(i, int):
+            sources = [sources]
+        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME
+
+        if 'image' in sources[0]:
+            image_files = self.list_data_dict[i]['image']
+            image_folder = self.data_args.image_folder
+            processor = self.data_args.image_processor
+
+            image_total = []
+            if not isinstance(image_files, list):
+                image_files = [image_files]
+
+            for image_file in image_files:
+                # convert image type for OCR VQA dataset
+                if 'ocr' in image_file:
+                    if not os.path.exists(os.path.join(image_folder, image_file)):
+                        image_file = image_file.replace(".jpg", ".png")
+
+                # convert image for VG dataset
+                elif 'VG_100K' in image_file:
+                    image_file = image_file.replace('VG_100K_2', 'images')
+                    image_file = image_file.replace('VG_100K', 'images')
+
+                image = Image.open(os.path.join(image_folder, image_file)).convert('RGB')
+                if self.data_args.image_aspect_ratio == 'pad':
+                    # Pad to a square with the processor's mean colour before preprocessing.
+                    image = self._expand2square(image, tuple(int(x*255) for x in processor.image_mean))
+                image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
+                image_total.append(image)
+
+            if len(image_total) > 1:
+                image = torch.stack(image_total, dim=0)
+            else:
+                image = image_total[0]
+
+            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args)
+        else:
+            sources = copy.deepcopy([e["conversations"] for e in sources])
+
+        has_image = ('image' in self.list_data_dict[i])
+        data_dict = preprocess(sources, self.tokenizer, has_image=has_image)
+
+        if isinstance(i, int):
+            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])
+
+        # generate 1 raw image and 1 aux image as input
+        if hasattr(self.data_args, 'image_size_raw') and (image is not None):
+            data_dict['image_aux'] = image.clone()
+            raw_shape = [self.data_args.image_size_raw['height'] * self.data_args.image_grid,
+                         self.data_args.image_size_raw['width'] * self.data_args.image_grid]
+            # only apply when input is image
+            if 'image' in self.list_data_dict[i]:
+                is_single = len(image.shape) == 3
+                image = torch.nn.functional.interpolate(image[None] if is_single else image,
+                                                        size=raw_shape,
+                                                        mode='bilinear',
+                                                        align_corners=False)
+                if is_single:
+                    image = image[0]
+        # image exist in the data
+        if 'image' in self.list_data_dict[i]:
+            data_dict['image'] = image
+        elif self.data_args.is_multimodal:
+            # image does not exist in the data, but the model is multimodal
+            crop_size = self.data_args.image_processor.crop_size
+            if hasattr(self.data_args, 'image_size_raw'):
+                data_dict['image'] = torch.zeros(3,
+                                                 self.data_args.image_size_raw['height'] * self.data_args.image_grid,
+                                                 self.data_args.image_size_raw['width'] * self.data_args.image_grid)
+                data_dict['image_aux'] = torch.zeros(3, crop_size['height'], crop_size['width'])
+            else:
+                data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
+
+        # split the raw image into image_grid x image_grid crops
+        if 'image' in data_dict and self.data_args.image_grid >= 2:
+            raw_image = data_dict['image'].reshape(3,
+                                                   self.data_args.image_grid,
+                                                   self.data_args.image_size_raw['height'],
+                                                   self.data_args.image_grid,
+                                                   self.data_args.image_size_raw['width'])
+            raw_image = raw_image.permute(1, 3, 0, 2, 4)
+            raw_image = raw_image.reshape(-1, 3,
+                                          self.data_args.image_size_raw['height'],
+                                          self.data_args.image_size_raw['width'])
+
+            if self.data_args.image_global:
+                global_image = data_dict['image']
+                if len(global_image.shape) == 3:
+                    global_image = global_image[None]
+                global_image = torch.nn.functional.interpolate(global_image,
+                                                        size=[self.data_args.image_size_raw['height'],
+                                                              self.data_args.image_size_raw['width']],
+                                                        mode='bilinear',
+                                                        align_corners=False)
+                # [image_crops, image_global]
+                raw_image = torch.cat([raw_image, global_image], dim=0)
+            data_dict['image'] = raw_image.contiguous()
+
+        return data_dict
+
+
+@dataclass
+class DataCollatorForSupervisedDataset(object):
+    """Collate examples for supervised fine-tuning."""
+
+    tokenizer: transformers.PreTrainedTokenizer
+
+    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
+        """Right-pad ids/labels, clip them to `model_max_length`, and attach the images.
+
+        Images are stacked into one tensor only when the batch holds more than
+        one sample and every entry has the same shape (for `image`, entries
+        with `len(x) == 2` are additionally excluded); otherwise the plain
+        list is returned.
+        """
+        pad_id = self.tokenizer.pad_token_id
+        max_len = self.tokenizer.model_max_length
+        pad_sequence = torch.nn.utils.rnn.pad_sequence
+
+        input_ids = [instance["input_ids"] for instance in instances]
+        labels = [instance["labels"] for instance in instances]
+        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=pad_id)[:, :max_len]
+        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)[:, :max_len]
+        batch = {
+            "input_ids": input_ids,
+            "labels": labels,
+            "attention_mask": input_ids.ne(pad_id),
+        }
+
+        head = instances[0]
+        if 'image' in head:
+            images = [instance['image'] for instance in instances]
+            # not concat for couple images
+            same = all(x is not None and x.shape == images[0].shape and len(x) != 2 for x in images)
+            batch['images'] = torch.stack(images) if same and len(images) > 1 else images
+
+        if 'image_aux' in head:
+            aux = [instance['image_aux'] for instance in instances]
+            same = all(x is not None and x.shape == aux[0].shape for x in aux)
+            batch['images_aux'] = torch.stack(aux) if same and len(aux) > 1 else aux
+
+        return batch
+
+
+def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
+                                data_args) -> Dict:
+    """Build the lazy training dataset and its collator; no eval dataset is created."""
+    return {
+        "train_dataset": LazySupervisedDataset(
+            data_path=data_args.data_path, tokenizer=tokenizer, data_args=data_args
+        ),
+        "eval_dataset": None,
+        "data_collator": DataCollatorForSupervisedDataset(tokenizer=tokenizer),
+    }
+
+
+def train(attn_implementation=None):
+    global local_rank
+
+    parser = transformers.HfArgumentParser(
+        (ModelArguments, DataArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    local_rank = training_args.local_rank
+    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
+
+    bnb_model_from_pretrained_args = {}
+    if training_args.bits in [4, 8]:
+        from transformers import BitsAndBytesConfig
+        bnb_model_from_pretrained_args.update(dict(
+            device_map={"": training_args.device},
+            load_in_4bit=training_args.bits == 4,
+            load_in_8bit=training_args.bits == 8,
+            quantization_config=BitsAndBytesConfig(
+                load_in_4bit=training_args.bits == 4,
+                load_in_8bit=training_args.bits == 8,
+                llm_int8_skip_modules=["mm_projector"],
+                llm_int8_threshold=6.0,
+                llm_int8_has_fp16_weight=False,
+                bnb_4bit_compute_dtype=compute_dtype,
+                bnb_4bit_use_double_quant=training_args.double_quant,
+                bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
+            )
+        ))
+
+    if model_args.vision_tower is not None:
+        if "mistral" in model_args.model_name_or_path:
+            model = MiniGeminiMistralForCausalLM.from_pretrained(
+                model_args.model_name_or_path,
+                cache_dir=training_args.cache_dir,
+                attn_implementation=attn_implementation,
+                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+                **bnb_model_from_pretrained_args
+            )
+        elif "mixtral" in model_args.model_name_or_path:
+            model = MiniGeminiMixtralForCausalLM.from_pretrained(
+                model_args.model_name_or_path,
+                cache_dir=training_args.cache_dir,
+                attn_implementation=attn_implementation,
+                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+                **bnb_model_from_pretrained_args
+            )
+            from deepspeed.utils import set_z3_leaf_modules
+            set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
+        elif "gemma" in model_args.model_name_or_path:
+            model = MiniGeminiGemmaForCausalLM.from_pretrained(
+                model_args.model_name_or_path,
+                cache_dir=training_args.cache_dir,
+                attn_implementation=attn_implementation,
+                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+                **bnb_model_from_pretrained_args
+            )
+        else:
+            model = MiniGeminiLlamaForCausalLM.from_pretrained(
+                model_args.model_name_or_path,
+                cache_dir=training_args.cache_dir,
+                attn_implementation=attn_implementation,
+                torch_dtype=(torch.bfloat16 if training_args.bf16 else None),   
+                **bnb_model_from_pretrained_args
+            )
+    else:
+        model = transformers.LlamaForCausalLM.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            attn_implementation=attn_implementation,
+            torch_dtype=(torch.bfloat16 if training_args.bf16 else None),
+            **bnb_model_from_pretrained_args
+        )
+    model.config.use_cache = False
+
+    if model_args.freeze_backbone:
+        model.model.requires_grad_(False)
+
+    if training_args.bits in [4, 8]:
+        from peft import prepare_model_for_kbit_training
+        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
+        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
+
+    if training_args.gradient_checkpointing:
+        if hasattr(model, "enable_input_require_grads"):
+            model.enable_input_require_grads()
+        else:
+            def make_inputs_require_grad(module, input, output):
+                output.requires_grad_(True)
+            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
+
+    if training_args.lora_enable:
+        from peft import LoraConfig, get_peft_model
+        lora_config = LoraConfig(
+            r=training_args.lora_r,
+            lora_alpha=training_args.lora_alpha,
+            target_modules=find_all_linear_names(model),
+            lora_dropout=training_args.lora_dropout,
+            bias=training_args.lora_bias,
+            task_type="CAUSAL_LM",
+        )
+        if training_args.bits == 16:
+            if training_args.bf16:
+                model.to(torch.bfloat16)
+            if training_args.fp16:
+                model.to(torch.float16)
+        rank0_print("Adding LoRA adapters...")
+        model = get_peft_model(model, lora_config)
+
+    if 'mpt' in model_args.model_name_or_path:
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            model_max_length=training_args.model_max_length,
+            padding_side="right"
+        )
+    elif "gemma" in model_args.model_name_or_path:
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            model_max_length=training_args.model_max_length,
+            padding_side="right",
+        )
+    else:
+        # fix bugs after special token with use_fast=True
+        tokenizer = transformers.AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path,
+            cache_dir=training_args.cache_dir,
+            model_max_length=training_args.model_max_length,
+            padding_side="right",
+            use_fast=False,
+        )
+
+    if model_args.version == "v0":
+        if tokenizer.pad_token is None:
+            smart_tokenizer_and_embedding_resize(
+                special_tokens_dict=dict(pad_token="[PAD]"),
+                tokenizer=tokenizer,
+                model=model,
+            )
+    elif model_args.version == "v0.5":
+        tokenizer.pad_token = tokenizer.unk_token
+    elif "gemma" in model_args.version:        
+        if model_args.version in conversation_lib.conv_templates:
+            conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
+        else:
+            conversation_lib.default_conversation = conversation_lib.conv_templates["gemma"]
+    else:
+        tokenizer.pad_token = tokenizer.unk_token
+        if model_args.version in conversation_lib.conv_templates:
+            conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
+        else:
+            conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]
+
+    if model_args.vision_tower is not None:
+        model.get_model().initialize_vision_modules(
+            model_args=model_args,
+            fsdp=training_args.fsdp
+        )
+        
+        vision_tower = model.get_vision_tower()
+        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
+
+        data_args.image_processor = copy.deepcopy(vision_tower.image_processor)
+        data_args.video_processor = copy.deepcopy(vision_tower.image_processor)
+        data_args.is_multimodal = True
+
+        model.config.image_grid = data_args.image_grid
+        model.config.image_global = data_args.image_global
+        model.config.image_aspect_ratio = data_args.image_aspect_ratio
+        model.config.image_grid_pinpoints = data_args.image_grid_pinpoints
+        model.config.tokenizer_padding_side = tokenizer.padding_side
+        model.config.tokenizer_model_max_length = tokenizer.model_max_length
+
+        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
+        if model_args.tune_mm_mlp_adapter:
+            model.requires_grad_(False)
+            for p in model.get_model().mm_projector.parameters():
+                p.requires_grad = True
+
+        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
+        if training_args.freeze_mm_mlp_adapter:
+            for p in model.get_model().mm_projector.parameters():
+                p.requires_grad = False
+
+        if training_args.bits in [4, 8]:
+            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
+
+        if model_args.optimize_vision_tower:
+            print('Optimize last 1/2 layers in vision tower')
+            total_num = len(vision_tower.vision_tower.vision_model.encoder.layers)
+            for _idx in range(total_num//2, total_num):
+                vision_tower.vision_tower.vision_model.encoder.layers[_idx].requires_grad_(True)
+
+        model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
+        model.config.mm_projector_lr = training_args.mm_projector_lr
+        training_args.use_im_start_end = model_args.mm_use_im_start_end
+        model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
+        model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)
+
+    if model_args.vision_tower_aux is not None:
+        vision_tower_aux = model.get_vision_tower_aux()
+        vision_tower_aux.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)
+
+        assert data_args.image_processor.image_mean == vision_tower_aux.config['preprocess_cfg']['mean'] \
+                and data_args.image_processor.image_std == vision_tower_aux.config['preprocess_cfg']['std'], \
+                'image processor should be the same'
+        
+        if model_args.optimize_vision_tower_aux:
+            print('Optimize last layer of each block in vision tower aux')
+            for _idx in range(len(vision_tower_aux.vision_stages)):
+                vision_tower_aux.vision_stages[_idx].blocks[-1].requires_grad_(True)
+        
+        data_args.image_size_raw = data_args.image_processor.crop_size.copy()
+        model_args.image_size_aux = data_args.image_size_aux
+        data_args.image_processor.crop_size['height'] = data_args.image_size_aux
+        data_args.image_processor.crop_size['width'] = data_args.image_size_aux
+        data_args.image_processor.size['shortest_edge'] = data_args.image_size_aux
+
+        model.get_model().initialize_uni_modules(model_args)
+
+    if training_args.bits in [4, 8]:
+        from peft.tuners.lora import LoraLayer
+        for name, module in model.named_modules():
+            if isinstance(module, LoraLayer):
+                if training_args.bf16:
+                    module = module.to(torch.bfloat16)
+            if 'norm' in name:
+                module = module.to(torch.float32)
+            if 'lm_head' in name or 'embed_tokens' in name:
+                if hasattr(module, 'weight'):
+                    if training_args.bf16 and module.weight.dtype == torch.float32:
+                        module = module.to(torch.bfloat16)
+
+    data_module = make_supervised_data_module(tokenizer=tokenizer,
+                                              data_args=data_args)
+    trainer = LLaVATrainer(model=model,
+                    tokenizer=tokenizer,
+                    args=training_args,
+                    **data_module)
+
+    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
+        trainer.train(resume_from_checkpoint=True)
+    else:
+        trainer.train()
+    trainer.save_state()
+
+    model.config.use_cache = True
+
+    if training_args.lora_enable:
+        state_dict = get_peft_state_maybe_zero_3(
+            model.named_parameters(), training_args.lora_bias
+        )
+        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
+            model.named_parameters()
+        )
+        if training_args.local_rank == 0 or training_args.local_rank == -1:
+            model.config.save_pretrained(training_args.output_dir)
+            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
+            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
+    else:
+        safe_save_model_for_hf_trainer(trainer=trainer,
+                                       output_dir=training_args.output_dir)
+
+
+if __name__ == "__main__":  # run as a script: call train() with no arguments
+    train()
diff --git a/minigemini/train/train_mem.py b/minigemini/train/train_mem.py
new file mode 100644
index 0000000000000000000000000000000000000000..3557763b0551ee8923bce8316358a24ef99bf34f
--- /dev/null
+++ b/minigemini/train/train_mem.py
@@ -0,0 +1,4 @@
+from minigemini.train.train import train
+
+if __name__ == "__main__":  # run as a script: train with attn_implementation="flash_attention_2"
+    train(attn_implementation="flash_attention_2")
\ No newline at end of file
diff --git a/minigemini/train/train_xformers.py b/minigemini/train/train_xformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..52024a69b9f9329abd56fefe183dbd3aaf82d72a
--- /dev/null
+++ b/minigemini/train/train_xformers.py
@@ -0,0 +1,13 @@
+# Training entry point that monkey patches the LLaMA model with xformers attention
+# to make it more memory efficient.
+# The patch needs to be applied before transformers is imported.
+from minigemini.train.llama_xformers_attn_monkey_patch import (
+    replace_llama_attn_with_xformers_attn,
+)
+
+replace_llama_attn_with_xformers_attn()
+
+from minigemini.train.train import train  # deliberately imported after the patch above has run
+
+if __name__ == "__main__":
+    train()
diff --git a/minigemini/utils.py b/minigemini/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e2e542fc72b0af5b7fa117794a3fdb3f97a2071
--- /dev/null
+++ b/minigemini/utils.py
@@ -0,0 +1,126 @@
+import datetime
+import logging
+import logging.handlers
+import os
+import sys
+
+import requests
+
+from minigemini.constants import LOGDIR
+
+server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
+
+handler = None
+
+
+def build_logger(logger_name, logger_filename):
+    """
+    Configure process-wide logging and return the logger named ``logger_name``.
+
+    Side effects: reformats the first root handler, replaces ``sys.stdout`` and
+    ``sys.stderr`` with ``StreamToLogger`` wrappers and, on the first call only,
+    creates a rotating file handler at ``LOGDIR/logger_filename`` and attaches
+    it to every logger registered at that moment.
+    """
+    global handler
+    log_format = logging.Formatter(
+        fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+
+    # Make sure the root logger has a handler, then give it the shared format.
+    root_logger = logging.getLogger()
+    if not root_logger.handlers:
+        logging.basicConfig(level=logging.INFO)
+    root_logger.handlers[0].setFormatter(log_format)
+
+    # Route stdout first, then stderr: each wrapper captures the current sys.stdout.
+    for stream_name, level in (("stdout", logging.INFO), ("stderr", logging.ERROR)):
+        stream_logger = logging.getLogger(stream_name)
+        stream_logger.setLevel(level)
+        setattr(sys, stream_name, StreamToLogger(stream_logger, level))
+
+    # Look the logger up before the handler block so it is registered in time.
+    named_logger = logging.getLogger(logger_name)
+    named_logger.setLevel(logging.INFO)
+
+    # One shared file handler, created on the first call only.
+    if handler is None:
+        os.makedirs(LOGDIR, exist_ok=True)
+        handler = logging.handlers.TimedRotatingFileHandler(
+            os.path.join(LOGDIR, logger_filename), when='D', utc=True, encoding='UTF-8')
+        handler.setFormatter(log_format)
+        for known in logging.root.manager.loggerDict.values():
+            if isinstance(known, logging.Logger):
+                known.addHandler(handler)
+    return named_logger
+
+
+class StreamToLogger(object):
+    """
+    File-like stand-in for a text stream that forwards writes to a logger.
+
+    Text is buffered until a newline arrives; every completed line is logged
+    at ``log_level`` with trailing whitespace removed.
+    """
+    def __init__(self, logger, log_level=logging.INFO):
+        self.logger = logger
+        self.log_level = log_level
+        self.linebuf = ''  # text received so far that has not been logged yet
+        self.terminal = sys.stdout  # fallback target used by __getattr__
+
+    def __getattr__(self, attr):
+        # Names not defined on this object are looked up on the captured sys.stdout.
+        return getattr(self.terminal, attr)
+
+    def write(self, buf):
+        pending, self.linebuf = self.linebuf + buf, ''
+        # keepends=True preserves each terminator; pieces that do not end in '\n'
+        # are put back into ``linebuf`` to be completed by a later write.
+        for piece in pending.splitlines(True):
+            if not piece.endswith('\n'):
+                self.linebuf += piece
+                continue
+            self.logger.log(self.log_level, piece.rstrip())
+
+    def flush(self):
+        if self.linebuf:
+            self.logger.log(self.log_level, self.linebuf.rstrip())
+        self.linebuf = ''
+
+
+def disable_torch_init():
+    """Speed up model creation by replacing ``reset_parameters`` on
+    ``torch.nn.Linear`` and ``torch.nn.LayerNorm`` with functions that do nothing."""
+    import torch
+    # The classes themselves are patched, not individual instances.
+    for layer_cls in (torch.nn.Linear, torch.nn.LayerNorm):
+        layer_cls.reset_parameters = lambda self: None
+
+
+def violates_moderation(text):
+    """
+    Return the ``flagged`` verdict of the OpenAI moderation endpoint for ``text``.
+
+    Best effort: a failed request or an unexpected response counts as not flagged.
+    Raises ``KeyError`` when ``OPENAI_API_KEY`` is missing from the environment.
+    """
+    import json  # function-scope import; the module header does not import json
+    url = "https://api.openai.com/v1/moderations"
+    headers = {"Content-Type": "application/json",
+               "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"]}
+    # Serialize with json.dumps so quotes, backslashes and control characters in
+    # ``text`` are escaped properly; newlines are removed before sending.
+    data = json.dumps({"input": text.replace("\n", "")}).encode("utf-8")
+    try:
+        ret = requests.post(url, headers=headers, data=data, timeout=5)
+        return ret.json()["results"][0]["flagged"]
+    except (requests.exceptions.RequestException, KeyError, IndexError, TypeError, ValueError):
+        return False
+
+
+def pretty_print_semaphore(semaphore):
+    """Render a semaphore's ``_value`` and ``locked()`` state as text; "None" when it is None."""
+    return ("None" if semaphore is None else
+            f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})")