Spaces:

nvidia
/

Eagle2-Demo

Running on Zero

File size: 8,223 Bytes

import logging
import re
from threading import Thread
from typing import List, Optional
import os
import torch
from transformers import (
    AutoModel,
    AutoProcessor,
    AutoConfig,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)

from .chat_utils import Conversation, get_conv_template

logger = logging.getLogger(__name__)


def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):

    token = os.environ.get("HF_TOKEN")
    # hotfix the model to use flash attention 2
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, use_auth_token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"
    print("Successfully set the attn_implementation to flash_attention_2")

    logger.info(f"token = {token[:4]}***{token[-2:]}")
    model = AutoModel.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        use_auth_token=token
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, use_auth_token=token)

    return model, processor

def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):

    token = os.environ.get("HF_TOKEN")
    # hotfix the model to use flash attention 2
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, use_auth_token=token)
    config._attn_implementation = "flash_attention_2"
    config.vision_config._attn_implementation = "flash_attention_2"
    config.text_config._attn_implementation = "flash_attention_2"
    print("Successfully set the attn_implementation to flash_attention_2")

    logger.info(f"token = {token[:4]}***{token[-2:]}")
    model = AutoModel.from_pretrained(
        model_path,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        use_auth_token=token
    )
    model.to("cuda")
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True, use_fast=True, use_auth_token=token)

    return model, processor

def load_model(model_path: str = "nvidia/Eagle-2.5-8B"):
    try:
        model, processor = load_model_from_nv(model_path)
    except Exception as e:
        logger.error(f"Failed to load model from HF, trying to load from eagle: {e}")
        model, processor = load_model_from_eagle()
    return model, processor

class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = [stop.to("cuda") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        for stop in self.stops:
            if input_ids.shape[-1] < len(stop):
                continue
            if torch.all((stop == input_ids[0][-len(stop) :])).item():
                return True

        return False


def preprocess(
    messages: list[dict],
    processor,
):
    """
    Build messages from the conversations and images.
    """
    # get images from conversations
    results = []
    # get texts from conversations
    # converstion = get_conv_template(sft_format)
    # only use the last 3 round of messages
    # latest_messages = messages[-3:]
    for mid, message in enumerate(messages):
        if message["role"] == "user":
            record = {
                "role": message["role"],
                "content": [],
            }
            if "images" in message:
                per_round_images = message["images"]
                for image in per_round_images:
                    record["content"].append(
                        {
                            "type": "image",
                            "image": image,
                        }
                    )
            if 'content' in message:
                record["content"].append(
                    {
                        "type": "text",
                        "text": str(message["content"]).strip(),
                    }
                )
            results.append(record)
        elif message["role"] == "assistant":
            formatted_answer = message["content"].strip()
            # ◁think▷用户说了“你好”，这是一个非常简单的问候，通常用于开启对话。我需要判断用户的意图。可能性一：用户只是礼貌性地打招呼，想要开启一段对话；可能性二：用户可能有更具体的需求，比如询问我的功能、功能或者需要帮助。由于用户没有提供更多信息，我需要保持开放，同时引导用户进一步说明他们的需求。
            # 我的回复需要既友好又开放，不能显得过于正式或冷漠。同时，我需要避免假设用户的具体需求，而是提供一个轻松的、鼓励继续对话的回应。◁/think▷你好！很高兴见到你。有什么我可以帮助你的吗
            # delete all the texts between ◁think▷ and ◁/think▷
            # FIXME: this is a hack to remove the thinking texts
            # formatted_answer = re.sub(r"◁think▷.*◁/think▷", "", formatted_answer)
            think_end_token = '◁/think▷'
            formatted_answer = formatted_answer.split(think_end_token)[-1]
            results.append(
                {
                    "role": message["role"],
                    "content": [
                        {
                            "type": "text",
                            "text": formatted_answer,
                        }
                    ],
                }
            )
            assert (
                formatted_answer.count(processor.image_token) == 0
            ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
            
    print(f"messages = {results}")
    text = processor.apply_chat_template(results, add_generation_prompt=False)
    print(f"raw text = {text}")
    
    image_inputs, video_inputs = processor.process_vision_info(results)

    inputs = processor(
        images=image_inputs,
        videos=video_inputs,
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return inputs


@torch.no_grad()
@torch.inference_mode()
def eagle_vl_generate(
    model: torch.nn.Module,
    processor: AutoProcessor,
    conversations: list[Conversation],
    stop_words: list,
    max_length: int = 256,
    temperature: float = 1.0,
    top_p: float = 1.0,
    chunk_size: int = -1,
):
    # convert conversation to inputs
    print(f"conversations = {conversations}")
    inputs = preprocess(conversations, processor=processor)
    inputs = inputs.to(model.device)

    return generate(
        model,
        processor,
        inputs,
        max_gen_len=max_length,
        temperature=temperature,
        top_p=top_p,
        stop_words=stop_words,
        chunk_size=chunk_size,
    )


def generate(
    model,
    processor,
    inputs,
    max_gen_len: int = 256,
    temperature: float = 0,
    top_p: float = 0.95,
    stop_words: List[str] = [],
    chunk_size: int = -1,
):
    """Stream the text output from the multimodality model with prompt and image inputs."""
    tokenizer = processor.tokenizer
    stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    kwargs = dict(
        **inputs,
        max_new_tokens=max_gen_len,
        do_sample=True,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
    )

    if temperature > 0:
        kwargs.update(
            {
                "do_sample": True,
                "top_p": top_p,
                "temperature": temperature,
            }
        )
    else:
        kwargs["do_sample"] = False

    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    yield from streamer