import inspect
import os
import re
import warnings
from collections import OrderedDict
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from ctypes import (
    CDLL,
    c_bool,
    c_int,
    c_float,
    c_char_p,
    c_void_p,
    POINTER,
    Structure,
)
from typing import (
    Any,
    Callable,
    Generator,
    List,
    Optional,
    Sequence,
    Union,
)

from .lib import find_library, load_cuda
from .logger import logger
from .utils import is_gguf, Vector, utf8_split_incomplete

c_int_p = POINTER(c_int)
c_float_p = POINTER(c_float)
llm_p = c_void_p


@dataclass
class Config:
    # sample
    top_k: int = 40
    top_p: float = 0.95
    temperature: float = 0.8
    repetition_penalty: float = 1.1
    last_n_tokens: int = 64
    seed: int = -1

    # eval
    batch_size: int = 8
    threads: int = -1

    # generate
    max_new_tokens: int = 256
    stop: Optional[Sequence[str]] = None
    stream: bool = False
    reset: bool = True

    # model
    context_length: int = -1
    gpu_layers: int = 0
    mmap: bool = True
    mlock: bool = False

    def to_struct(self):
        return ConfigStruct(
            context_length=self.context_length,
            gpu_layers=self.gpu_layers,
            mmap=self.mmap,
            mlock=self.mlock,
        )


class ConfigStruct(Structure):
    _fields_ = [
        ("context_length", c_int),
        ("gpu_layers", c_int),
        ("mmap", c_bool),
        ("mlock", c_bool),
    ]


docs = OrderedDict(
    top_k="The top-k value to use for sampling.",
    top_p="The top-p value to use for sampling.",
    temperature="The temperature to use for sampling.",
    repetition_penalty="The repetition penalty to use for sampling.",
    last_n_tokens="The number of last tokens to use for repetition penalty.",
    seed="The seed value to use for sampling tokens.",
    max_new_tokens="The maximum number of new tokens to generate.",
    stop="A list of sequences to stop generation when encountered.",
    stream="Whether to stream the generated text.",
    reset="Whether to reset the model state before generating text.",
    batch_size="The batch size to use for evaluating tokens in a single prompt.",
    threads="The number of threads to use for evaluating tokens.",
    context_length="The maximum context length to use.",
    gpu_layers="The number of layers to run on GPU.",
)


def doc(fn):
    # Fill the `{params}` placeholder in the decorated function's docstring
    # with the descriptions and defaults of the parameters it accepts.
    doc = []
    for param in inspect.signature(fn).parameters:
        if param in docs:
            default = getattr(Config, param)
            doc.append(f"{param}: {docs[param]} Default: `{default}`")
    doc = ("\n" + " " * 12).join(doc)
    fn.__doc__ = fn.__doc__.format(params=doc)
    return fn


def get(*values):
    # Return the first value that is not None.
    for value in values:
        if value is not None:
            return value


def load_library(path: Optional[str] = None, gpu: bool = False) -> Any:
    # https://docs.python.org/3.8/whatsnew/3.8.html#bpo-36085-whatsnew
    # https://github.com/abetlen/llama-cpp-python/pull/225
    if hasattr(os, "add_dll_directory") and "CUDA_PATH" in os.environ:
        os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))

    path = find_library(path, gpu=gpu)
    if "cuda" in path:
        load_cuda()
    lib = CDLL(path)

    lib.ctransformers_llm_create.argtypes = [
        c_char_p,  # model_path
        c_char_p,  # model_type
        ConfigStruct,  # config
    ]
    lib.ctransformers_llm_create.restype = llm_p

    lib.ctransformers_llm_delete.argtypes = [llm_p]
    lib.ctransformers_llm_delete.restype = None

    lib.ctransformers_llm_tokenize.argtypes = [
        llm_p,
        c_char_p,  # text
        c_bool,  # add_bos_token
        c_int_p,  # output
    ]
    lib.ctransformers_llm_tokenize.restype = c_int

    lib.ctransformers_llm_detokenize.argtypes = [
        llm_p,
        c_int,  # token
    ]
    lib.ctransformers_llm_detokenize.restype = c_char_p

    lib.ctransformers_llm_is_eos_token.argtypes = [
        llm_p,
        c_int,  # token
    ]
    lib.ctransformers_llm_is_eos_token.restype = c_bool

    lib.ctransformers_llm_eos_token_id.argtypes = [llm_p]
    lib.ctransformers_llm_eos_token_id.restype = c_int

    lib.ctransformers_llm_bos_token_id.argtypes = [llm_p]
    lib.ctransformers_llm_bos_token_id.restype = c_int

    lib.ctransformers_llm_vocab_size.argtypes = [llm_p]
    lib.ctransformers_llm_vocab_size.restype = c_int

    lib.ctransformers_llm_context_length.argtypes = [llm_p]
    lib.ctransformers_llm_context_length.restype = c_int

    lib.ctransformers_llm_architecture.argtypes = [llm_p]
    lib.ctransformers_llm_architecture.restype = c_char_p

    lib.ctransformers_llm_batch_eval.argtypes = [
        llm_p,
        c_int_p,  # tokens
        c_int,  # n_tokens
        c_int,  # n_past
        c_int,  # batch_size
        c_int,  # threads
    ]
    lib.ctransformers_llm_batch_eval.restype = c_bool

    lib.ctransformers_llm_logits_data.argtypes = [llm_p]
    lib.ctransformers_llm_logits_data.restype = c_float_p
    lib.ctransformers_llm_logits_size.argtypes = [llm_p]
    lib.ctransformers_llm_logits_size.restype = c_int

    lib.ctransformers_llm_embeddings_data.argtypes = [llm_p]
    lib.ctransformers_llm_embeddings_data.restype = c_float_p
    lib.ctransformers_llm_embeddings_size.argtypes = [llm_p]
    lib.ctransformers_llm_embeddings_size.restype = c_int

    lib.ctransformers_llm_sample.argtypes = [
        llm_p,
        c_int_p,  # last_tokens
        c_int,  # n_last
        c_int,  # top_k
        c_float,  # top_p
        c_float,  # temperature
        c_float,  # repetition_penalty
        c_int,  # seed
    ]
    lib.ctransformers_llm_sample.restype = c_int

    lib.ctransformers_llm_reset.argtypes = [llm_p]
    lib.ctransformers_llm_reset.restype = None

    return lib


class LLM:
    def __init__(
        self,
        model_path: str,
        model_type: Optional[str] = None,
        *,
        config: Optional[Config] = None,
        lib: Optional[str] = None,
    ):
        """Loads the language model from a local file.

        Args:
            model_path: The path to a model file.
            model_type: The model type.
            config: `Config` object.
            lib: The path to a shared library or one of `avx2`, `avx`, `basic`.
        """
        config = config or Config()
        self._model_path = model_path
        self._config = config
        self._llm = None
        self._lib = None
        self._context = []

        if not Path(model_path).is_file():
            raise ValueError(f"Model path '{model_path}' doesn't exist.")

        if not model_type:
            if not is_gguf(model_path):
                raise ValueError(
                    "Unable to detect model type. Please specify a model type using:\n\n"
                    "  AutoModelForCausalLM.from_pretrained(..., model_type='...')\n\n"
                )
            model_type = "gguf"

        self._lib = load_library(lib, gpu=config.gpu_layers > 0)
        self._llm = self._lib.ctransformers_llm_create(
            model_path.encode(),
            model_type.encode(),
            config.to_struct(),
        )
        if self._llm is None:
            raise RuntimeError(
                f"Failed to create LLM '{model_type}' from '{model_path}'."
            )

        architecture = self.ctransformers_llm_architecture().decode()
        if architecture:
            model_type = architecture
        self._model_type = model_type

    @property
    def model_path(self) -> str:
        """The path to the model file."""
        return self._model_path

    @property
    def model_type(self) -> str:
        """The model type."""
        return self._model_type

    @property
    def config(self) -> Config:
        """The config object."""
        return self._config

    @property
    def eos_token_id(self) -> int:
        """The end-of-sequence token."""
        return self.ctransformers_llm_eos_token_id()

    @property
    def bos_token_id(self) -> int:
        """The beginning-of-sequence token."""
        return self.ctransformers_llm_bos_token_id()

    @property
    def pad_token_id(self) -> int:
        """The padding token."""
        return self.ctransformers_llm_eos_token_id()

    @property
    def vocab_size(self) -> int:
        """The number of tokens in the vocabulary."""
        return self.ctransformers_llm_vocab_size()

    @property
    def context_length(self) -> int:
        """The context length of the model."""
        return self.ctransformers_llm_context_length()

    @property
    def logits(self) -> List[float]:
        """The unnormalized log probabilities."""
        return Vector(
            self.ctransformers_llm_logits_data(),
            self.ctransformers_llm_logits_size(),
        )

    @property
    def embeddings(self) -> List[float]:
        """The input embeddings."""
        return Vector(
            self.ctransformers_llm_embeddings_data(),
            self.ctransformers_llm_embeddings_size(),
        )

    def __getattr__(self, name: str) -> Callable:
        lib, llm = self._lib, self._llm
        if name.startswith("ctransformers_llm_") and hasattr(lib, name):
            return partial(getattr(lib, name), llm)
        raise AttributeError(f"'LLM' object has no attribute '{name}'")

    def tokenize(self, text: str, add_bos_token: Optional[bool] = None) -> List[int]:
        """Converts a text into a list of tokens.

        Args:
            text: The text to tokenize.
            add_bos_token: Whether to add the beginning-of-sequence token.

        Returns:
            The list of tokens.
        """
        if add_bos_token is None:
            add_bos_token = self.model_type == "llama"
        tokens = (c_int * (len(text) + 1))()
        n_tokens = self.ctransformers_llm_tokenize(text.encode(), add_bos_token, tokens)
        return tokens[:n_tokens]

    def detokenize(
        self,
        tokens: Sequence[int],
        decode: bool = True,
    ) -> Union[str, bytes]:
        """Converts a list of tokens to text.

        Args:
            tokens: The list of tokens.
            decode: Whether to decode the text as a UTF-8 string.

        Returns:
            The combined text of all tokens.
        """
        if isinstance(tokens, int):
            tokens = [tokens]
        texts = []
        for token in tokens:
            text = self.ctransformers_llm_detokenize(token)
            texts.append(text)
        texts = b"".join(texts)
        if decode:
            texts = texts.decode(errors="ignore")
            # https://github.com/ggerganov/llama.cpp/blob/43033b7bb4858da4f591715b3babdf906c9b7cbc/common/common.cpp#L778-L781
            if tokens[:1] == [self.bos_token_id] and texts[:1] == " ":
                texts = texts[1:]
        return texts

    def is_eos_token(self, token: int) -> bool:
        """Checks if a token is an end-of-sequence token.

        Args:
            token: The token to check.

        Returns:
            `True` if the token is an end-of-sequence token else `False`.
        """
        return self.ctransformers_llm_is_eos_token(token)

    @doc
    def eval(
        self,
        tokens: Sequence[int],
        *,
        batch_size: Optional[int] = None,
        threads: Optional[int] = None,
    ) -> None:
        """Evaluates a list of tokens.

        Args:
            tokens: The list of tokens to evaluate.
            {params}
        """
        config = self.config
        batch_size = get(batch_size, config.batch_size)
        threads = get(threads, config.threads)

        n_past = len(self._context)
        n_tokens = len(tokens)
        if n_past + n_tokens > self.context_length:
            logger.warning(
                f"Number of tokens ({n_past + n_tokens}) exceeded maximum context length ({self.context_length})."
            )

        tokens = (c_int * n_tokens)(*tokens)
        status = self.ctransformers_llm_batch_eval(
            tokens,
            n_tokens,
            n_past,
            batch_size,
            threads,
        )
        if not status:
            raise RuntimeError("Failed to evaluate tokens.")
        self._context.extend(tokens)

    @doc
    def sample(
        self,
        *,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        temperature: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        last_n_tokens: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> int:
        """Samples a token from the model.

        Args:
            {params}

        Returns:
            The sampled token.
        """
        config = self.config
        top_k = get(top_k, config.top_k)
        top_p = get(top_p, config.top_p)
        temperature = get(temperature, config.temperature)
        repetition_penalty = get(repetition_penalty, config.repetition_penalty)
        last_n_tokens = get(last_n_tokens, config.last_n_tokens)
        seed = get(seed, config.seed)

        if last_n_tokens < 0:
            last_n_tokens = self.context_length
        last_tokens = self._context[-last_n_tokens:]
        n_last = len(last_tokens)
        last_tokens = (c_int * n_last)(*last_tokens)
        return self.ctransformers_llm_sample(
            last_tokens,
            n_last,
            top_k,
            top_p,
            temperature,
            repetition_penalty,
            seed,
        )

    def reset(self) -> None:
        """Deprecated since 0.2.27."""
        warnings.warn(
            "`LLM.reset()` method is deprecated since 0.2.27. Please use high-level API."
        )
        self._context.clear()
        self.ctransformers_llm_reset()

    def __del__(self):
        if self._llm is not None:
            self.ctransformers_llm_delete()

    @doc
    def prepare_inputs_for_generation(
        self,
        tokens: Sequence[int],
        *,
        reset: Optional[bool] = None,
    ) -> Sequence[int]:
        """Removes input tokens that are evaluated in the past and updates the LLM context.

        Args:
            tokens: The list of input tokens.
            {params}

        Returns:
            The list of tokens to evaluate.
        """
        config = self.config
        reset = get(reset, config.reset)

        if not reset:
            return tokens

        # Keep at least one input token to evaluate the logits.
        n = min(len(tokens) - 1, len(self._context))
        l = 0
        while l < n and tokens[l] == self._context[l]:
            l += 1
        # Remove input tokens that are evaluated in the past and update context.
        tokens = tokens[l:]
        self._context = self._context[:l]
        return tokens

    @doc
    def generate(
        self,
        tokens: Sequence[int],
        *,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        temperature: Optional[float] = None,
        repetition_penalty: Optional[float] = None,
        last_n_tokens: Optional[int] = None,
        seed: Optional[int] = None,
        batch_size: Optional[int] = None,
        threads: Optional[int] = None,
        reset: Optional[bool] = None,
    ) -> Generator[int, None, None]:
        """Generates new tokens from a list of tokens.

        Args:
            tokens: The list of tokens to generate tokens from.
            {params}

        Returns:
            The generated tokens.
""" tokens = self.prepare_inputs_for_generation(tokens, reset=reset) self.eval(tokens, batch_size=batch_size, threads=threads) while True: token = self.sample( top_k=top_k, top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty, last_n_tokens=last_n_tokens, seed=seed, ) self.eval([token], batch_size=batch_size, threads=threads) if self.is_eos_token(token): break yield token def _stream( self, prompt: str, *, max_new_tokens: Optional[int] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None, repetition_penalty: Optional[float] = None, last_n_tokens: Optional[int] = None, seed: Optional[int] = None, batch_size: Optional[int] = None, threads: Optional[int] = None, stop: Optional[Sequence[str]] = None, reset: Optional[bool] = None, ) -> Generator[str, None, None]: config = self.config max_new_tokens = get(max_new_tokens, config.max_new_tokens) stop = get(stop, config.stop) or [] if isinstance(stop, str): stop = [stop] tokens = self.tokenize(prompt) stop_regex = re.compile("|".join(map(re.escape, stop))) count = 0 text = "" incomplete = b"" for token in self.generate( tokens, top_k=top_k, top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty, last_n_tokens=last_n_tokens, seed=seed, batch_size=batch_size, threads=threads, reset=reset, ): # Handle incomplete UTF-8 multi-byte characters. incomplete += self.detokenize([token], decode=False) complete, incomplete = utf8_split_incomplete(incomplete) text += complete.decode(errors="ignore") # https://github.com/abetlen/llama-cpp-python/blob/1a13d76c487df1c8560132d10bda62d6e2f4fa93/llama_cpp/llama.py#L686-L706 # Check if one of the stop sequences is part of the text. # Note that the stop sequence may not always be at the end of text. if stop: match = stop_regex.search(text) if match: text = text[: match.start()] break # Avoid sending the longest suffix of text which is also a prefix # of a stop sequence, as it can form a stop sequence with the text # generated later. longest = 0 for s in stop: for i in range(len(s), 0, -1): if text.endswith(s[:i]): longest = max(i, longest) break end = len(text) - longest if end > 0: yield text[:end] text = text[end:] count += 1 if count >= max_new_tokens: break if text: yield text @doc def __call__( self, prompt: str, *, max_new_tokens: Optional[int] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None, repetition_penalty: Optional[float] = None, last_n_tokens: Optional[int] = None, seed: Optional[int] = None, batch_size: Optional[int] = None, threads: Optional[int] = None, stop: Optional[Sequence[str]] = None, stream: Optional[bool] = None, reset: Optional[bool] = None, ) -> Union[str, Generator[str, None, None]]: """Generates text from a prompt. Args: prompt: The prompt to generate text from. {params} Returns: The generated text. """ config = self.config stream = get(stream, config.stream) text = self._stream( prompt, max_new_tokens=max_new_tokens, top_k=top_k, top_p=top_p, temperature=temperature, repetition_penalty=repetition_penalty, last_n_tokens=last_n_tokens, seed=seed, batch_size=batch_size, threads=threads, stop=stop, reset=reset, ) if stream: return text return "".join(text) @doc def embed( self, input: Union[str, Sequence[int]], *, batch_size: Optional[int] = None, threads: Optional[int] = None, ) -> List[float]: """Computes embeddings for a text or list of tokens. > **Note:** Currently only LLaMA and Falcon models support embeddings. 

        Args:
            input: The input text or list of tokens to get embeddings for.
            {params}

        Returns:
            The input embeddings.
        """
        if isinstance(input, str):
            input = self.tokenize(input)
        input = self.prepare_inputs_for_generation(input, reset=True)
        self.eval(input, batch_size=batch_size, threads=threads)
        return list(self.embeddings)
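

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the library API above). The model path is
# hypothetical; any local GGUF model file supported by the backend would do.
# This block only runs when the module is executed directly, never on import.
if __name__ == "__main__":
    llm = LLM("./models/example-model.gguf")  # hypothetical path
    # Stream text chunks as they are generated.
    for chunk in llm("Hello, my name is", max_new_tokens=32, stream=True):
        print(chunk, end="", flush=True)
    print()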