# "Since it's an almost example, it probably won't be affected by a license."
# Importing required libraries
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever

import warnings
warnings.filterwarnings("ignore")
import datasets
import os
import json
import subprocess
import sys
import joblib
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers

import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple,Dict,Optional
from logger import logging
from exception import CustomExceptionHandling

from smolagents.gradio_ui import GradioUI
from smolagents import (
    CodeAgent,
    GoogleSearchTool,
    Model,
    Tool,
    LiteLLMModel,
    ToolCallingAgent,
    ChatMessage,tool,MessageRole
)

cache_file = "docs_processed.joblib"
if os.path.exists(cache_file):
    docs_processed = joblib.load(cache_file)
    print("Loaded docs_processed from cache.")
else:
    knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
    source_docs = [
        Document(page_content=doc["text"], metadata={"source": doc["source"].split("/")[1]}) for doc in knowledge_base
    ]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=20,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    docs_processed = text_splitter.split_documents(source_docs)
    joblib.dump(docs_processed, cache_file)
    print("Created and saved docs_processed to cache.")

class RetrieverTool(Tool):
    name = "retriever"
    description = "Uses semantic search to retrieve the parts of documentation that could be most relevant to answer your query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)

        self.retriever = BM25Retriever.from_documents(
            docs,
            k=7,  
        )

    def forward(self, query: str) -> str:
        assert isinstance(query, str), "Your search query must be a string"

        docs = self.retriever.invoke(
            query,
        )
        return "\nRetrieved documents:\n" + "".join(
            [
                f"\n\n===== Document {str(i)} =====\n" + str(doc.page_content)
                for i, doc in enumerate(docs)
            ]
        )


# Download gguf model files
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

os.makedirs("models",exist_ok=True)

logging.info("start download")
hf_hub_download(
    repo_id="bartowski/google_gemma-3-4b-it-GGUF",
    filename="google_gemma-3-4b-it-Q4_K_M.gguf",
    local_dir="./models",
)

retriever_tool = RetrieverTool(docs_processed)

# Define the prompt markers for Gemma 3
gemma_3_prompt_markers = {
    Roles.system: PromptMarkers("", "\n"),  # System prompt should be included within user message
    Roles.user: PromptMarkers("<start_of_turn>user\n", "<end_of_turn>\n"),
    Roles.assistant: PromptMarkers("<start_of_turn>model\n", "<end_of_turn>\n"),
    Roles.tool: PromptMarkers("", ""),  # If you need tool support
}


# Create the formatter
gemma_3_formatter = MessagesFormatter(
    pre_prompt="",  # No pre-prompt
    prompt_markers=gemma_3_prompt_markers,
    include_sys_prompt_in_first_user_message=True,  # Include system prompt in first user message
    default_stop_sequences=["<end_of_turn>", "<start_of_turn>"],
    strip_prompt=False,  # Don't strip whitespace from the prompt
    bos_token="<bos>",  # Beginning of sequence token for Gemma 3
    eos_token="<eos>",  # End of sequence token for Gemma 3
)

# based https://github.com/huggingface/smolagents/pull/450
# almost overwrite with https://huggingface.co/spaces/sitammeur/Gemma-llamacpp
class LlamaCppModel(Model):
    def __init__(
        self,
        model_path: Optional[str] = None,
        repo_id: Optional[str] = None,
        filename: Optional[str] = None,
        n_gpu_layers: int = 0,
        n_ctx: int = 8192,
        max_tokens: int = 1024,
        verbose:bool = False,
        **kwargs,
    ):
        """
        Initializes the LlamaCppModel.

        Parameters:
            model_path (str, optional): Path to the local model file.
            repo_id (str, optional): Hugging Face repository ID if loading from Hugging Face.
            filename (str, optional): Specific filename to load from the repository.
            n_gpu_layers (int, default=0): Number of GPU layers to use.
            n_ctx (int, default=8192): Context size for the model.
            **kwargs: Additional keyword arguments.
        Raises:
            ValueError: If neither model_path nor repo_id+filename are provided.
        """
        from llama_cpp import Llama
        

        super().__init__(**kwargs)
        self.flatten_messages_as_text=True
        self.max_tokens = max_tokens

        if model_path:
            self.llm = Llama(
                model_path=model_path,
                flash_attn=False,
                n_gpu_layers=0,
                #n_batch=1024,
                n_ctx=n_ctx,
                n_threads=2,
                n_threads_batch=2,verbose=False
            )
            
        elif repo_id and filename:
            self.llm = Llama.from_pretrained(
                repo_id=repo_id,
                filename=filename,
                n_gpu_layers=n_gpu_layers,
                n_ctx=n_ctx,
                max_tokens=max_tokens,
                verbose=verbose,
                **kwargs
            )
        else:
            raise ValueError("Must provide either model_path or repo_id+filename")

    def __call__(
        self,
        messages: List[Dict[str, str]],
        stop_sequences: Optional[List[str]] = None,
        grammar: Optional[str] = None,
        tools_to_call_from: Optional[List[Tool]] = None,
        **kwargs,
    ) -> ChatMessage:
        

        from llama_cpp import LlamaGrammar
        try:
            completion_kwargs = self._prepare_completion_kwargs(
                messages=messages,
                stop_sequences=stop_sequences,
                grammar=grammar,
                tools_to_call_from=tools_to_call_from,
                **kwargs
            )

            if not tools_to_call_from:
                completion_kwargs.pop("tools", None)
                completion_kwargs.pop("tool_choice", None)

            filtered_kwargs = {
                k: v for k, v in completion_kwargs.items()
                if k not in ["messages", "stop", "grammar", "max_tokens", "tools_to_call_from"]
            }
            max_tokens = (
            kwargs.get("max_tokens")
            or self.max_tokens  
            or 1024 
            )

            provider = LlamaCppPythonProvider(self.llm)
            system_message= completion_kwargs["messages"][0]["content"]
            message= completion_kwargs["messages"].pop()["content"]
            
            # Create the agent
            agent = LlamaCppAgent(
                provider,
                system_prompt=f"{system_message}",
                custom_messages_formatter=gemma_3_formatter,
                debug_output=True,
            )
            temperature = 0.5
            top_k=40
            top_p=0.95
            max_tokens=2048
            repeat_penalty=1.1
            settings = provider.get_provider_default_settings()
            settings.temperature = temperature
            settings.top_k = top_k
            settings.top_p = top_p
            settings.max_tokens = max_tokens
            settings.repeat_penalty = repeat_penalty
            settings.stream = False

            
            messages = BasicChatHistory()
            for from_message in completion_kwargs["messages"]:
                if from_message["role"] is MessageRole.USER:
                    history_message = {"role": MessageRole.USER, "content": from_message["content"]}
                elif from_message["role"] is MessageRole.SYSTEM:
                    history_message = {"role": MessageRole.SYSTEM, "content": from_message["content"]}
                else:
                    history_message = {"role": MessageRole.ASSISTANT, "content": from_message["content"]}
                messages.add_message(from_message)
     
            stream = agent.get_chat_response(
            message,
            llm_sampling_settings=settings,
            chat_history=messages,
            returns_streaming_generator=False,
            print_output=False, 
            
            )
            
            content = stream
            message = ChatMessage(role=MessageRole.ASSISTANT, content=content)

            if tools_to_call_from is not None:
                return super.parse_tool_args_if_needed(message)
            return message
        except Exception as e:
            logging.error(f"Model error: {e}")
            return ChatMessage(role="assistant", content=f"Error: {str(e)}")
            

model = LlamaCppModel(
        model_path = "models/google_gemma-3-4b-it-Q4_K_M.gguf",
        n_ctx=8192,verbose=False
    )

import yaml
with open("retriever.yaml", "r") as f:
    prompt = f.read()

description="""
*CPU Rag Example with LlamaCpp*
Take a few minute.customized prompt is the key.

Reference
- [Qwen2.5-0.5B-Rag-Thinking](https://huggingface.co/spaces/Akjava/Qwen2.5-0.5B-Rag-Thinking-Flan-T5)
- [smolagents pull-450](https://github.com/huggingface/smolagents/pull/450)
- [Gemma-llamacpp](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp)
- [Dataset(m-ric/huggingface_doc)](https://huggingface.co/datasets/m-ric/huggingface_doc)

"""
agent = CodeAgent(prompt_templates =yaml.safe_load(prompt),model=model, tools=[retriever_tool],max_steps=1,verbosity_level=0,name="AGENT",description=description)

demo = GradioUI(agent)

if __name__ == "__main__":
    demo.launch()