from search import SemanticSearch, GoogleSearch, Document
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available
import yaml
import torch
import os  # used to read the Hugging Face token from the environment

def load_configs(config_file: str) -> dict:
    """Load a YAML configuration file into a dict."""
    with open(config_file, "r") as f:
        return yaml.safe_load(f)
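
# Expected shape of rag.configs.yml (illustrative; keys inferred from the
# lookups in RAGModel below, values are placeholders):
#
#   model:
#     generation_model: "google/gemma-2b-it"   # any HF causal-LM repo id
#     hf_token: "hf_..."                       # optional; the env var takes priority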

class RAGModel:
    def __init__(self, configs) -> None:
        self.configs = configs
        
        # Resolve the Hugging Face access token: prefer the environment
        # variable, fall back to the config file.
        self.hf_token = os.getenv("HUGGINGFACE_TOKEN") or configs["model"].get("hf_token")
        if not self.hf_token:
            raise ValueError(
                "Missing Hugging Face token! Set either:\n"
                "1. HUGGINGFACE_TOKEN environment variable\n"
                "2. hf_token in rag.configs.yml"
            )

        model_url = configs["model"]["generation_model"]

        # Load the generation model, authenticating so gated repos can be
        # downloaded. Use FlashAttention 2 when the kernel is installed,
        # otherwise fall back to PyTorch SDPA.
        attn_impl = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"
        self.model = AutoModelForCausalLM.from_pretrained(
            model_url,
            token=self.hf_token,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            attn_implementation=attn_impl,
            device_map="auto",  # let accelerate place the weights
        )
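
        # Optional: 4-bit quantization for low-VRAM GPUs. A sketch using the
        # imported BitsAndBytesConfig (not enabled by default; requires the
        # bitsandbytes package):
        #
        #   quant_config = BitsAndBytesConfig(
        #       load_in_4bit=True,
        #       bnb_4bit_compute_dtype=torch.float16,
        #   )
        #   self.model = AutoModelForCausalLM.from_pretrained(
        #       model_url,
        #       token=self.hf_token,
        #       quantization_config=quant_config,
        #       device_map="auto",
        #   )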
        
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_url,
            token=self.hf_token,
        )

    def create_prompt(self, query, topk_items: list[str]):
        # Join the retrieved passages into a bulleted context block
        context = "- " + "\n- ".join(topk_items)
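
        # Optional guard (sketch): cap the context to a token budget so the
        # prompt fits the model's window. max_ctx_tokens is an assumed value,
        # not taken from the project's config.
        #
        #   max_ctx_tokens = 3000
        #   ids = self.tokenizer(context, truncation=True,
        #                        max_length=max_ctx_tokens)["input_ids"]
        #   context = self.tokenizer.decode(ids, skip_special_tokens=True)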
        
        # Prompt template: context block first, then the question, then the
        # answer-format requirements. Built by concatenation so no stray
        # indentation leaks into the prompt.
        base_prompt = (
            "You are an AI search assistant. Use this context to answer:\n"
            f"Context: {context}\n"
            "\n"
            f"Question: {query}\n"
            "\n"
            "Answer in Wikipedia-style format with these requirements:\n"
            "- Detailed technical explanations\n"
            "- Historical context where relevant\n"
            "- Numerical data when available\n"
            "- Markdown formatting for structure"
        )

        dialog_template = [{"role": "user", "content": base_prompt}]
        
        # Render the dialog through the model's chat template, appending the
        # assistant-turn marker so generation starts at the answer.
        prompt = self.tokenizer.apply_chat_template(
            conversation=dialog_template,
            tokenize=False,
            add_generation_prompt=True,
        )
        return prompt

    def answer_query(self, query: str, topk_items: list[str]):
        prompt = self.create_prompt(query, topk_items)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Sampled decoding: temperature/top-p for variety, plus a mild
        # repetition penalty to curb loops.
        output = self.model.generate(
            **inputs,
            temperature=0.7,
            max_new_tokens=1024,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.1,
        )
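
        # Deterministic variant (sketch): swap the call above for greedy
        # decoding when reproducible answers matter.
        #
        #   output = self.model.generate(**inputs, do_sample=False,
        #                                max_new_tokens=1024)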
        
        # Decode only the newly generated tokens so the prompt is not echoed
        # back in the answer.
        prompt_len = inputs["input_ids"].shape[1]
        text = self.tokenizer.decode(
            output[0][prompt_len:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        return text

if __name__ == "__main__":
    # Smoke test: build the model and answer one query against a single
    # hand-written context passage.
    configs = load_configs("rag.configs.yml")

    # Fail fast if no token is available from either supported source.
    if "HUGGINGFACE_TOKEN" not in os.environ and not configs["model"].get("hf_token"):
        raise RuntimeError(
            "Set the HUGGINGFACE_TOKEN environment variable "
            "or hf_token in rag.configs.yml first!"
        )

    rag = RAGModel(configs)
    print(rag.answer_query("What's the height of Burj Khalifa?",
                           ["Burj Khalifa is 828 meters tall"]))
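
# Example run (script name assumed; requires a GPU with enough memory for the
# configured model):
#
#   export HUGGINGFACE_TOKEN=hf_...
#   python rag_model.py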