import json
import os

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Load tokenizer and base model
base_model = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
)

# Clean up adapter_config.json before loading the adapter:
# drop config keys that older peft versions do not recognise
adapter_config_path = "./adapter/adapter_config.json"
if os.path.exists(adapter_config_path):
    with open(adapter_config_path, "r") as f:
        adapter_config = json.load(f)
    for key in ["corda_config", "eva_config", "megatron_config"]:
        adapter_config.pop(key, None)
    with open(adapter_config_path, "w") as f:
        json.dump(adapter_config, f)

# Load the LoRA adapter on top of the base model (inference only)
model = PeftModel.from_pretrained(model, "./adapter", is_trainable=False)
model.eval()


# Simple chat function: applies the chat template, streams tokens to stdout
# as they are generated, and returns the full decoded reply
def chat(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        streamer=streamer,
    )
    # Decode only the newly generated tokens (everything after the prompt)
    output = tokenizer.decode(
        generated_ids[0][model_inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return output


# Example interactive loop
if __name__ == "__main__":
    while True:
        prompt = input("User: ")
        if prompt.lower() in ["exit", "quit"]:
            break
        # The streamer already prints the reply as it is generated,
        # so only print the prefix here to avoid echoing the answer twice
        print("AI: ", end="", flush=True)
        chat(prompt)