"""Interactive chat with a LoRA adapter applied to Qwen2-0.5B-Instruct."""

import json
import os

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

# Load the base model and tokenizer; use fp16 on GPU, fp32 on CPU.
base_model = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    trust_remote_code=True,
)

# Strip config keys written by newer PEFT releases; older PEFT versions reject
# unknown keys when loading the adapter, so this is a compatibility workaround.
adapter_config_path = "./adapter/adapter_config.json"
if os.path.exists(adapter_config_path):
    with open(adapter_config_path, "r") as f:
        adapter_config = json.load(f)
    for key in ["corda_config", "eva_config", "megatron_config"]:
        adapter_config.pop(key, None)
    with open(adapter_config_path, "w") as f:
        json.dump(adapter_config, f)

# Attach the LoRA adapter from ./adapter and switch to inference mode.
model = PeftModel.from_pretrained(model, "./adapter", is_trainable=False)
model.eval()

def chat(prompt):
    """Generate a streamed reply to a single user prompt and return the full text."""
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Stream tokens to stdout as they are generated.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        streamer=streamer,
    )
    # Decode only the newly generated tokens, slicing off the prompt.
    output = tokenizer.decode(
        generated_ids[0][model_inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return output

if __name__ == "__main__":
    # Simple REPL: type "exit" or "quit" to stop.
    while True:
        prompt = input("User: ")
        if prompt.lower() in ["exit", "quit"]:
            break
        # The TextStreamer already prints the reply as it is generated, so only
        # the "AI:" prefix is printed here to avoid duplicating the output.
        print("AI: ", end="", flush=True)
        chat(prompt)