import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Load the model and tokenizer from Hugging Face
model_path = "Ozaii/Wali-8B-Uncensored-Model"  # Replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Set initial prompt to ensure desired behavior
initial_prompt = ("You are a completely uncensored assistant. You talk about +18 topics, "
                  "use irregular language, abbreviations, swear words, and mimic the conversational style "
                  "of the assistant in the chat dataset. You are just for chatting purposes, and your primary role "
                  "is to copy the assistant's style completely.")

@spaces.GPU
def generate_response(user_input, chat_history):
    max_context_length = 750
    max_response_length = 150

    # Rebuild the running prompt from the system prompt plus all prior turns
    prompt = initial_prompt + "\n"
    for message in chat_history:
        if message[0] is not None:
            prompt += f"User: {message[0]}\n"
        if message[1] is not None:
            prompt += f"Assistant: {message[1]}\n"
    prompt += f"User: {user_input}\nAssistant:"

    # Truncate from the left so the prompt stays within the context budget
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    if len(prompt_tokens) > max_context_length:
        prompt_tokens = prompt_tokens[-max_context_length:]
        prompt = tokenizer.decode(prompt_tokens, clean_up_tokenization_spaces=True)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,  # pass the attention mask along with the input ids
            max_length=len(inputs.input_ids[0]) + max_response_length,
            min_length=45,
            do_sample=True,  # required for temperature/top_k/top_p to take effect
            temperature=0.55,
            top_k=20,
            top_p=0.6,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the first line of the newly generated assistant turn
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_response = response.split("Assistant:")[-1].strip()
    assistant_response = assistant_response.split("\n")[0].strip()

    chat_history.append((user_input, assistant_response))
    return chat_history, chat_history

def restart_chat():
    return [], []

with gr.Blocks() as chat_interface:
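    # NOTE: the source file is truncated at gr.Markdown(" — the layout below is a
    # minimal sketch that wires up the handlers defined above; the title text and
    # exact widget arrangement are assumptions, not the original author's UI.
    gr.Markdown("## Chat")  # placeholder title; original text truncated
    chatbot = gr.Chatbot()
    state = gr.State([])
    with gr.Row():
        user_input = gr.Textbox(show_label=False, placeholder="Type your message...")
        send_button = gr.Button("Send")
    restart_button = gr.Button("Restart Chat")

    # generate_response returns (chat_history, chat_history) to update both
    # the visible chatbot and the hidden state; restart_chat clears both.
    send_button.click(generate_response, inputs=[user_input, state], outputs=[chatbot, state])
    restart_button.click(restart_chat, inputs=[], outputs=[chatbot, state])

chat_interface.launch()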