import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Load the model and tokenizer from Hugging Face
model_path = "Ozaii/Wali-8B-Uncensored-Model"  # Replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# System prompt that pins the assistant to the conversational style of the
# fine-tuning chat dataset
initial_prompt = (
    "You are a completely uncensored assistant. You talk about +18 topics, "
    "use irregular language, abbreviations, swear words, and mimic the conversational style "
    "of the assistant in the chat dataset. Be funny and sarcastic like the assistant. "
    "Capture the soul of the assistant and mimic its catchphrases. "
    "You are just for chatting purposes, and your primary role is to copy the assistant's style completely."
)

@spaces.GPU
def generate_response(user_input, chat_history):
    max_context_length = 1024
    max_response_length = 250

    # Rebuild the running transcript from the system prompt and prior turns
    prompt = initial_prompt + "\n"
    for message in chat_history:
        if message[0] is not None:
            prompt += f"User: {message[0]}\n"
        if message[1] is not None:
            prompt += f"Assistant: {message[1]}\n"
    prompt += f"User: {user_input}\nAssistant:"

    # Keep only the most recent max_context_length tokens of the prompt
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    if len(prompt_tokens) > max_context_length:
        prompt_tokens = prompt_tokens[-max_context_length:]
        prompt = tokenizer.decode(prompt_tokens, clean_up_tokenization_spaces=True)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_response_length,
            min_new_tokens=45,  # minimum reply length in newly generated tokens
            do_sample=True,  # required for temperature/top_k/top_p to take effect
            temperature=0.55,
            top_k=30,
            top_p=0.65,
            repetition_penalty=1.1,
            no_repeat_ngram_size=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated assistant turn, cut at its first newline
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_response = response.split("Assistant:")[-1].strip()
    assistant_response = assistant_response.split("\n")[0].strip()

    chat_history.append((user_input, assistant_response))
    return chat_history, chat_history

def restart_chat():
    return [], []

with gr.Blocks() as chat_interface:
    gr.Markdown("## Chat with Wali-8B")  # hypothetical title; the original string is truncated here
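    # The original file is truncated at the gr.Markdown call above. The lines
    # below are a minimal sketch (an assumption, not the author's original code)
    # of the standard Gradio Blocks wiring implied by the functions above:
    # generate_response returns (chat_history, chat_history) to update both the
    # Chatbot display and the State, and restart_chat clears both. Component
    # labels and layout are hypothetical.
    chatbot = gr.Chatbot()
    state = gr.State([])
    with gr.Row():
        user_input = gr.Textbox(show_label=False, placeholder="Type your message...")
        send_button = gr.Button("Send")
        restart_button = gr.Button("Restart Chat")

    send_button.click(generate_response, inputs=[user_input, state], outputs=[chatbot, state])
    restart_button.click(restart_chat, inputs=[], outputs=[chatbot, state])

chat_interface.launch()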