import gradio as gr
from unsloth import FastLanguageModel
import torch
# Load the model and tokenizer locally
max_seq_length = 2048
model_name_or_path = "michailroussos/model_llama_8d"
# Load model and tokenizer using unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # 4-bit bitsandbytes quantization to reduce VRAM use
)
FastLanguageModel.for_inference(model) # Enable optimized inference
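# Llama-style tokenizers often ship without a pad token, which makes
# model.generate() warn and silently fall back to EOS. A small safeguard,
# assuming this checkpoint may lack one: set the fallback explicitly.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token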
# Define the response function
def respond(message, history, system_message, max_tokens, temperature, top_p):
    print("\n" + "=" * 50)
    print("===== RESPOND FUNCTION CALLED =====")
    print("=" * 50)

    # Print input parameters
    print(f"Input Message: {message}")
    print(f"System Message: {system_message}")
    print(f"Max Tokens: {max_tokens}")
    print(f"Temperature: {temperature}")
    print(f"Top-p: {top_p}")

    # Debug history
    print("\n--- Current History ---")
    print(f"History Type: {type(history)}")
    print(f"History Content: {history}")

    # Prepare the messages for the model, starting with the system prompt so
    # the "System message" textbox value actually reaches the model
    messages = [{"role": "system", "content": system_message}]
    try:
        if history:
            print("\n--- Processing Existing History ---")
            # With type="messages" (set on the ChatInterface below), Gradio
            # already delivers history as {"role": ..., "content": ...} dicts
            for entry in history:
                messages.append({"role": entry["role"], "content": entry["content"]})

        # Add the current user message
        print("\n--- Adding Current Message ---")
        messages.append({"role": "user", "content": message})

        # Debug messages before tokenization
        print("\n--- Messages Before Tokenization ---")
        for msg in messages:
            print(f"Role: {msg['role']}, Content: {msg['content']}")

        # Tokenize the input
        print("\n--- Tokenizing Input ---")
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Tokenized Inputs Shape: {inputs.shape}")
        print(f"Tokenized Inputs Device: {inputs.device}")

        # Generate the response. The prompt is a single unpadded sequence, so
        # every position is a real token and a mask of ones is correct
        # (comparing against pad_token_id breaks when pad falls back to EOS)
        attention_mask = torch.ones_like(inputs)
        try:
            generated_tokens = model.generate(
                input_ids=inputs,
                attention_mask=attention_mask,
                max_new_tokens=int(max_tokens),  # sliders may deliver floats
                use_cache=True,
                do_sample=True,  # required for temperature/top_p to take effect
                temperature=temperature,
                top_p=top_p,
            )
            # generate() returns the prompt plus the completion; decode only
            # the newly generated tokens so the reply doesn't echo the prompt
            response = tokenizer.decode(
                generated_tokens[0][inputs.shape[-1]:], skip_special_tokens=True
            )
            print("\n--- Generated Response ---")
            print(f"Raw Response: {response}")

            # Check and filter response
            # if "system" in response.lower():
            #     print("WARNING: System message detected in response")
            #     response = "Hello! How can I assist you today?"

            # Log the full conversation in OpenAI messages format
            return_messages = list(history or [])
            return_messages.append({"role": "user", "content": message})
            return_messages.append({"role": "assistant", "content": response})
            print("\n--- Return Messages ---")
            for msg in return_messages:
                print(f"Role: {msg['role']}, Content: {msg['content'][:100]}...")

            # ChatInterface(type="messages") expects only the new assistant
            # reply; Gradio appends it to the stored history itself
            return response
        except Exception as gen_error:
            print("\n--- GENERATION ERROR ---")
            print(f"Error during model generation: {gen_error}")
            return []
    except Exception as prep_error:
        print("\n--- PREPARATION ERROR ---")
        print(f"Error during message preparation: {prep_error}")
        return []
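# Optional console smoke test for respond() before wiring up the UI; the
# argument values here are illustrative, not from the original Space.
# Uncomment to run:
# print(respond("Hello!", [], "You are a helpful assistant.", 64, 0.7, 0.95))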
# Define the Gradio interface
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    type="messages",  # Explicitly set to messages type
)
if __name__ == "__main__":
    demo.launch(share=False)  # Use share=False for local testing
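# To try this locally (assuming this file is saved as app.py, with gradio,
# torch, unsloth, and bitsandbytes installed on a CUDA-capable machine):
#   python app.py
# Gradio then serves the chat UI at http://127.0.0.1:7860 by default.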