Spaces · Runtime error
michailroussos committed · c8295e7
Parent(s): 832a4d2
small changes
app.py
CHANGED
@@ -20,11 +20,11 @@ FastLanguageModel.for_inference(model)  # Enable optimized inference
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Build the chat message history
     messages = [{"role": "system", "content": system_message}]
-    for
-        if
-            messages.append({"role": "user", "content":
-        if
-            messages.append({"role": "assistant", "content":
+    for user_msg, assistant_msg in history:
+        if user_msg:  # User message
+            messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:  # Assistant message
+            messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
 
     # Tokenize the input messages
@@ -33,22 +33,22 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         tokenize=True,
         add_generation_prompt=True,  # Required for generation
         return_tensors="pt",
-    ).to("cuda")
+    ).to("cuda" if torch.cuda.is_available() else "cpu")
 
     # Initialize a TextStreamer for streaming output
     text_streamer = TextStreamer(tokenizer, skip_prompt=True)
 
     # Generate the model's response
     response = ""
-    for
-        input_ids=inputs,
+    for token_ids in model.generate(
+        input_ids=inputs.input_ids,
         streamer=text_streamer,
         max_new_tokens=max_tokens,
         use_cache=True,
         temperature=temperature,
         top_p=top_p,
     ):
-        token = tokenizer.decode(
+        token = tokenizer.decode(token_ids[-1:], skip_special_tokens=True)  # Decode only the last token
         response += token
         yield response
 
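Only respond() changes in this commit; the surrounding app.py (the Unsloth model/tokenizer loading, the torch import that the new device check relies on, and the Gradio UI) is not shown in the diff. For orientation, the sketch below shows how a streaming generator with this exact signature is commonly wired into a gr.ChatInterface; the widget labels, ranges, and defaults are illustrative assumptions, not values taken from this Space.

```python
# Illustrative sketch only: wiring a respond() generator with this signature
# into a Gradio chat UI. None of this is part of the commit above; the slider
# ranges and defaults are assumptions.
import gradio as gr

demo = gr.ChatInterface(
    respond,  # the generator patched in this commit; yields partial responses
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```

Because the patched loop unpacks history as (user_msg, assistant_msg) pairs, this wiring assumes Gradio's tuple-style chat history rather than the newer messages format.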
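For reference, the standard transformers pattern for streaming partial text into a generator like this one is TextIteratorStreamer driven by a background thread. The sketch below is that generic pattern, not what this commit implements; model and tokenizer are assumed to be the same objects app.py already loads, and respond_streaming and its parameters are hypothetical names used only for illustration.

```python
# Generic streaming sketch (not part of this commit): TextIteratorStreamer
# yields already-decoded text chunks as generation proceeds on a worker thread.
from threading import Thread

from transformers import TextIteratorStreamer


def respond_streaming(input_ids, model, tokenizer, max_tokens, temperature, top_p):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,        # token IDs from apply_chat_template
        streamer=streamer,
        max_new_tokens=max_tokens,
        use_cache=True,
        temperature=temperature,
        top_p=top_p,
    )
    # generate() blocks, so it runs on a background thread while we consume the streamer
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    response = ""
    for new_text in streamer:   # each item is a decoded text chunk
        response += new_text
        yield response
    thread.join()
```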