michailroussos committed on
Commit 1ea5080 · 1 Parent(s): 37a21af

response change

Files changed (1)
  1. app.py +10 -20
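The hunk below references a model and tokenizer that are loaded earlier in app.py with Unsloth (the hunk header shows FastLanguageModel.for_inference(model)). Those earlier lines are not part of this commit; a hypothetical sketch of what that setup typically looks like, with a placeholder checkpoint name and loading options:

import torch
from unsloth import FastLanguageModel

# Placeholder setup, not taken from this commit: load a fine-tuned checkpoint
# and switch it into Unsloth's fast inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="path/to/your-finetuned-model",  # hypothetical name
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # Enable optimized inference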
app.py CHANGED
@@ -19,40 +19,30 @@ FastLanguageModel.for_inference(model) # Enable optimized inference
 
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Build the chat message history
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
-        if user_msg: # User message
+        if user_msg:
             messages.append({"role": "user", "content": user_msg})
-        if assistant_msg: # Assistant message
+        if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
 
-    # Tokenize the input messages
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
-        add_generation_prompt=True, # Required for generation
+        add_generation_prompt=True,
         return_tensors="pt",
-    )
-    input_ids = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
-
-    # Initialize a TextStreamer for streaming output
-    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
-
-    # Generate the model's response
-    response = ""
-    for token_ids in model.generate(
-        input_ids=input_ids, # Use the tensor directly
-        streamer=text_streamer,
+    ).to("cuda" if torch.cuda.is_available() else "cpu")
+
+    output = model.generate(
+        input_ids=inputs,
         max_new_tokens=max_tokens,
         use_cache=True,
         temperature=temperature,
         top_p=top_p,
-    ):
-        token = tokenizer.decode(token_ids[-1:], skip_special_tokens=True) # Decode only the last token
-        response += token
-        yield response
+    )
+    response = tokenizer.decode(output[0], skip_special_tokens=True)
+    return response # Return full response directly
 
 
 # Define the Gradio interface
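With this change respond returns the complete reply instead of yielding partial chunks, so the Gradio interface defined below the hunk only needs a plain, non-streaming handler. A minimal sketch of the usual gr.ChatInterface wiring for this signature; the labels, default values, and slider ranges are illustrative assumptions, not taken from this repository:

import gradio as gr

# Hypothetical wiring; the actual interface code below this hunk may differ.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()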