michailroussos committed
Commit f7f950e · 1 Parent(s): 3b2ce08

small changes

Files changed (1):
  1. app.py +16 -3
app.py CHANGED
@@ -20,13 +20,18 @@ FastLanguageModel.for_inference(model) # Enable optimized inference
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     messages = [{"role": "system", "content": system_message}]
+
+    # Append past conversation history
     for user_msg, assistant_msg in history:
         if user_msg:
             messages.append({"role": "user", "content": user_msg})
         if assistant_msg:
             messages.append({"role": "assistant", "content": assistant_msg})
+
+    # Add the user's current message
     messages.append({"role": "user", "content": message})

+    # Tokenize the input with proper attention mask
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
@@ -36,15 +41,23 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
     inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
     attention_mask = inputs.ne(tokenizer.pad_token_id).long()

+    # Generate the output using the model
     output = model.generate(
         input_ids=inputs,
+        attention_mask=attention_mask,
         max_new_tokens=max_tokens,
-        use_cache=True,
         temperature=temperature,
         top_p=top_p,
+        pad_token_id=tokenizer.eos_token_id,  # Ensure padding is replaced with EOS
     )
-    response = tokenizer.decode(output[0], skip_special_tokens=True)
-    yield response  # Return full response directly
+
+    # Decode the generated output
+    response = tokenizer.decode(
+        output[0], skip_special_tokens=True
+    ).strip()  # Remove any extra whitespace or unexpected tokens
+
+    # Yield the clean response for display
+    yield response


 # Define the Gradio interface
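Both hunks stop at the comment introducing the Gradio interface, which lies outside the changed range. For context, a minimal sketch of how a respond generator with this signature is typically wired into gr.ChatInterface; the widget labels and default values below are assumptions for illustration, not part of this commit:

import gradio as gr

# Hypothetical wiring for the respond() generator above; labels and
# defaults are illustrative, not taken from this commit.
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()

This sketch assumes the tuple-style chat history that the `for user_msg, assistant_msg in history` loop in respond expects.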