michailroussos committed
Commit c8295e7 · 1 Parent(s): 832a4d2

small changes

Files changed (1)
  1. app.py +9 -9
app.py CHANGED
@@ -20,11 +20,11 @@ FastLanguageModel.for_inference(model) # Enable optimized inference
 def respond(message, history, system_message, max_tokens, temperature, top_p):
     # Build the chat message history
     messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:  # User message
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:  # Assistant message
-            messages.append({"role": "assistant", "content": val[1]})
+    for user_msg, assistant_msg in history:
+        if user_msg:  # User message
+            messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:  # Assistant message
+            messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})

     # Tokenize the input messages
@@ -33,22 +33,22 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         tokenize=True,
         add_generation_prompt=True,  # Required for generation
         return_tensors="pt",
-    ).to("cuda")
+    ).to("cuda" if torch.cuda.is_available() else "cpu")

     # Initialize a TextStreamer for streaming output
     text_streamer = TextStreamer(tokenizer, skip_prompt=True)

     # Generate the model's response
     response = ""
-    for output in model.generate(
-        input_ids=inputs,
+    for token_ids in model.generate(
+        input_ids=inputs.input_ids,
         streamer=text_streamer,
         max_new_tokens=max_tokens,
         use_cache=True,
         temperature=temperature,
         top_p=top_p,
     ):
-        token = tokenizer.decode(output, skip_special_tokens=True)
+        token = tokenizer.decode(token_ids[-1:], skip_special_tokens=True)  # Decode only the last token
         response += token
         yield response
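For context (not part of this commit): the updated respond() has the (message, history, system_message, max_tokens, temperature, top_p) signature that Gradio's ChatInterface passes to its callback, with history arriving as a list of (user, assistant) pairs that the new unpacking loop consumes. Below is a minimal sketch of that wiring, assuming gradio is installed; the widget labels and default values are illustrative assumptions, not taken from app.py.

# Sketch: wiring respond() into a Gradio ChatInterface (assumed usage, not from this commit).
import gradio as gr

demo = gr.ChatInterface(
    respond,  # the generator defined above; yielding partial strings streams the reply into the UI
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),  # assumed default
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),  # assumed range
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()

Because respond() is a generator, ChatInterface treats each yielded string as the latest partial response, so the chat window updates as tokens are appended.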