Spaces:

michailroussos
/

ID2223_9D_withGPU

Runtime error

App Files Community

michailroussos committed on Dec 9, 2024

Commit

db497f0

1 Parent(s): bafd5e5

more changes

Browse files

Files changed (1) hide show

app.py +13 -28

app.py CHANGED Viewed

@@ -19,47 +19,32 @@ FastLanguageModel.for_inference(model)  # Enable optimized inference
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    messages = [{"role": "system", "content": system_message}]
-    # Append past conversation history
-    for user_msg, assistant_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
-    # Add the user's current message
     messages.append({"role": "user", "content": message})
-    # Tokenize the input with proper attention mask
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt",
-    )
-    inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")
-    attention_mask = inputs.ne(tokenizer.pad_token_id).long()
-    # Generate the output using the model
-    output = model.generate(
         input_ids=inputs,
-        attention_mask=attention_mask,
         max_new_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
-        pad_token_id=tokenizer.eos_token_id,  # Ensure padding is replaced with EOS
     )
-    print("output")
-    print(output)
-    # Decode the generated output
-    response = tokenizer.decode(
-        output[0], skip_special_tokens=True
-    ).strip()  # Remove any extra whitespace or unexpected tokens
-    # Yield the clean response for display
-    yield response
 # Define the Gradio interface

 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
+    # Combine system and user inputs
+    messages = [{"role": "system", "content": system_message}] + [
+        {"role": "user", "content": user_msg} if assistant_msg is None else {"role": "assistant", "content": assistant_msg}
+        for user_msg, assistant_msg in history
+    ]
     messages.append({"role": "user", "content": message})
+    # Apply the chat template
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt",
+    ).to("cuda" if torch.cuda.is_available() else "cpu")
+    # Use a TextStreamer for real-time decoding
+    streamer = TextStreamer(tokenizer, skip_prompt=True)
+    model.generate(
         input_ids=inputs,
+        streamer=streamer,
         max_new_tokens=max_tokens,
+        use_cache=True,
         temperature=temperature,
         top_p=top_p,
     )
 # Define the Gradio interface