michailroussos committed
Commit 9202d9a · 1 Parent(s): 15bfa4e
Files changed (1)
  1. app.py +15 -19
app.py CHANGED
@@ -20,15 +20,14 @@ FastLanguageModel.for_inference(model) # Enable faster inference
 print("Model loaded successfully!")
 
 # Gradio Response Function
+from transformers import TextStreamer
+
 def respond(message, max_new_tokens, temperature, system_message=""):
     try:
         # Prepare input messages
         messages = [{"role": "system", "content": system_message}] if system_message else []
         messages.append({"role": "user", "content": message})
 
-        # Debug: Show messages
-        print("[DEBUG] Messages:", messages)
-
         # Tokenize inputs
         input_ids = tokenizer.apply_chat_template(
             messages,
@@ -37,37 +36,34 @@ def respond(message, max_new_tokens, temperature, system_message=""):
             return_tensors="pt",
         ).to("cuda")
 
-        # Debug: Inspect input tensor
-        print("[DEBUG] input_ids:", input_ids)
-
         # Ensure the input tensor has the correct dimensions
         if input_ids.dim() != 2:
             raise ValueError(f"`input_ids` must be a 2D tensor. Found shape: {input_ids.shape}")
 
-        # Stream response
-        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
-        model.generate(
-            input_ids=input_ids,
-            max_new_tokens=max_new_tokens,
-            temperature=temperature,
-            use_cache=True,
-            streamer=text_streamer,
-        )
+        # Generate output directly
+        with torch.no_grad():  # No need to track gradients for inference
+            output = model.generate(
+                input_ids=input_ids,
+                max_new_tokens=max_new_tokens,
+                temperature=temperature,
+                use_cache=True,
+            )
 
-        # Get the response generated by the model
-        # Retrieve text from the output stream (assuming this works with your setup)
-        generated_text = text_streamer.generated_text  # This assumes the `TextStreamer` accumulates the generated text
+        # Decode the generated tokens back to text
+        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
 
-        # Debug: Show the response text
+        # Debug: Show the generated text
         print("[DEBUG] Generated Text:", generated_text)
 
         return generated_text
+
     except Exception as e:
         # Debug: Log errors
         print("[ERROR]", str(e))
         return f"Error: {str(e)}"
 
 
+
 # Gradio UI
 demo = gr.Interface(
     fn=respond,
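
A possible follow-up, not part of this commit: for a causal LM, `model.generate` returns the prompt tokens followed by the newly generated ones, so decoding `output[0]` as-is also echoes the chat-template prompt back in the Gradio response. Below is a minimal sketch, reusing the `model`, `tokenizer`, and `input_ids` names from app.py, of decoding only the tokens produced after the prompt.

import torch

# Sketch only: assumes `model`, `tokenizer`, `input_ids`,
# `max_new_tokens`, and `temperature` exist as in app.py.
with torch.no_grad():
    output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        use_cache=True,
    )

# `output[0]` = prompt tokens + new tokens; keep only the new ones.
new_tokens = output[0][input_ids.shape[-1]:]
generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

Whether the prompt echo matters depends on the chat template being used, so this slicing step is just one common way to trim it.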