michailroussos committed
Commit 0787acc · 1 Parent(s): 029560f
Files changed (1)
  1. app.py +58 -31
app.py CHANGED
@@ -1,30 +1,50 @@
 import gradio as gr
-from transformers import TextStreamer
 from unsloth import FastLanguageModel
+from transformers import AutoTokenizer
+import torch
 
-# Define constants
+# Load the model and tokenizer
+model_name_or_path = "michailroussos/model_llama_8d"
 max_seq_length = 2048
 dtype = None
-model_name_or_path = "michailroussos/model_llama_8d"
 
-# Load the model and tokenizer
+print("Loading model...")
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name_or_path,
     max_seq_length=max_seq_length,
     dtype=dtype,
     load_in_4bit=True,
 )
-
-# Optimize model for inference
 FastLanguageModel.for_inference(model)
+print("Model loaded successfully!")
 
-# Function to generate a response
-def chat_with_model(user_message, chat_history=None):
+# Define response function
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message: str,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+):
     try:
-        # Prepare the input messages
-        messages = [{"role": "user", "content": user_message}]
-
-        # Tokenize and prepare inputs for the model
+        # Debug: Print inputs
+        print("\n[DEBUG] Incoming user message:", message)
+        print("[DEBUG] Chat history before appending:", history)
+
+        # Prepare messages
+        messages = [{"role": "system", "content": system_message}]
+        for user, assistant in history:
+            if user:
+                messages.append({"role": "user", "content": user})
+            if assistant:
+                messages.append({"role": "assistant", "content": assistant})
+        messages.append({"role": "user", "content": message})
+
+        # Debug: Print prepared messages
+        print("[DEBUG] Prepared messages:", messages)
+
+        # Tokenize and prepare inputs
         inputs = tokenizer.apply_chat_template(
             messages,
             tokenize=True,
@@ -32,36 +52,43 @@ def chat_with_model(user_message, chat_history=None):
             return_tensors="pt",
         ).to("cuda")
 
+        # Debug: Print tokenized inputs
+        print("[DEBUG] Tokenized inputs:", inputs)
+
         # Generate response
         output_ids = model.generate(
             input_ids=inputs["input_ids"],
-            attention_mask=inputs["attention_mask"],  # Ensure attention_mask is included
-            streamer=None,  # Collect output as tensor
-            max_new_tokens=128,
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
             use_cache=True,
-            temperature=1.5,
-            min_p=0.1,
         )
 
-        # Decode the generated tokens into a string
-        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+        # Decode response
+        response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+        print("[DEBUG] Decoded response:", response)
+
+        # Update history
+        history.append((message, response))
+        return response, history
 
-        # Append the response to the chat history
-        if chat_history is None:
-            chat_history = []
-        chat_history.append((user_message, response))
-        return "", chat_history
     except Exception as e:
-        return f"Error: {str(e)}", chat_history
+        print("[ERROR] Exception in respond function:", str(e))
+        return f"Error: {str(e)}", history
+
 
-# Create the chat interface
+# Create ChatInterface
 demo = gr.ChatInterface(
-    fn=chat_with_model,
-    chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
-    title="Hugging Face Chat Model",
-    description="Chat with a Hugging Face model using FastLanguageModel.",
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+    ],
 )
 
 # Launch the app
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
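Note on the decoding step in the new respond(): tokenizer.decode(output_ids[0], ...) decodes the full sequence returned by model.generate, which includes the prompt (system message and prior turns) as well as the newly generated reply. If only the new reply should be shown in the chat window, a common pattern is to slice off the prompt tokens before decoding. A minimal sketch under that assumption; the helper name decode_new_tokens is illustrative and not part of this commit:

def decode_new_tokens(tokenizer, inputs, output_ids):
    # `inputs` is the dict produced by tokenizer.apply_chat_template(...) and
    # `output_ids` the tensor returned by model.generate(...), as in respond().
    prompt_length = inputs["input_ids"].shape[-1]
    # Keep only the tokens generated after the prompt, then decode them.
    new_token_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()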