michailroussos committed on
Commit 7c34777 · 1 Parent(s): e82c023
Files changed (1)
  1. app.py +32 -64
app.py CHANGED
@@ -1,13 +1,14 @@
import gradio as gr
from unsloth import FastLanguageModel
-from transformers import AutoTokenizer
import torch

-# Load the model and tokenizer
-model_name_or_path = "michailroussos/model_llama_8d"
max_seq_length = 2048
dtype = None

print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
@@ -15,85 +16,52 @@ model, tokenizer = FastLanguageModel.from_pretrained(
    dtype=dtype,
    load_in_4bit=True,
)
-FastLanguageModel.for_inference(model)
print("Model loaded successfully!")

-# Define response function
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message: str,
-    max_tokens: int,
-    temperature: float,
-    top_p: float,
-):
    try:
-        # Debug: Print inputs
-        print("\n[DEBUG] Incoming user message:", message)
-        print("[DEBUG] Chat history before appending:", history)
-
-        # Prepare messages
-        messages = [{"role": "system", "content": system_message}]
-        for user, assistant in history:
-            if user:
-                messages.append({"role": "user", "content": user})
-            if assistant:
-                messages.append({"role": "assistant", "content": assistant})
        messages.append({"role": "user", "content": message})

-        # Debug: Print prepared messages
-        print("[DEBUG] Prepared messages:", messages)
-
-        # Tokenize and prepare inputs
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
-        )
-
-        # Ensure tensor shapes are correct
-        input_ids = inputs["input_ids"].squeeze(0).to("cuda")
-        attention_mask = inputs["attention_mask"].squeeze(0).to("cuda")

-        # Debug: Print tokenized inputs
-        print("[DEBUG] Tokenized input_ids shape:", input_ids.shape)
-        print("[DEBUG] Tokenized attention_mask shape:", attention_mask.shape)
-
-        # Generate response
-        output_ids = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=max_tokens,
            temperature=temperature,
-            top_p=top_p,
            use_cache=True,
        )
-
-        # Decode response
-        response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
-        print("[DEBUG] Decoded response:", response)
-
-        # Update history
-        history.append((message, response))
-        return response, history
-
    except Exception as e:
-        print("[ERROR] Exception in respond function:", str(e))
-        return f"Error: {str(e)}", history
-

-# Create ChatInterface
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

-# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
 
import gradio as gr
+from transformers import TextStreamer
from unsloth import FastLanguageModel
import torch

+# Model Configuration
max_seq_length = 2048
dtype = None
+model_name_or_path = "michailroussos/model_llama_8d"

+# Load Model and Tokenizer
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
+FastLanguageModel.for_inference(model)  # Enable faster inference
print("Model loaded successfully!")

+# Gradio Response Function
+def respond(message, max_new_tokens, temperature, system_message=""):
    try:
+        # Prepare input messages
+        messages = [{"role": "system", "content": system_message}] if system_message else []
        messages.append({"role": "user", "content": message})

+        # Tokenize inputs
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
+        ).to("cuda")

+        # Stream response
+        response = []
+        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+        _ = model.generate(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_new_tokens=max_new_tokens,
            temperature=temperature,
            use_cache=True,
+            streamer=text_streamer,
        )
+        return "".join(response)
    except Exception as e:
+        return f"Error: {str(e)}"

+# Gradio UI
+demo = gr.Interface(
+    fn=respond,
+    inputs=[
+        gr.Textbox(label="Your Message", placeholder="Enter your prompt here..."),
+        gr.Slider(minimum=1, maximum=512, step=1, value=128, label="Max New Tokens"),
+        gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
+        gr.Textbox(label="System Message", placeholder="Optional system instructions."),
    ],
+    outputs="text",
+    title="LLama-based Chatbot",
+    description="Interact with the model. Enter a prompt and receive a response.",
)

if __name__ == "__main__":
    demo.launch(share=True)
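
Note on the committed respond(): TextStreamer only writes generated tokens to stdout, so the response list is never populated and "".join(response) hands an empty string back to Gradio. Also, depending on the installed transformers version, apply_chat_template(..., return_tensors="pt") returns a bare tensor rather than a dict unless return_dict=True is passed, in which case inputs["input_ids"] raises. Below is a minimal sketch (not part of the commit) of a respond() that returns the generated text, assuming model and tokenizer are loaded exactly as above and a transformers release that supports return_dict=True:

```python
# Hypothetical reworked respond() -- a sketch, not the committed code.
# Assumes `model` and `tokenizer` are already loaded as in app.py above,
# and that apply_chat_template accepts return_dict=True.
import torch


def respond(message, max_new_tokens, temperature, system_message=""):
    messages = [{"role": "system", "content": system_message}] if system_message else []
    messages.append({"role": "user", "content": message})

    # return_dict=True yields {"input_ids": ..., "attention_mask": ...}
    # instead of a bare tensor, so the dict-style indexing below works.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            do_sample=True,          # sample so the temperature slider has an effect
            temperature=temperature,
            use_cache=True,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
```

For token-by-token streaming in the UI itself, a generator that yields partial text (for example via transformers' TextIteratorStreamer running generate on a background thread) would be needed rather than TextStreamer.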