michailroussos committed
Commit bad2083 · 1 Parent(s): ebd9e26
Files changed (1)
  1. app.py (+4 -16)

app.py CHANGED
@@ -1,25 +1,21 @@
 import gradio as gr
 from unsloth import FastLanguageModel
-from transformers import TextStreamer
 import torch
 
 # Load the model and tokenizer locally
 max_seq_length = 2048
-dtype = None
 model_name_or_path = "michailroussos/model_llama_8d"
 
 # Load model and tokenizer using unsloth
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name=model_name_or_path,
     max_seq_length=max_seq_length,
-    dtype=dtype,
     load_in_4bit=True,
 )
 FastLanguageModel.for_inference(model)  # Enable optimized inference
 
 # Define the response function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Combine system message and conversation history
     messages = [{"role": "system", "content": system_message}]
     for user_msg, assistant_msg in history:
         if user_msg:
@@ -28,17 +24,15 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
             messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
 
-    # Tokenize inputs
     inputs = tokenizer.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_tensors="pt",
     ).to("cuda" if torch.cuda.is_available() else "cpu")
-
-    attention_mask = inputs.ne(tokenizer.pad_token_id).long()  # Explicitly set attention mask
-
-    # Generate response tokens
+
+    attention_mask = inputs.ne(tokenizer.pad_token_id).long()
+
     generated_tokens = model.generate(
         input_ids=inputs,
         attention_mask=attention_mask,
@@ -47,15 +41,9 @@ def respond(message, history, system_message, max_tokens, temperature, top_p):
         temperature=temperature,
         top_p=top_p,
     )
-
-    # Decode generated tokens
     response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-
-    # Yield response in the required Gradio format
     yield [{"role": "assistant", "content": response}]
 
-
-
 # Define the Gradio interface
 demo = gr.ChatInterface(
     respond,
@@ -65,7 +53,7 @@ demo = gr.ChatInterface(
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
-    type="messages"
+    type="messages",
 )
 
 if __name__ == "__main__":
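A note on the re-added `attention_mask` (new line 34): `apply_chat_template(..., return_tensors="pt")` returns a tensor of token ids, and `ne(pad_token_id)` marks every non-pad position as attended, which avoids the missing-attention-mask warning `generate` can emit when the pad and EOS tokens coincide, as they often do for Llama-family tokenizers. Below is a minimal, self-contained sketch of that mask computation, using toy token ids and an assumed `pad_token_id` of 0 rather than this model's real one:

```python
import torch

# Toy illustration of the mask the commit computes (hypothetical token ids;
# pad_token_id = 0 is an assumption for this sketch, not the model's real pad id).
pad_token_id = 0
input_ids = torch.tensor([
    [101, 2054, 2003, 102, 0, 0],     # right-padded to length 6
    [101, 2129, 2024, 2017, 102, 0],
])
attention_mask = input_ids.ne(pad_token_id).long()  # 1 = real token, 0 = padding
print(attention_mask)
# tensor([[1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1, 0]])
```

For a single unpadded prompt the mask is typically all ones, so the explicit mask mainly matters if batched or padded inputs are introduced later.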
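One caveat alongside the `type="messages",` change: with Gradio's messages format, `ChatInterface` passes `history` as a list of `{"role": ..., "content": ...}` dicts rather than `(user, assistant)` tuples, so the `for user_msg, assistant_msg in history:` loop kept by this commit would likely need a matching update. A hedged sketch of what that could look like (`build_messages` is a hypothetical helper, not part of this commit):

```python
# Hypothetical helper assuming Gradio's type="messages" history format,
# where each history entry is already a {"role": ..., "content": ...} dict.
def build_messages(message, history, system_message):
    messages = [{"role": "system", "content": system_message}]
    messages.extend(history)  # dicts pass through unchanged
    messages.append({"role": "user", "content": message})
    return messages

# Example call with a one-exchange history:
print(build_messages(
    "How are you?",
    [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}],
    "You are a helpful assistant.",
))
```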