michailroussos committed
Commit 3dc2f1d · 1 Parent(s): 35ddf38
Files changed (1)
  1. app.py +38 -98
app.py CHANGED
@@ -1,124 +1,64 @@
  import gradio as gr
  from unsloth import FastLanguageModel
- import torch

- # Load the model and tokenizer locally
  max_seq_length = 2048
  model_name_or_path = "michailroussos/model_llama_8d"

- # Load model and tokenizer using unsloth
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name=model_name_or_path,
      max_seq_length=max_seq_length,
      load_in_4bit=True,
  )
- FastLanguageModel.for_inference(model)  # Enable optimized inference

- # Define the response function
- def respond(message, history, system_message, max_tokens, temperature, top_p):
-     print("\n" + "="*50)
-     print("===== RESPOND FUNCTION CALLED =====")
-     print("="*50)
-
-     # Print input parameters
-     print(f"Input Message: {message}")
-     print(f"System Message: {system_message}")
-     print(f"Max Tokens: {max_tokens}")
-     print(f"Temperature: {temperature}")
-     print(f"Top-p: {top_p}")
-
-     # Debug history
-     print("\n--- Current History ---")
-     print(f"History Type: {type(history)}")
-     print(f"History Content: {history}")
-
-     # Ensure history is formatted as a list of dictionaries
-     messages = [{"role": "system", "content": system_message}]  # Add system message at the start

      try:
-         if history:
-             print("\n--- Processing Existing History ---")
-             for entry in history:
-                 # Ensure each history entry is in the correct format
-                 if isinstance(entry, dict) and 'role' in entry and 'content' in entry:
-                     messages.append(entry)
-                 else:
-                     print(f"Skipping malformed history entry: {entry}")

-         # Add the current user message
-         print("\n--- Adding Current Message ---")
-         messages.append({"role": "user", "content": message})
-
-         # Debug messages before tokenization
-         print("\n--- Messages Before Tokenization ---")
-         for msg in messages:
-             print(f"Role: {msg['role']}, Content: {msg['content'][:100]}...")
-
-         # Tokenize the input
-         print("\n--- Tokenizing Input ---")
          inputs = tokenizer.apply_chat_template(
              messages,
              tokenize=True,
              add_generation_prompt=True,
              return_tensors="pt",
-         ).to("cuda" if torch.cuda.is_available() else "cpu")
-
-         print(f"Tokenized Inputs Shape: {inputs.shape}")
-         print(f"Tokenized Inputs Device: {inputs.device}")
-
          # Generate response
-         attention_mask = inputs.ne(tokenizer.pad_token_id).long()
-
-         try:
-             generated_tokens = model.generate(
-                 input_ids=inputs,
-                 attention_mask=attention_mask,
-                 max_new_tokens=max_tokens,
-                 use_cache=True,
-                 temperature=temperature,
-                 top_p=top_p,
-             )
-
-             # Decode the generated response
-             response = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-             print("\n--- Generated Response ---")
-             print(f"Raw Response: {response}")
-
-             # Prepare return history in OpenAI messages format
-             return_messages = [{"role": "user", "content": message},
-                                {"role": "assistant", "content": response}]
-
-             # Add previous conversation turns if any
-             for entry in (history or []):
-                 return_messages.insert(0, {"role": entry['role'], "content": entry['content']})
-
-             print("\n--- Return Messages ---")
-             for msg in return_messages:
-                 print(f"Role: {msg['role']}, Content: {msg['content'][:100]}...")
-
-             return return_messages
-
-         except Exception as gen_error:
-             print("\n--- GENERATION ERROR ---")
-             print(f"Error during model generation: {gen_error}")
-             return []
-
-     except Exception as prep_error:
-         print("\n--- PREPARATION ERROR ---")
-         print(f"Error during message preparation: {prep_error}")
-         return []

- # Define the Gradio interface
  demo = gr.ChatInterface(
-     fn=respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a helpful assistant.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
-     ],
-     type="messages"  # Explicitly set to messages type
  )

  if __name__ == "__main__":
-     demo.launch(share=False)  # Use share=False for local testing
 
  import gradio as gr
+ from transformers import TextStreamer
  from unsloth import FastLanguageModel

+ # Define constants
  max_seq_length = 2048
+ dtype = None
  model_name_or_path = "michailroussos/model_llama_8d"

+ # Load the model and tokenizer
  model, tokenizer = FastLanguageModel.from_pretrained(
      model_name=model_name_or_path,
      max_seq_length=max_seq_length,
+     dtype=dtype,
      load_in_4bit=True,
  )

+ # Optimize model for inference
+ FastLanguageModel.for_inference(model)

+ # Function to generate a response
+ def chat_with_model(user_message, chat_history=None):
      try:
+         # Prepare the input messages
+         messages = [{"role": "user", "content": user_message}]

+         # Tokenize and prepare inputs for the model
          inputs = tokenizer.apply_chat_template(
              messages,
              tokenize=True,
              add_generation_prompt=True,
              return_tensors="pt",
+         ).to("cuda")
+
          # Generate response
+         text_streamer = TextStreamer(tokenizer, skip_prompt=True)
+         output = model.generate(
+             input_ids=inputs,
+             streamer=text_streamer,
+             max_new_tokens=128,
+             use_cache=True,
+             temperature=1.5,
+             min_p=0.1,
+         )
+
+         # Append the response to the chat history
+         if chat_history is None:
+             chat_history = []
+         chat_history.append((user_message, output))
+         return "", chat_history
+     except Exception as e:
+         return f"Error: {str(e)}", chat_history

+ # Create the chat interface
  demo = gr.ChatInterface(
+     fn=chat_with_model,
+     chatbot=gr.Chatbot(label="Chat with Hugging Face Model"),
+     title="Hugging Face Chat Model",
+     description="Chat with a Hugging Face model using FastLanguageModel.",
  )

+ # Launch the app
  if __name__ == "__main__":
+     demo.launch()
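
Note on the new chat_with_model: model.generate returns a tensor of token IDs (prompt plus completion), and gr.ChatInterface calls its fn with (message, history) and expects the assistant reply back as text. Below is a minimal sketch of a response function along those lines, assuming the same model and tokenizer loaded above; the name respond and the prompt-stripping slice are illustrative, not part of this commit.

def respond(message, history):
    # Build the chat prompt for a single-turn reply (sketch; history handling omitted).
    messages = [{"role": "user", "content": message}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    output = model.generate(
        input_ids=inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=1.5,
        min_p=0.1,
    )
    # generate() returns prompt + completion token IDs; keep only the new tokens and decode them.
    new_tokens = output[0][inputs.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

Wired up as gr.ChatInterface(fn=respond), the decoded text is what the chatbot would display.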