michailroussos committed 2ac2f15 (1 parent: 4b8149f)

changed code to run with no GPU

Files changed (2):
  1. app.py +23 -34
  2. requirements.txt +4 -4
app.py CHANGED
@@ -1,59 +1,47 @@
 import gradio as gr
 from unsloth import FastLanguageModel
-from transformers import TextStreamer
 
-# Load the model and tokenizer locally
+import torch
+
+# Set device to CPU
+device = torch.device("cpu")
+
+model_name_or_path = "michailroussos/model_llama_8d"
 max_seq_length = 2048
 dtype = None
-model_name_or_path = "michailroussos/model_llama_8d"
 
-# Load model and tokenizer using unsloth
+# Load the model and tokenizer on the CPU. from_pretrained returns a
+# (model, tokenizer) tuple, so the model is moved to the device after
+# unpacking rather than by calling .to(device) on the tuple itself.
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=model_name_or_path,
+    model_name=model_name_or_path,  # Your model path
     max_seq_length=max_seq_length,
     dtype=dtype,
-    load_in_4bit=True,
+    load_in_4bit=False,  # bitsandbytes 4-bit quantization requires a GPU
 )
-FastLanguageModel.for_inference(model)  # Enable optimized inference
+model = model.to(device)  # Make sure the model is on CPU
+
+# Enable native faster inference if possible
+FastLanguageModel.for_inference(model)
 
-# Define the response function
+# Define the inference function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Build the chat message history
     messages = [{"role": "system", "content": system_message}]
+
     for val in history:
-        if val[0]:  # User message
+        if val[0]:
             messages.append({"role": "user", "content": val[0]})
-        if val[1]:  # Assistant message
+        if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
+
     messages.append({"role": "user", "content": message})
 
-    # Tokenize the input messages
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,  # Required for generation
-        return_tensors="pt",
-    ).to("cuda")
-
-    # Initialize a TextStreamer for streaming output
-    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
-
-    # Generate the model's response
-    response = ""
-    for output in model.generate(
-        input_ids=inputs,
-        streamer=text_streamer,
-        max_new_tokens=max_tokens,
-        use_cache=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = tokenizer.decode(output, skip_special_tokens=True)
-        response += token
-        yield response
+    # Perform inference on CPU. apply_chat_template with return_tensors="pt"
+    # returns a tensor of input ids, not a dict.
+    inputs = tokenizer.apply_chat_template(
+        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+    ).to(device)
+    output_ids = model.generate(
+        input_ids=inputs,
+        max_new_tokens=max_tokens,
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    # Decode only the newly generated tokens, skipping the prompt
+    response = tokenizer.decode(output_ids[0, inputs.shape[-1]:], skip_special_tokens=True)
+    yield response
 
-
-# Define the Gradio interface
+# Create Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -64,5 +52,6 @@ demo = gr.ChatInterface(
     ],
 )
 
+# Launch Gradio app
 if __name__ == "__main__":
     demo.launch()
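
Note: the committed respond() now yields the reply in one piece, since the TextStreamer path was dropped. If token-by-token streaming is still wanted on CPU, transformers' TextIteratorStreamer can feed the generator from a background thread. A minimal sketch, not part of this commit, reusing the model, tokenizer, and device globals from app.py (the respond_streaming name is illustrative):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
        messages = [{"role": "system", "content": system_message}]
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": message})

        inputs = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to(device)

        # The streamer yields decoded text chunks as generate() produces tokens
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=model.generate, kwargs=dict(
            input_ids=inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        ))
        thread.start()

        response = ""
        for chunk in streamer:
            response += chunk
            yield response  # Gradio updates the chat window incrementally
        thread.join()

Because generate() runs in its own thread, the for loop can consume chunks from the streamer as they arrive, which restores the incremental chat updates the old TextStreamer version was aiming for.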
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-unsloth
-transformers
-gradio
-bitsandbytes
+unsloth==2024.12.4
+transformers==4.47.0
+gradio==5.8.0
+bitsandbytes==0.45.0
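
Pinning exact versions makes the Space rebuild reproducibly. As a quick sanity check that the resolved environment matches the pins and that no CUDA device is being picked up, something like the following could be logged at startup (illustrative, not part of the commit):

    import torch
    import transformers
    import gradio

    # Log the resolved environment; CUDA should be unavailable on a CPU-only Space
    print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
    print("transformers:", transformers.__version__)
    print("gradio:", gradio.__version__)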