michailroussos committed 2ac2f15 (1 parent: 4b8149f)

changed code to run with no GPU

Files changed (2):
  1. app.py +23 -34
  2. requirements.txt +4 -4
app.py CHANGED
@@ -1,59 +1,47 @@
 import gradio as gr
 from unsloth import FastLanguageModel
-from transformers import TextStreamer
 
-# Load the model and tokenizer locally
+import torch
+
+# Set device to CPU
+device = torch.device("cpu")
+
+model_name_or_path = "michailroussos/model_llama_8d"
 max_seq_length = 2048
 dtype = None
-model_name_or_path = "michailroussos/model_llama_8d"
 
-# Load model and tokenizer using unsloth
+# Load the model and tokenizer on the CPU. from_pretrained returns a
+# (model, tokenizer) tuple, so the model is moved to the device after
+# unpacking rather than by calling .to(device) on the tuple itself.
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=model_name_or_path,
+    model_name=model_name_or_path,  # Your model path
     max_seq_length=max_seq_length,
     dtype=dtype,
-    load_in_4bit=True,
+    load_in_4bit=False,  # bitsandbytes 4-bit quantization requires a GPU
 )
-FastLanguageModel.for_inference(model)  # Enable optimized inference
+model = model.to(device)  # Make sure the model is on CPU
+
+# Enable native faster inference if possible
+FastLanguageModel.for_inference(model)
 
-# Define the response function
+# Define the inference function
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Build the chat message history
     messages = [{"role": "system", "content": system_message}]
+
     for val in history:
-        if val[0]:  # User message
+        if val[0]:
             messages.append({"role": "user", "content": val[0]})
-        if val[1]:  # Assistant message
+        if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
+
     messages.append({"role": "user", "content": message})
 
-    # Tokenize the input messages
-    inputs = tokenizer.apply_chat_template(
-        messages,
-        tokenize=True,
-        add_generation_prompt=True,  # Required for generation
-        return_tensors="pt",
-    ).to("cuda")
-
-    # Initialize a TextStreamer for streaming output
-    text_streamer = TextStreamer(tokenizer, skip_prompt=True)
-
-    # Generate the model's response
-    response = ""
-    for output in model.generate(
-        input_ids=inputs,
-        streamer=text_streamer,
-        max_new_tokens=max_tokens,
-        use_cache=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = tokenizer.decode(output, skip_special_tokens=True)
-        response += token
-        yield response
+    # Perform inference on CPU. apply_chat_template with return_tensors="pt"
+    # returns a tensor of input ids, not a dict.
+    inputs = tokenizer.apply_chat_template(
+        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+    ).to(device)
+    output_ids = model.generate(
+        input_ids=inputs,
+        max_new_tokens=max_tokens,
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    # Decode only the newly generated tokens, skipping the prompt
+    response = tokenizer.decode(output_ids[0, inputs.shape[-1]:], skip_special_tokens=True)
+    yield response
 
-
-# Define the Gradio interface
+# Create Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -64,5 +52,6 @@ demo = gr.ChatInterface(
     ],
 )
 
+# Launch Gradio app
 if __name__ == "__main__":
     demo.launch()
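
Note: the committed respond() now yields the reply in one piece, since the TextStreamer path was dropped. If token-by-token streaming is still wanted on CPU, transformers' TextIteratorStreamer can feed the generator from a background thread. A minimal sketch, not part of this commit, reusing the model, tokenizer, and device globals from app.py (the respond_streaming name is illustrative):

    from threading import Thread
    from transformers import TextIteratorStreamer

    def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
        messages = [{"role": "system", "content": system_message}]
        for user_msg, assistant_msg in history:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})
        messages.append({"role": "user", "content": message})

        inputs = tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
        ).to(device)

        # The streamer yields decoded text chunks as generate() produces tokens
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        thread = Thread(target=model.generate, kwargs=dict(
            input_ids=inputs,
            streamer=streamer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        ))
        thread.start()

        response = ""
        for chunk in streamer:
            response += chunk
            yield response  # Gradio updates the chat window incrementally
        thread.join()

Because generate() runs in its own thread, the for loop can consume chunks from the streamer as they arrive, which restores the incremental chat updates the old TextStreamer version was aiming for.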
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-unsloth
-transformers
-gradio
-bitsandbytes
+unsloth==2024.12.4
+transformers==4.47.0
+gradio==5.8.0
+bitsandbytes==0.45.0
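
Pinning exact versions makes the Space rebuild reproducibly. As a quick sanity check that the resolved environment matches the pins and that no CUDA device is being picked up, something like the following could be logged at startup (illustrative, not part of the commit):

    import torch
    import transformers
    import gradio

    # Log the resolved environment; CUDA should be unavailable on a CPU-only Space
    print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
    print("transformers:", transformers.__version__)
    print("gradio:", gradio.__version__)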