import gradio as gr
import os
import torch
from unsloth import FastLanguageModel
import spaces  # Hugging Face Spaces helper package (provides the spaces.GPU decorator for ZeroGPU)

# Get the Hugging Face token from environment variables
HF_TOKEN = os.environ.get("HF_TOKEN")

# Check whether we are running in a Hugging Face Space with GPU constraints
IS_SPACES_ZERO = os.environ.get("SPACES_ZERO_GPU", "0") == "1"
IS_SPACE = os.environ.get("SPACE_ID", None) is not None

# Determine the device (use a GPU if one is available)
device = "cuda" if torch.cuda.is_available() else "cpu"
LOW_MEMORY = os.getenv("LOW_MEMORY", "0") == "1"
print(f"Using device: {device}")
print(f"Low memory mode: {LOW_MEMORY}")

# Model configuration
max_seq_length = 2048  # Max sequence length for RoPE scaling
dtype = torch.float16 if device == "cuda" else torch.float32
load_in_4bit = True  # Enable 4-bit quantization if memory is limited

# Load the model and tokenizer with device mapping
model_name = "nafisneehal/chandler_bot"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    device_map="auto" if device == "cuda" else None,  # Automatic GPU mapping
)
FastLanguageModel.for_inference(model)  # Optimize the model for faster inference

# Define the prompt structure (update if necessary for your model)
alpaca_prompt = "{instruction} {input} {output}"
instruction_text = (
    "Learn how to talk like Chandler - a popular character from FRIENDS TV Show. "
    "Input is someone saying something, Output is what Chandler saying in response."
)
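
# Illustrative example (not in the original code): with the names defined above,
#   alpaca_prompt.format(instruction=instruction_text, input="Hi, how are you?", output="")
# produces the single prompt string fed to the model, leaving the output slot
# empty for the model to complete.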

# Use the GPU provided by Hugging Face Spaces if available
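# Note (assumption, not in the original code): on a ZeroGPU Space, the function
# that touches the GPU is typically wrapped with the spaces.GPU decorator, e.g.
#   @spaces.GPU
#   def generate_response(user_input, chat_history): ...
# On regular GPU or CPU hardware the decorator is unnecessary.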
def generate_response(user_input, chat_history):
    instruction = instruction_text  # Fixed task description defined above
    input_text = user_input         # The user's message fills the input slot

    # Prepare inputs for model inference on the correct device
    inputs = tokenizer(
        [alpaca_prompt.format(instruction=instruction, input=input_text, output="")],
        return_tensors="pt",
    ).to(device)  # Ensure tensors are on the correct device

    # Generate the response on GPU or CPU as appropriate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode only the newly generated tokens so the prompt is not echoed back
    bot_reply = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )

    # Append the (user message, bot reply) pair expected by gr.Chatbot
    chat_history.append((user_input, bot_reply))
    return chat_history, ""  # Return the updated history and clear the input box

# Set up the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Llama-Based Chatbot on GPU")
    chat_history = gr.Chatbot(label="Chat History")
    user_input = gr.Textbox(
        placeholder="Type your message here...", label="Your Message"
    )

    # Connect the submit action and the Send button to the response function
    user_input.submit(
        generate_response, [user_input, chat_history], [chat_history, user_input]
    )
    submit_btn = gr.Button("Send")
    submit_btn.click(
        generate_response, [user_input, chat_history], [chat_history, user_input]
    )

demo.launch()
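
# Optional (assumption, not in the original code): if the Space receives
# concurrent requests, enabling Gradio's request queue before launching is a
# common pattern:
#   demo.queue().launch()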