Spaces:

ajsbsd
/

smollm2-zerocpu-demo

Running

File size: 8,558 Bytes

6d6c49f
2df6b47
6d6c49f
 
 
 
 
 
87e021b
 
6d6c49f
 
 
 
 
 
 
 
 
 
d32f90c
 
6d6c49f
 
 
 
 
 
2df6b47
6d6c49f
d32f90c
6d6c49f
 
d32f90c
6d6c49f
 
 
 
 
 
 
 
 
 
 
d32f90c
 
6d6c49f
3de9a17
 
6d6c49f
 
 
 
d32f90c
6d6c49f
 
 
 
 
ee2d859
6d6c49f
 
 
 
 
 
d32f90c
6d6c49f
 
 
 
d32f90c
 
6d6c49f
 
 
85c828a
 
6d6c49f
 
 
 
aca2abc
3de9a17
aca2abc
 
 
 
 
 
 
d32f90c
6d6c49f
 
d32f90c
6d6c49f
3de9a17
87e021b
85c828a
3de9a17
 
 
 
 
 
2df6b47
3de9a17
 
 
 
 
 
72d5687
 
 
 
 
 
 
3de9a17
 
 
72d5687
 
 
 
 
 
 
 
 
3de9a17
 
d32f90c
6d6c49f
85c828a
 
6d6c49f
 
 
3de9a17
 
 
 
 
6d6c49f
 
23078b2
6d6c49f
 
 
2df6b47
d32f90c
6d6c49f
 
d32f90c
3de9a17
6d6c49f
 
 
 
 
 
 
3eac2fe
6d6c49f
 
b797037
6d6c49f
3eac2fe
 
ee2d859
 
3eac2fe
ee2d859
 
 
 
 
 
 
6d6c49f
 
 
 
ee2d859
 
 
 
 
 
 
 
 
6d6c49f
3eac2fe
 
2df6b47

import gradio as gr
import torch
import os
import time

# --- Try to import ctransformers for GGUF, provide helpful message if not found ---
try:
    from ctransformers import AutoModelForCausalLM as AutoModelForCausalLM_GGUF
    # Import LLM directly as it's the actual type of the loaded model
    from ctransformers.llm import LLM 
    from transformers import AutoTokenizer, AutoModelForCausalLM 
    GGUF_AVAILABLE = True
except ImportError:
    GGUF_AVAILABLE = False
    print("WARNING: 'ctransformers' not found. This app relies on it for efficient CPU inference.")
    print("Please install it with: pip install ctransformers transformers")
    from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Configuration for Models and Generation ---
# Repo id of the standard Hugging Face model (and its tokenizer) that
# load_model_for_zerocpu() uses when the GGUF model is unavailable or fails to load.
ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
# Repo id and file name of the GGUF model that load_model_for_zerocpu() tries
# first when ctransformers could be imported.
GGUF_MODEL_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
GGUF_MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# --- Generation Parameters ---
# Passed to both generation paths in predict_chat().
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.7
TOP_K = 50
TOP_P = 0.95
DO_SAMPLE = True  # Only forwarded to the Hugging Face generate() path; the ctransformers call does not take it.

# Global model and tokenizer
# Both are assigned by load_model_for_zerocpu(); predict_chat() reports an
# error to the user while either one is still None.
model = None
tokenizer = None
# Target handed to .to() for the standard Hugging Face model and its inputs.
device = "cpu"

# --- Model Loading Function ---
def _load_tokenizer(model_id):
    """Load the tokenizer for *model_id* and make sure it has a pad token."""
    loaded = AutoTokenizer.from_pretrained(model_id)
    if loaded.pad_token is None:
        # Reuse EOS as padding so generate() always has a pad token id.
        loaded.pad_token = loaded.eos_token
    return loaded


def load_model_for_zerocpu():
    """Populate the global ``model`` and ``tokenizer`` for CPU-only inference.

    Load order:

    1. If ctransformers was importable (``GGUF_AVAILABLE``), try the GGUF
       model ``GGUF_MODEL_ID`` / ``GGUF_MODEL_FILENAME`` with ``gpu_layers=0``.
       Its tokenizer is taken from the TinyLlama chat repo the GGUF file was
       quantized from, so that the chat template applied in ``predict_chat``
       belongs to the model that actually generates the text.
    2. Otherwise, or if step 1 raises, load the standard Hugging Face model
       ``ORIGINAL_MODEL_ID`` together with its own tokenizer and move the
       model to ``device``.

    The two globals are always assigned as a matching pair. If every attempt
    fails, both are set to ``None`` and the error is printed; nothing is
    raised to the caller.
    """
    global model, tokenizer

    # Source repo of the GGUF weights; it provides the matching tokenizer and
    # chat template (the GGUF repo itself is loaded through ctransformers only).
    gguf_tokenizer_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    if GGUF_AVAILABLE:
        print(f"Attempting to load GGUF model '{GGUF_MODEL_ID}' (file: '{GGUF_MODEL_FILENAME}') for ZeroCPU...")
        try:
            gguf_model = AutoModelForCausalLM_GGUF.from_pretrained(
                GGUF_MODEL_ID,
                model_file=GGUF_MODEL_FILENAME,
                model_type="llama",
                gpu_layers=0,
            )
            gguf_tokenizer = _load_tokenizer(gguf_tokenizer_id)
        except Exception as e:
            print(f"WARNING: Could not load GGUF model '{GGUF_MODEL_ID}' from '{GGUF_MODEL_FILENAME}': {e}")
            print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
        else:
            # Commit both globals together so a half-loaded pair is never visible.
            model, tokenizer = gguf_model, gguf_tokenizer
            print(f"GGUF model '{GGUF_MODEL_ID}' loaded successfully for CPU.")
            return
    else:
        print("WARNING: ctransformers is not available. Will load standard Hugging Face model directly.")

    print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
    try:
        hf_model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
        hf_tokenizer = _load_tokenizer(ORIGINAL_MODEL_ID)
        hf_model.to(device)
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load standard model '{ORIGINAL_MODEL_ID}' on CPU: {e}")
        print("Please ensure the model ID is correct, you have enough RAM, and dependencies are installed.")
        model = None
        tokenizer = None
        return

    model, tokenizer = hf_model, hf_tokenizer
    print(f"Standard model '{ORIGINAL_MODEL_ID}' loaded successfully on CPU.")

# --- Inference Function for Gradio ChatInterface ---
def predict_chat(message: str, history: list):
    """Generate the assistant reply for one chat turn.

    This is a generator, as expected by ``gr.ChatInterface``: every yielded
    string replaces the reply shown so far.

    Args:
        message: The text the user just submitted.
        history: Earlier turns as dicts with ``"role"`` and ``"content"`` keys
            (the format produced by ``gr.Chatbot(type='messages')``).

    Yields:
        With a ctransformers model, the growing reply after every streamed
        token (or the complete reply once, if streaming raised). With a
        standard Hugging Face model, the complete reply once. If the global
        model or tokenizer is missing, a single error message.
    """
    print(f"Model type in predict_chat: {type(model)}")

    if model is None or tokenizer is None:
        yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
        return

    # System prompt, then prior turns, then the new user message. Only role and
    # content are forwarded so any extra keys on the history entries never
    # reach the chat template.
    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
    messages.extend(
        {"role": turn["role"], "content": turn["content"]} for turn in (history or [])
    )
    messages.append({"role": "user", "content": message})

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # LLM is only defined when the ctransformers import succeeded, hence the
    # GGUF_AVAILABLE guard in front of the isinstance check.
    if GGUF_AVAILABLE and isinstance(model, LLM):
        print("Using GGUF model generation path.")

        stop_sequences = ["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
        # Also stop if the model writes out the tokenizer's end-of-turn marker
        # as plain text instead of emitting the EOS token itself.
        eos_text = getattr(tokenizer, "eos_token", None)
        if eos_text and eos_text not in stop_sequences:
            stop_sequences.append(eos_text)

        # Shared by the streaming call and its non-streaming fallback.
        # do_sample is deliberately absent: ctransformers' LLM.__call__ rejects it.
        gguf_kwargs = {
            "max_new_tokens": MAX_NEW_TOKENS,
            "temperature": TEMPERATURE,
            "top_k": TOP_K,
            "top_p": TOP_P,
            "repetition_penalty": 1.1,
            "stop": stop_sequences,
        }

        generated_text = ""
        try:
            for token in model(prompt, stream=True, **gguf_kwargs):
                generated_text += token
                yield generated_text
        except Exception as e:
            print(f"Error in GGUF streaming generation: {e}")
            # Without stream=True the call returns the complete reply as one
            # string; it replaces whatever partial text was shown so far.
            generated_text = model(prompt, **gguf_kwargs)
            yield generated_text

    else:
        print("Using standard Hugging Face model generation path.")
        # Calling the tokenizer (rather than .encode) also returns the
        # attention mask, which generate() needs to tell padding from content
        # when the pad token is the same as the EOS token.
        encoded = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = encoded["input_ids"]

        # generate() produces the whole reply in one call, so this path yields
        # exactly once instead of streaming token by token.
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=encoded.get("attention_mask"),
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
            do_sample=DO_SAMPLE,
            pad_token_id=tokenizer.pad_token_id,
        )
        # The output starts with the prompt tokens; decode only what follows.
        prompt_length = input_ids.shape[-1]
        generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
        yield generated_text

    elapsed = time.perf_counter() - start_time
    print(f"Inference Time for this turn: {elapsed:.2f} seconds")

# --- Gradio Interface Setup ---
if __name__ == "__main__":
    # Fill the global model/tokenizer before any UI component is created.
    load_model_for_zerocpu()

    # Greeting shown as the first assistant message in the chat window.
    greeting = (
        "Hello! I'm an AI assistant. I'm currently running in a CPU-only "
        "environment for efficient demonstration. How can I help you today?"
    )
    opening_history = [{"role": "assistant", "content": greeting}]

    space_description = (
        f"This Space demonstrates an LLM for efficient CPU-only inference. "
        f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
        f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
        f"without GGUF. Expect varied responses each run due to randomized generation."
    )

    example_prompts = [
        ["What is the capital of France?"],
        ["Can you tell me a fun fact about outer space?"],
        ["What's the best way to stay motivated?"],
    ]

    # 'messages' mode makes the chatbot exchange role/content dicts, which is
    # the history format predict_chat() consumes.
    chat_window = gr.Chatbot(height=500, type='messages')
    question_box = gr.Textbox(
        placeholder="Ask me a question...",
        container=False,
        scale=7,
    )

    demo = gr.ChatInterface(
        fn=predict_chat,
        chatbot=chat_window,
        textbox=question_box,
        title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
        description=space_description,
        theme="soft",
        examples=example_prompts,
        cache_examples=False,
    )

    # Seed the conversation with the greeting once the interface exists.
    demo.chatbot.value = opening_history

    demo.launch()