import gradio as gr
import torch
import os
import time
# --- Try to import ctransformers for GGUF, provide helpful message if not found ---
try:
    from ctransformers import AutoModelForCausalLM as AutoModelForCausalLM_GGUF
    # Import LLM directly as it's the actual type of the loaded model
    from ctransformers.llm import LLM
    from transformers import AutoTokenizer, AutoModelForCausalLM
    GGUF_AVAILABLE = True
except ImportError:
    GGUF_AVAILABLE = False
    print("WARNING: 'ctransformers' not found. This app relies on it for efficient CPU inference.")
    print("Please install it with: pip install ctransformers transformers")
    from transformers import AutoTokenizer, AutoModelForCausalLM
# --- Configuration for Models and Generation ---
ORIGINAL_MODEL_ID = "HuggingFaceTB/SmolLM2-360M-Instruct"
GGUF_MODEL_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
GGUF_MODEL_FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
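# Q4_K_M is a 4-bit GGUF quantization variant; it keeps the memory footprint small
# enough for CPU-only Spaces at a modest quality cost.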
# --- Generation Parameters ---
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.7
TOP_K = 50
TOP_P = 0.95
DO_SAMPLE = True  # Only used by the Hugging Face transformers generate() path; ctransformers does not accept it
# Global model and tokenizer
model = None
tokenizer = None
device = "cpu"
# --- Model Loading Function ---
def load_model_for_zerocpu():
    global model, tokenizer, device

    if GGUF_AVAILABLE:
        print(f"Attempting to load GGUF model '{GGUF_MODEL_ID}' (file: '{GGUF_MODEL_FILENAME}') for ZeroCPU...")
        try:
            model = AutoModelForCausalLM_GGUF.from_pretrained(
                GGUF_MODEL_ID,
                model_file=GGUF_MODEL_FILENAME,
                model_type="llama",
                gpu_layers=0
            )
            # ctransformers does not provide a Hugging Face tokenizer object, so we load
            # the original model's tokenizer for consistency and chat-template application.
            tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            print(f"GGUF model '{GGUF_MODEL_ID}' loaded successfully for CPU.")
            return
        except Exception as e:
            print(f"WARNING: Could not load GGUF model '{GGUF_MODEL_ID}' from '{GGUF_MODEL_FILENAME}': {e}")
            print(f"Falling back to standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU (will be slower without GGUF quantization).")
    else:
        print("WARNING: ctransformers is not available. Will load the standard Hugging Face model directly.")

    print(f"Loading standard Hugging Face model '{ORIGINAL_MODEL_ID}' for CPU...")
    try:
        model = AutoModelForCausalLM.from_pretrained(ORIGINAL_MODEL_ID)
        tokenizer = AutoTokenizer.from_pretrained(ORIGINAL_MODEL_ID)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.to(device)
        print(f"Standard model '{ORIGINAL_MODEL_ID}' loaded successfully on CPU.")
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load standard model '{ORIGINAL_MODEL_ID}' on CPU: {e}")
        print("Please ensure the model ID is correct, you have enough RAM, and dependencies are installed.")
        model = None
        tokenizer = None
# --- Inference Function for Gradio ChatInterface ---
def predict_chat(message: str, history: list):
    print(f"Model type in predict_chat: {type(model)}")
    if model is None or tokenizer is None:
        yield "Error: Model or tokenizer failed to load. Please check the Space logs for details."
        return

    # Start with the system message, then extend with the existing history.
    # gr.Chatbot(type='messages') passes history as a list of dictionaries with
    # 'role' and 'content' keys, which is exactly what apply_chat_template expects.
    messages = [{"role": "system", "content": "You are a friendly chatbot."}]
    messages.extend(history)
    # Append the current user message
    messages.append({"role": "user", "content": message})

    generated_text = ""
    start_time = time.time()

    # Check against ctransformers.llm.LLM directly, since that is the concrete type
    # returned by AutoModelForCausalLM_GGUF.from_pretrained().
    if GGUF_AVAILABLE and isinstance(model, LLM):
        print("Using GGUF model generation path.")
        # Apply the chat template for the GGUF model as well; ctransformers just takes
        # a prompt string, and for Llama-based models the tokenizer's chat template
        # should produce a usable one.
        prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        try:
            # do_sample is omitted because ctransformers' LLM.__call__() does not accept it
            for token in model(
                prompt_input,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                top_k=TOP_K,
                top_p=TOP_P,
                repetition_penalty=1.1,
                stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
                stream=True
            ):
                generated_text += token
                yield generated_text
        except Exception as e:
            print(f"Error in GGUF streaming generation: {e}")
            # Fall back to non-streaming generation; without stream=True the call
            # returns the complete string.
            output = model(
                prompt_input,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=TEMPERATURE,
                top_k=TOP_K,
                top_p=TOP_P,
                repetition_penalty=1.1,
                stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
            )
            generated_text = output
            yield generated_text
    else:
        print("Using standard Hugging Face model generation path.")
        input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
        # Note: transformers' model.generate() does not stream token by token the way
        # ctransformers does; true streaming needs a TextIteratorStreamer or a custom
        # generation loop (see the optional sketch after this function). Here we
        # generate the full response and yield it once.
        outputs = model.generate(
            inputs,
            max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
            do_sample=DO_SAMPLE,
            pad_token_id=tokenizer.pad_token_id
        )
        generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
        yield generated_text

    end_time = time.time()
    print(f"Inference Time for this turn: {end_time - start_time:.2f} seconds")
# --- Gradio Interface Setup ---
if __name__ == "__main__":
load_model_for_zerocpu()
initial_messages_for_value = [{"role": "assistant", "content":
"Hello! I'm an AI assistant. I'm currently running in a CPU-only "
"environment for efficient demonstration. How can I help you today?"
}]
chatbot_component = gr.Chatbot(height=500, type='messages')
demo = gr.ChatInterface(
fn=predict_chat,
chatbot=chatbot_component,
textbox=gr.Textbox(
placeholder="Ask me a question...",
container=False,
scale=7
),
title="SmolLM2-360M-Instruct (or TinyLlama GGUF) on ZeroCPU",
description=(
f"This Space demonstrates an LLM for efficient CPU-only inference. "
f"**Note:** For ZeroCPU, this app prioritizes `{GGUF_MODEL_ID}` (a GGUF-quantized model "
f"like TinyLlama) due to better CPU performance than `{ORIGINAL_MODEL_ID}` "
f"without GGUF. Expect varied responses each run due to randomized generation."
),
theme="soft",
examples=[
["What is the capital of France?"],
["Can you tell me a fun fact about outer space?"],
["What's the best way to stay motivated?"],
],
cache_examples=False,
)
demo.chatbot.value = initial_messages_for_value
demo.launch() |
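
# A plausible requirements.txt for this Space, inferred from the imports above
# (package list only; versions are not specified by the source):
#   gradio
#   torch
#   transformers
#   ctransformers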