import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-v0.3"
token = os.getenv("HF_API_TOKEN")  # Must be set in the Space's secrets (the model is gated)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    token=token,
)

print("Loading model in 4-bit...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    device_map="auto",  # bitsandbytes 4-bit requires a GPU; a CPU-only (free) Space cannot load this config
    trust_remote_code=True,
    token=token,
)
model.eval()


def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,  # sampling must be enabled for temperature to take effect
            temperature=0.7,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=3, label="Your Prompt"),
    outputs=gr.Textbox(label="Mistral 7B Response"),
    title="Mistral 7B (4-bit) Chat",
    description=(
        "A minimal 4-bit Mistral 7B demo. "
        "Generation is slow, and large prompts can run out of memory."
    ),
)

if __name__ == "__main__":
    demo.launch()
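# ---------------------------------------------------------------------------
# A sketch of the Space's requirements.txt, inferred from the imports above
# (exact package versions are an assumption and are not given in the original):
#
#     gradio
#     torch
#     transformers
#     accelerate      # needed for device_map="auto"
#     bitsandbytes    # needed for 4-bit quantization
# ---------------------------------------------------------------------------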