from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
import os

# Define the FastAPI app
app = FastAPI()

# Path to the GGUF model file
MODEL_NAME = "SmolVLM-500M-Instruct-GGUF.Q4_K_M.gguf"
MODEL_PATH = f"./{MODEL_NAME}"

# Download the model from the Hub if it's not present
if not os.path.exists(MODEL_PATH):
    from huggingface_hub import hf_hub_download
    hf_hub_download(
        repo_id="ggml-org/SmolVLM-500M-Instruct-GGUF",
        filename=MODEL_NAME,
        local_dir=".",
        local_dir_use_symlinks=False,
    )

# Load the Llama model once at startup; keep a None sentinel if loading fails
try:
    llm = Llama(model_path=MODEL_PATH, n_ctx=2048, verbose=False)
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None


class InferenceRequest(BaseModel):
    prompt: str


@app.post("/generate")
def generate_text(request: InferenceRequest):
    if llm is None:
        raise HTTPException(status_code=500, detail="Model not loaded")
    try:
        output = llm.create_completion(
            prompt=request.prompt,
            max_tokens=256,
            stop=["<|im_end|>"],  # adjust stop sequences to the model's chat template
            temperature=0.7,
        )
        return {"text": output["choices"][0]["text"].strip()}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/")
def health_check():
    return {"status": "ok", "model_loaded": llm is not None}
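
# --- Usage sketch ---
# Assumptions: this file is saved as app.py, and fastapi, uvicorn,
# llama-cpp-python, and huggingface_hub are installed.
#
# Start the server:
#
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
# Query the /generate endpoint (the JSON body matches InferenceRequest):
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "What is the capital of France?"}'
#
# Check that the model loaded via the health endpoint:
#
#   curl http://localhost:8000/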