File size: 1,342 Bytes
9c08c69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os

from fastapi import FastAPI, HTTPException
from llama_cpp import Llama
from pydantic import BaseModel

# Define the FastAPI app
app = FastAPI()

# Path to the GGUF model file; the model is expected to live next to this
# script in the current working directory.
MODEL_NAME = "SmolVLM-500M-Instruct-GGUF.Q4_K_M.gguf"
MODEL_PATH = f"./{MODEL_NAME}"

# Download the model from the Hub if it's not present.
# Import is deliberately deferred so huggingface_hub is only required on the
# first run, when the file actually needs to be fetched.
# NOTE(review): `local_dir_use_symlinks` looks deprecated in recent
# huggingface_hub releases — confirm against the installed version.
if not os.path.exists(MODEL_PATH):
    from huggingface_hub import hf_hub_download
    hf_hub_download(
        repo_id="ggml-org/SmolVLM-500M-Instruct-GGUF",
        filename=MODEL_NAME,
        local_dir=".",
        local_dir_use_symlinks=False
    )

# Load the Llama model at import time. On failure, `llm` is set to None so
# the app still starts; endpoints check for this and report the error.
try:
    llm = Llama(model_path=MODEL_PATH, n_ctx=2048, verbose=False)
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None

class InferenceRequest(BaseModel):
    """Request body for /generate: the raw prompt text to complete."""
    prompt: str

@app.post("/generate")
def generate_text(request: InferenceRequest):
    """Run a text completion for the given prompt.

    Returns:
        {"text": <completion>} on success.

    Raises:
        HTTPException(500): if the model failed to load at startup or the
            completion call itself raised.
    """
    # BUG FIX: `return {...}, 500` does not set the HTTP status in FastAPI —
    # the tuple is serialized as a JSON array with status 200. Errors must be
    # signaled by raising HTTPException instead.
    if llm is None:
        raise HTTPException(status_code=500, detail="Model not loaded")

    try:
        output = llm.create_completion(
            prompt=request.prompt,
            max_tokens=256,
            # Stop at the chat end-of-turn / end-of-sequence markers.
            stop=["<|im_end|>", "</s>"],
            temperature=0.7,
        )
    except Exception as e:
        # Surface inference failures as a proper 500, preserving the cause.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"text": output["choices"][0]["text"].strip()}

@app.get("/")
def health_check():
    """Liveness probe: reports service status and whether the model loaded."""
    model_ready = llm is not None
    return {"status": "ok", "model_loaded": model_ready}