"""Minimal FastAPI code-completion service backed by a local GGUF llama.cpp model."""

from fastapi import FastAPI, HTTPException, Request
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

app = FastAPI()

# Fetch the quantized weights into the working directory (no-op if already cached).
hf_hub_download(
    "TheBloke/deepseek-coder-1.3b-instruct-GGUF",
    "deepseek-coder-1.3b-instruct.Q5_K_M.gguf",
    local_dir="./",
)

# NOTE(review): the original ran `os.system("ulimit -l")`, which only prints the
# locked-memory limit of a throwaway subshell — it cannot raise the limit for this
# process, so it has been dropped. `use_mlock=True` below is what actually asks
# llama.cpp to lock the model pages in RAM; if RLIMIT_MEMLOCK is too low the lock
# may silently fail — raise the limit in the OS/container config instead.
model_l = Llama(
    model_path="./deepseek-coder-1.3b-instruct.Q5_K_M.gguf",
    n_ctx=16000,       # context window (tokens)
    n_gpu_layers=0,    # CPU-only inference
    n_threads=2,
    use_mlock=True,    # keep model pages resident in RAM
)


@app.get("/check")
async def index():
    """Liveness probe."""
    return {"msg": "Hey!"}


@app.post("/api")
async def completion(request: Request):
    """Complete the code in the JSON body's "prompt" field.

    Expects a JSON object like {"prompt": "<code>"}; returns
    {"responses": <llama.cpp choices list>}. Responds 400 (instead of the
    original's unhandled-KeyError 500) when "prompt" is absent or the body
    is not a JSON object.
    """
    data = await request.json()
    try:
        prompt = data["prompt"]
    except (KeyError, TypeError):
        raise HTTPException(
            status_code=400,
            detail='JSON body must be an object with a "prompt" field',
        )
    prompt = f"Complete the following code, do not comment(return only the completed code!):\n\n{prompt}"
    res = model_l(
        prompt,
        temperature=0.7,
        echo=False,       # do not repeat the prompt in the output
        max_tokens=64,
    )
    return {"responses": res["choices"]}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)