from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langserve import add_routes

app = FastAPI(
    title="LangChain Server",
    version="1.0",
    description="A simple API server using LangChain's Runnable interfaces",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)
template = """Give a very concise one word answer to question. | |
Question: {question} | |
Answer: | |
""" | |
prompt = PromptTemplate.from_template(template)

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_gpu_layers = -1  # Number of model layers to offload to the GPU; -1 offloads all layers, anything left over runs on the CPU.
n_batch = 512  # Should be between 1 and n_ctx; size it to the amount of VRAM in your GPU.
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="Meta-Llama-3-8B-Instruct-v2.Q4_K_S.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)
# Serve the prompt | llm chain as a LangServe route at /test
add_routes(
    app,
    prompt | llm,
    path="/test",
)

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app)

# Optional local smoke test of the chain without starting the server:
# llm_chain = prompt | llm
# question = "Hi"
# x = llm_chain.invoke({"question": question})
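
# A minimal client sketch (an assumption, not part of the original script): once the
# server is running, LangServe exposes the chain under the /test path, here assumed
# to be on uvicorn's default address http://127.0.0.1:8000. The example question is
# illustrative only.
#
#   from langserve import RemoteRunnable
#
#   remote_chain = RemoteRunnable("http://127.0.0.1:8000/test")
#   print(remote_chain.invoke({"question": "What is the capital of France?"}))
#
# Or call the LangServe invoke endpoint over plain HTTP:
#
#   curl -X POST http://127.0.0.1:8000/test/invoke \
#        -H "Content-Type: application/json" \
#        -d '{"input": {"question": "Hi"}}'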