# fury-bot / main.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langserve import add_routes

app = FastAPI(
    title="LangChain Server",
    version="1.0",
    description="A simple API server using LangChain's Runnable interfaces",
)
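
# CORS is left wide open here (any origin, method, and header are allowed).
# That is convenient for local development, but the allowed origins should be
# tightened before exposing this server publicly.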
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_methods=['*'],
    allow_headers=['*'],
    allow_credentials=True,
)
template = """Give a very concise one word answer to question.
Question: {question}
Answer:
"""
prompt = PromptTemplate.from_template(template)
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_gpu_layers = -1  # Number of model layers to offload to the GPU (-1 offloads all); the rest run on the CPU.
n_batch = 512  # Should be between 1 and n_ctx; size it according to the amount of VRAM in your GPU.
# Make sure the model path is correct for your system!

llm = LlamaCpp(
    model_path="Meta-Llama-3-8B-Instruct-v2.Q4_K_S.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

add_routes(
    app,
    prompt | llm,
    path='/test',
)
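
# LangServe mounts the chain under several sub-routes of /test (e.g. /test/invoke,
# /test/stream, /test/playground). A minimal sketch of an invoke call, assuming the
# server is running on the default uvicorn address http://127.0.0.1:8000 and using a
# made-up example question:
#
#   curl -X POST http://127.0.0.1:8000/test/invoke \
#        -H "Content-Type: application/json" \
#        -d '{"input": {"question": "What is the capital of France?"}}'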

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app)
# Direct, in-process invocation of the chain (useful for quick local testing
# without starting the server):
# llm_chain = prompt | llm
# question = "Hi"
# x = llm_chain.invoke({"question": question})
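
# Client-side usage sketch, assuming the server above is running locally and the
# langserve client is installed; the question string is only an example:
#
#   from langserve import RemoteRunnable
#
#   chain = RemoteRunnable("http://127.0.0.1:8000/test")
#   print(chain.invoke({"question": "What is the capital of France?"}))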