from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate
from langserve import add_routes

app = FastAPI(
    title="LangChain Server",
    version="1.0",
    description="A simple API server using LangChain's Runnable interfaces",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)

template = """Give a very concise one-word answer to the question.
Question: {question}
Answer: """

prompt = PromptTemplate.from_template(template)

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

n_gpu_layers = -1  # Number of layers to offload to the GPU; the rest stay on the CPU. Use -1 to move all layers to the GPU.
n_batch = 512  # Should be between 1 and n_ctx; consider the amount of VRAM on your GPU.

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="Meta-Llama-3-8B-Instruct-v2.Q4_K_S.gguf",
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

add_routes(
    app,
    prompt | llm,
    path="/test",
)

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app)

# llm_chain = prompt | llm
# question = "Hi"
# x = llm_chain.invoke({"question": question})
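
# Client-side usage sketch: assuming the server above is running on uvicorn's
# default address (http://localhost:8000), the chain mounted at /test can be
# queried from another process with langserve's RemoteRunnable:
#
#   from langserve import RemoteRunnable
#
#   remote_chain = RemoteRunnable("http://localhost:8000/test/")
#   print(remote_chain.invoke({"question": "What is the capital of France?"}))
#
# The same route can also be called directly over HTTP, e.g.:
#
#   curl -X POST http://localhost:8000/test/invoke \
#        -H "Content-Type: application/json" \
#        -d '{"input": {"question": "What is the capital of France?"}}'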