import gradio as gr

from langchain_community.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain.retrievers import TFIDFRetriever
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

from unsloth import FastLanguageModel
import torch
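# Load the fine-tuned Finbro Llama-3 8B instruct model with Unsloth in 4-bit,
# with a 2048-token context window; dtype=None lets Unsloth auto-select float16/bfloat16.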
					
						
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
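# Switch Unsloth to inference mode and wrap the model in a standard transformers
# text-generation pipeline so LangChain can drive it.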
					
						
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFaceEndpoint

FastLanguageModel.for_inference(model)

# Generation settings are attached to the pipeline itself (with a prebuilt pipeline,
# model_kwargs are not forwarded to generation); temperature and repetition_penalty
# only take effect when do_sample=True.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.75,
    repetition_penalty=1.15,
)

gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,
)
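# Quick smoke test (illustrative prompt, not part of the original script):
#   print(gpu_llm.invoke("What is a stock index?"))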
					
						
from langchain_core.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage, AIMessage

alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{question}

### Input:


### Response:
"""

prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))
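# Example call (illustrative): the chain fills {question} and returns a dict whose
# "text" key holds the generated answer.
#   llm_chain_model.invoke({"question": "What is forex?"})["text"]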
					
						
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
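# few_shot_prompt expands each example into a human/ai message pair, so the final
# chat prompt shows the model a worked Q&A exchange before the live user question.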
					
						
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

{chat_history}

### Instruction:
{question}

### Input:


### Response:
"""

prompt = PromptTemplate(
    input_variables=["chat_history", "question"], template=alpaca_prompt_memory
)
memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt,
    verbose=True,
    memory=memory,
)
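# ConversationBufferMemory records each (question, response) turn under the
# "chat_history" key, so every new call sees the running transcript.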
					
						
def greet(question, model_type):
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        # Render the few-shot chat transcript to a string and feed it through the
        # Alpaca-style chain as its {question} input.
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]

    print(f"out is: {response_of_llm}")
    return response_of_llm
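# Gradio UI: a text box for the question plus a dropdown that switches between
# the memory-backed chain and the stateless few-shot chain.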
					
						
demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Dropdown(
            ["With memory", "Without memory"],
            label="Memory status",
            info="Using memory makes responses slower but keeps the conversation context.",
        ),
    ],
    outputs="text",
)
demo.launch(debug=True, share=True)