import gradio as gr

from langchain_community.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain.retrievers import TFIDFRetriever
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

from unsloth import FastLanguageModel
import torch
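# Load the fine-tuned Finbro Llama-3 8B instruct model with Unsloth in 4-bit,
# with a 2048-token context window; dtype=None lets Unsloth auto-select float16/bfloat16.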
					
						
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Danielrahmai1991/finbro-v0.1.0-llama-3-8B-instruct-1m",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
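# Switch Unsloth to inference mode and wrap the model in a standard transformers
# text-generation pipeline so LangChain can drive it.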
					
						
from langchain_huggingface.llms import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFaceEndpoint

FastLanguageModel.for_inference(model)

# Generation settings are attached to the pipeline itself (with a prebuilt pipeline,
# model_kwargs are not forwarded to generation); temperature and repetition_penalty
# only take effect when do_sample=True.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.75,
    repetition_penalty=1.15,
)

gpu_llm = HuggingFacePipeline(
    pipeline=pipe,
    batch_size=5,
)
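# Quick smoke test (illustrative prompt, not part of the original script):
#   print(gpu_llm.invoke("What is a stock index?"))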
					
						
from langchain_core.prompts import PromptTemplate
from langchain.schema import HumanMessage, SystemMessage, AIMessage

alpaca_prompt_simple = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{question}

### Input:


### Response:
"""

prompt = PromptTemplate.from_template(alpaca_prompt_simple)
llm_chain_model = LLMChain(prompt=prompt, llm=gpu_llm.bind(skip_prompt=True))
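# Example call (illustrative): the chain fills {question} and returns a dict whose
# "text" key holds the generated answer.
#   llm_chain_model.invoke({"question": "What is forex?"})["text"]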
					
						
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate

examples = [
    {
        "query": "what is forex?",
        "answer": "Forex is an abbreviation for foreign exchange. It involves trading currencies from different countries with one another at the current market price.",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{query}"),
        ("ai", "{answer}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)
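# few_shot_prompt expands each example into a human/ai message pair, so the final
# chat prompt shows the model a worked Q&A exchange before the live user question.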
					
						
alpaca_prompt_memory = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

{chat_history}

### Instruction:
{question}

### Input:


### Response:
"""

prompt = PromptTemplate(
    input_variables=["chat_history", "question"], template=alpaca_prompt_memory
)
memory = ConversationBufferMemory(memory_key="chat_history")

llm_chain_memory = LLMChain(
    llm=gpu_llm.bind(skip_prompt=True),
    prompt=prompt,
    verbose=True,
    memory=memory,
)
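# ConversationBufferMemory records each (question, response) turn under the
# "chat_history" key, so every new call sees the running transcript.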
					
						
def greet(question, model_type):
    print(f"question is {question}")
    if model_type == "With memory":
        print("With memory")
        response_of_llm = llm_chain_memory.predict(question=question)
    else:
        print("Without memory")
        query = question
        final_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", "You are a financial AI assistant."),
                few_shot_prompt,
                ("human", "{userInput}"),
            ]
        )
        # Render the few-shot chat transcript to a string and feed it through the
        # Alpaca-style chain as its {question} input.
        messages = final_prompt.format(userInput=query)
        ai_out = llm_chain_model.invoke(messages)
        response_of_llm = ai_out["text"]

    print(f"out is: {response_of_llm}")
    return response_of_llm
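# Gradio UI: a text box for the question plus a dropdown that switches between
# the memory-backed chain and the stateless few-shot chain.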
					
						
demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Dropdown(
            ["With memory", "Without memory"],
            label="Memory status",
            info="Using memory makes responses slower but keeps the conversation context.",
        ),
    ],
    outputs="text",
)
demo.launch(debug=True, share=True)