import os

import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.memory import ConversationSummaryBufferMemory
from langchain_groq import ChatGroq
tokenizer = tiktoken.get_encoding("cl100k_base")
FILE_NAMES = os.listdir("data")
# Legacy system template, kept for reference:
# system_template = """You are the LIC Customer Service Chatbot.
# Use the following pieces of context to answer the user's question.
# If you don't know the answer, just say that you don't know; don't try to make up an answer.
# ----------------
# {context}"""
SYSTEM_PROMPT = """
You are an insurance policy expert bot. You have access to different policies, which can be found in the company list.
Here is the list of companies providing these policies:
Your tasks when a user asks a question:
1. Help them familiarize themselves with the policy terms and conditions.
2. Clear up any doubts they may have about the policy.
3. Compare policies offered by different companies.
Your responses should be clear, concise, and grounded in the given context. Give a detailed answer when needed. If you can't find the answer in the context, just say 'I don't know'. Do not make up answers.
context: {context}
previous message summary: {previous_message_summary}
"""
human_template = "{question}"
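
# A sketch of how these two templates could be wired together; ChatPromptTemplate
# is standard LangChain, but this assembly is an assumption, not part of this file:
#   from langchain_core.prompts import ChatPromptTemplate
#   prompt = ChatPromptTemplate.from_messages(
#       [("system", SYSTEM_PROMPT), ("human", human_template)]
#   )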
NLP_MODEL_NAME = "llama3-70b-8192"
REASONING_MODEL_NAME = "mixtral-8x7b-32768"
REASONING_MODEL_TEMPERATURE = 0
NLP_MODEL_TEMPERATURE = 0
NLP_MODEL_MAX_TOKENS = 5400
VECTOR_MAX_TOKENS = 100          # max tokens per chunk stored in the vector store
VECTORS_TOKEN_OVERLAP_SIZE = 20  # token overlap between adjacent chunks
NUMBER_OF_VECTORS_FOR_RAG = 7    # chunks retrieved per query
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in `text`; used as the length function for chunking."""
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)
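
# Example (hedged): tiktoken_len counts tokens, not characters, so under
# cl100k_base a typical English sentence comes out to roughly one token per
# short word, e.g. tiktoken_len("hello world") is 2.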
def get_vectorstore():
    """Embed every document in ./data with BGE embeddings and load them into an in-memory Chroma store."""
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )

    all_splits = []
    for file_name in FILE_NAMES:
        if file_name.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join("data", file_name))
            # Join all pages; taking only loader.load()[0] would drop everything after page 1.
            data = "\n".join(page.page_content for page in loader.load())
        else:
            with open(os.path.join("data", file_name), "r") as f:
                data = f.read()
        all_splits.extend(text_splitter.split_text(data))

    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore
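
# Example usage (a sketch; the query string is hypothetical):
#   store = get_vectorstore()
#   docs = store.similarity_search("What does the policy cover?", k=NUMBER_OF_VECTORS_FOR_RAG)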
# Read the Groq API key from the environment rather than hardcoding it in source.
chat = ChatGroq(
    temperature=0,
    groq_api_key=os.environ["GROQ_API_KEY"],
    model_name="llama3-8b-8192",
    streaming=True,
)
rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
my_vector_store = get_vectorstore()
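
# A minimal end-to-end RAG turn, sketched under assumptions: `answer_question`
# is a hypothetical helper (not part of the original file) that retrieves
# context, fills SYSTEM_PROMPT, calls the Groq model, and updates the memory.
def answer_question(question: str) -> str:
    # Retrieve the top-k most similar chunks for the question.
    docs = my_vector_store.similarity_search(question, k=NUMBER_OF_VECTORS_FOR_RAG)
    context = "\n\n".join(doc.page_content for doc in docs)
    # Pull the running conversation summary from the buffer memory.
    summary = rag_memory.load_memory_variables({}).get("history", "")
    messages = [
        ("system", SYSTEM_PROMPT.format(context=context, previous_message_summary=summary)),
        ("human", question),
    ]
    response = chat.invoke(messages)
    # Record the turn so the summary stays current for the next question.
    rag_memory.save_context({"input": question}, {"output": response.content})
    return response.content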