import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.memory import ConversationSummaryBufferMemory
from langchain_groq import ChatGroq
import os
# Tokenizer used to measure chunk sizes in tokens rather than characters.
tokenizer = tiktoken.get_encoding("cl100k_base")
FILE_NAMES = os.listdir("data")  # every file under ./data gets indexed
SYSTEM_PROMPT = """ | |
you are LIC Customer Service Chatbot. | |
Use the following pieces of context to answer the user's question. | |
If you don't know the answer, just say that you don't know, don't try to make up an answer. | |
context: {context} | |
previous message summary: {previous_message_summary} | |
""" | |
human_template = "{question}"
# Model and retrieval configuration
NLP_MODEL_NAME = "llama3-70b-8192"
REASONING_MODEL_NAME = "mixtral-8x7b-32768"
REASONING_MODEL_TEMPERATURE = 0
NLP_MODEL_TEMPERATURE = 0
NLP_MODEL_MAX_TOKENS = 5400
VECTOR_MAX_TOKENS = 100          # max tokens per chunk stored in the vector store
VECTORS_TOKEN_OVERLAP_SIZE = 20  # token overlap between consecutive chunks
NUMBER_OF_VECTORS_FOR_RAG = 7    # top-k chunks retrieved per query
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in `text`; passed to the
    splitter as its length_function so chunks are sized in tokens."""
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)
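# Illustrative note (not in the original code): token length differs from
# character length, which is why tiktoken_len rather than len is used below.
# len("LIC offers term insurance plans")          -> 31 characters
# tiktoken_len("LIC offers term insurance plans") -> a much smaller token count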
def get_vectorstore():
    """Embed every file in ./data with BGE embeddings and index the chunks in Chroma."""
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )
    # The splitter is identical for every file, so build it once outside the loop.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )
    all_splits = []
    for file_name in FILE_NAMES:
        if file_name.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join("data", file_name))
            # Join all pages; the original read only page 0 ([0].page_content),
            # silently dropping the rest of each PDF.
            data = "\n".join(page.page_content for page in loader.load())
        else:
            with open(os.path.join("data", file_name), "r") as f:
                data = f.read()
        all_splits.extend(text_splitter.split_text(data))
    return Chroma.from_texts(texts=all_splits, embedding=hf)
# Read the Groq API key from the environment instead of hardcoding a secret in source.
chat = ChatGroq(
    temperature=0,
    groq_api_key=os.environ["GROQ_API_KEY"],
    model_name="llama3-8b-8192",
    streaming=True,
)
rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
my_vector_store = get_vectorstore()
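
# The pieces above are never wired together in this file, so here is a minimal,
# hedged sketch of how they could be combined into one RAG turn. The function
# name answer_question is hypothetical (not part of the original code), and it
# assumes the default "history" memory key of ConversationSummaryBufferMemory.
def answer_question(question):
    # Retrieve the top-k most similar chunks from Chroma.
    docs = my_vector_store.similarity_search(question, k=NUMBER_OF_VECTORS_FOR_RAG)
    context = "\n\n".join(doc.page_content for doc in docs)
    # Use the running conversation summary as the previous-message context.
    summary = rag_memory.load_memory_variables({})["history"]
    messages = [
        ("system", SYSTEM_PROMPT.format(context=context, previous_message_summary=summary)),
        ("human", human_template.format(question=question)),
    ]
    response = chat.invoke(messages)
    # Persist this turn so the summary stays current for the next question.
    rag_memory.save_context({"input": question}, {"output": response.content})
    return response.content

# Example usage (assumes GROQ_API_KEY is set and ./data contains documents):
# print(answer_question("What riders are available on LIC term plans?"))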