Update app.py
Browse files
app.py
CHANGED
@@ -12,15 +12,11 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
|
|
12 |
from langchain_community.chat_message_histories.streamlit import StreamlitChatMessageHistory
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
from langchain.vectorstores import Chroma
|
15 |
-
from utills import load_txt_documents, split_docs, load_uploaded_documents, retriever_from_chroma
|
16 |
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
|
17 |
from langchain_community.document_loaders.directory import DirectoryLoader
|
18 |
-
from HTML_templates import css, bot_template, user_template
|
19 |
from langchain_core.output_parsers import StrOutputParser
|
20 |
from langchain_core.runnables import RunnablePassthrough
|
21 |
-
|
22 |
-
from langchain.retrievers import ContextualCompressionRetriever
|
23 |
-
from langchain.retrievers.document_compressors import LLMChainExtractor
|
24 |
|
25 |
lang_api_key = os.getenv("lang_api_key")
|
26 |
|
@@ -53,7 +49,6 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
|
|
53 |
vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
|
54 |
|
55 |
else:
|
56 |
-
# Load documents from the specified data path
|
57 |
st.write("Vector store doesnt exist and will be created now")
|
58 |
loader = DirectoryLoader('./data/', glob="./*.txt", loader_cls=TextLoader)
|
59 |
docs = loader.load()
|
@@ -61,13 +56,12 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
|
|
61 |
|
62 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
63 |
chunk_size=chunk_size, chunk_overlap=chunk_overlap,
|
64 |
-
separators=["\n
|
|
|
65 |
)
|
66 |
split_docs = text_splitter.split_documents(docs)
|
67 |
|
68 |
|
69 |
-
|
70 |
-
# Create the vectorstore
|
71 |
vectorstore = Chroma.from_documents(
|
72 |
documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
|
73 |
)
|
@@ -97,7 +91,7 @@ def main():
|
|
97 |
st.header("Chat with multiple Lithuanian Law Documents:" ":books:")
|
98 |
|
99 |
st.markdown("Hi, I am Birute (Powered by qwen2-0_5b model), chat assistant, based on republic of Lithuania law documents. You can choose below information retrieval type and how many documents you want to be retrieved.")
|
100 |
-
st.markdown("Available Documents: LR_Civil_Code_2022, LR_Constitution_2022, LR_Criminal_Code_2018, LR_Criminal_Procedure_code_2022,LR_Labour_code_2010. P.S it's a shame that there are no newest documents translations... ")
|
101 |
|
102 |
if "messages" not in st.session_state:
|
103 |
st.session_state["messages"] = [
|
@@ -106,9 +100,9 @@ def main():
|
|
106 |
|
107 |
|
108 |
search_type = st.selectbox(
|
109 |
-
"Choose search type. Options are [Max marginal relevance search (similarity) , Similarity search (similarity). Default value (
|
110 |
options=["mmr", "similarity"],
|
111 |
-
index=1
|
112 |
)
|
113 |
|
114 |
k = st.select_slider(
|
@@ -116,7 +110,7 @@ def main():
|
|
116 |
options=list(range(2, 16)),
|
117 |
value=4
|
118 |
)
|
119 |
-
retriever = create_retriever_from_chroma(vectorstore_path="docs/chroma/", search_type=search_type, k=k, chunk_size=
|
120 |
|
121 |
|
122 |
|
|
|
12 |
from langchain_community.chat_message_histories.streamlit import StreamlitChatMessageHistory
|
13 |
from langchain.prompts import PromptTemplate
|
14 |
from langchain.vectorstores import Chroma
|
|
|
15 |
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
|
16 |
from langchain_community.document_loaders.directory import DirectoryLoader
|
|
|
17 |
from langchain_core.output_parsers import StrOutputParser
|
18 |
from langchain_core.runnables import RunnablePassthrough
|
19 |
+
|
|
|
|
|
20 |
|
21 |
lang_api_key = os.getenv("lang_api_key")
|
22 |
|
|
|
49 |
vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
|
50 |
|
51 |
else:
|
|
|
52 |
st.write("Vector store doesnt exist and will be created now")
|
53 |
loader = DirectoryLoader('./data/', glob="./*.txt", loader_cls=TextLoader)
|
54 |
docs = loader.load()
|
|
|
56 |
|
57 |
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
58 |
chunk_size=chunk_size, chunk_overlap=chunk_overlap,
|
59 |
+
separators=["\n\n \n\n","\n\n\n", "\n\n", r"In \[[0-9]+\]", r"\n+", r"\s+"],
|
60 |
+
is_separator_regex = True
|
61 |
)
|
62 |
split_docs = text_splitter.split_documents(docs)
|
63 |
|
64 |
|
|
|
|
|
65 |
vectorstore = Chroma.from_documents(
|
66 |
documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
|
67 |
)
|
|
|
91 |
st.header("Chat with multiple Lithuanian Law Documents:" ":books:")
|
92 |
|
93 |
st.markdown("Hi, I am Birute (Powered by qwen2-0_5b model), chat assistant, based on republic of Lithuania law documents. You can choose below information retrieval type and how many documents you want to be retrieved.")
|
94 |
+
st.markdown("Available Documents: LR_Civil_Code_2022, LR_Constitution_2022, LR_Criminal_Code_2018, LR_Criminal_Procedure_code_2022,LR_Labour_code_2010. P.S it's a shame that there are no newest documents translations into English... ")
|
95 |
|
96 |
if "messages" not in st.session_state:
|
97 |
st.session_state["messages"] = [
|
|
|
100 |
|
101 |
|
102 |
search_type = st.selectbox(
|
103 |
+
"Choose search type. Options are [Max marginal relevance search (similarity) , Similarity search (similarity). Default value (similarity)]",
|
104 |
options=["mmr", "similarity"],
|
105 |
+
index=1
|
106 |
)
|
107 |
|
108 |
k = st.select_slider(
|
|
|
110 |
options=list(range(2, 16)),
|
111 |
value=4
|
112 |
)
|
113 |
+
retriever = create_retriever_from_chroma(vectorstore_path="docs/chroma/", search_type=search_type, k=k, chunk_size=350, chunk_overlap=30)
|
114 |
|
115 |
|
116 |
|