ArturG9 committed on
Commit
ad5fff5
·
verified ·
1 Parent(s): fa8b80f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -13
app.py CHANGED
@@ -12,15 +12,11 @@ from langchain.chains.combine_documents import create_stuff_documents_chain
12
  from langchain_community.chat_message_histories.streamlit import StreamlitChatMessageHistory
13
  from langchain.prompts import PromptTemplate
14
  from langchain.vectorstores import Chroma
15
- from utills import load_txt_documents, split_docs, load_uploaded_documents, retriever_from_chroma
16
  from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
17
  from langchain_community.document_loaders.directory import DirectoryLoader
18
- from HTML_templates import css, bot_template, user_template
19
  from langchain_core.output_parsers import StrOutputParser
20
  from langchain_core.runnables import RunnablePassthrough
21
- from langchain import hub
22
- from langchain.retrievers import ContextualCompressionRetriever
23
- from langchain.retrievers.document_compressors import LLMChainExtractor
24
 
25
  lang_api_key = os.getenv("lang_api_key")
26
 
@@ -53,7 +49,6 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
53
  vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
54
 
55
  else:
56
- # Load documents from the specified data path
57
  st.write("Vector store doesnt exist and will be created now")
58
  loader = DirectoryLoader('./data/', glob="./*.txt", loader_cls=TextLoader)
59
  docs = loader.load()
@@ -61,13 +56,12 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
61
 
62
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
63
  chunk_size=chunk_size, chunk_overlap=chunk_overlap,
64
- separators=["\n \n \n", "\n \n", "\n1" , "(?<=\. )", " ", ""]
 
65
  )
66
  split_docs = text_splitter.split_documents(docs)
67
 
68
 
69
-
70
- # Create the vectorstore
71
  vectorstore = Chroma.from_documents(
72
  documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
73
  )
@@ -97,7 +91,7 @@ def main():
97
  st.header("Chat with multiple Lithuanian Law Documents:" ":books:")
98
 
99
  st.markdown("Hi, I am Birute (Powered by qwen2-0_5b model), chat assistant, based on republic of Lithuania law documents. You can choose below information retrieval type and how many documents you want to be retrieved.")
100
- st.markdown("Available Documents: LR_Civil_Code_2022, LR_Constitution_2022, LR_Criminal_Code_2018, LR_Criminal_Procedure_code_2022,LR_Labour_code_2010. P.S it's a shame that there are no newest documents translations... ")
101
 
102
  if "messages" not in st.session_state:
103
  st.session_state["messages"] = [
@@ -106,9 +100,9 @@ def main():
106
 
107
 
108
  search_type = st.selectbox(
109
- "Choose search type. Options are [Max marginal relevance search (similarity) , Similarity search (similarity). Default value (mmr)]",
110
  options=["mmr", "similarity"],
111
- index=1 # Default to "mmr"
112
  )
113
 
114
  k = st.select_slider(
@@ -116,7 +110,7 @@ def main():
116
  options=list(range(2, 16)),
117
  value=4
118
  )
119
- retriever = create_retriever_from_chroma(vectorstore_path="docs/chroma/", search_type=search_type, k=k, chunk_size=200, chunk_overlap=30)
120
 
121
 
122
 
 
12
  from langchain_community.chat_message_histories.streamlit import StreamlitChatMessageHistory
13
  from langchain.prompts import PromptTemplate
14
  from langchain.vectorstores import Chroma
 
15
  from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
16
  from langchain_community.document_loaders.directory import DirectoryLoader
 
17
  from langchain_core.output_parsers import StrOutputParser
18
  from langchain_core.runnables import RunnablePassthrough
19
+
 
 
20
 
21
  lang_api_key = os.getenv("lang_api_key")
22
 
 
49
  vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
50
 
51
  else:
 
52
  st.write("Vector store doesnt exist and will be created now")
53
  loader = DirectoryLoader('./data/', glob="./*.txt", loader_cls=TextLoader)
54
  docs = loader.load()
 
56
 
57
  text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
58
  chunk_size=chunk_size, chunk_overlap=chunk_overlap,
59
+ separators=["\n\n \n\n","\n\n\n", "\n\n", r"In \[[0-9]+\]", r"\n+", r"\s+"],
60
+ is_separator_regex = True
61
  )
62
  split_docs = text_splitter.split_documents(docs)
63
 
64
 
 
 
65
  vectorstore = Chroma.from_documents(
66
  documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
67
  )
 
91
  st.header("Chat with multiple Lithuanian Law Documents:" ":books:")
92
 
93
  st.markdown("Hi, I am Birute (Powered by qwen2-0_5b model), chat assistant, based on republic of Lithuania law documents. You can choose below information retrieval type and how many documents you want to be retrieved.")
94
+ st.markdown("Available Documents: LR_Civil_Code_2022, LR_Constitution_2022, LR_Criminal_Code_2018, LR_Criminal_Procedure_code_2022,LR_Labour_code_2010. P.S it's a shame that there are no newest documents translations into English... ")
95
 
96
  if "messages" not in st.session_state:
97
  st.session_state["messages"] = [
 
100
 
101
 
102
  search_type = st.selectbox(
103
+ "Choose search type. Options are [Max marginal relevance search (similarity) , Similarity search (similarity). Default value (similarity)]",
104
  options=["mmr", "similarity"],
105
+ index=1
106
  )
107
 
108
  k = st.select_slider(
 
110
  options=list(range(2, 16)),
111
  value=4
112
  )
113
+ retriever = create_retriever_from_chroma(vectorstore_path="docs/chroma/", search_type=search_type, k=k, chunk_size=350, chunk_overlap=30)
114
 
115
 
116