ffreemt committed
Commit 3431b72 · 1 Parent(s): 77e7c34

Update better separator for get_doc_chunks

Files changed (1)
  1. app.py +17 -8
app.py CHANGED
@@ -85,9 +85,8 @@ from langchain.embeddings import (
 from langchain.llms import HuggingFacePipeline, OpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.text_splitter import (
-    # CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
-)
+)  # CharacterTextSplitter,
 from langchain.vectorstores import FAISS, Chroma
 from loguru import logger
 from PyPDF2 import PdfReader
@@ -134,6 +133,7 @@ CHROMA_SETTINGS = Settings(
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"  # 1.11G
+# 'max_seq_length': 128
 
 # opanai max 4097
 # retriever default k = 4, query lenght about CHUNK_SIZE
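The added max_seq_length note matters: paraphrase-multilingual-mpnet-base-v2 truncates input beyond 128 tokens, so chunks much longer than that are only partially embedded. A minimal check, assuming sentence-transformers is installed:

    from sentence_transformers import SentenceTransformer

    # the model silently truncates anything past max_seq_length tokens
    model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
    print(model.max_seq_length)  # 128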
@@ -233,20 +233,25 @@ def get_pdf_text(pdf_docs):
 
 
 # def get_text_chunks(text, chunk_size=None, chunk_overlap=None):
-def get_doc_chunks(doc: Document, chunk_size=None, chunk_overlap=None) -> List[Document]:
+def get_doc_chunks(
+    doc: List[Document], chunk_size=None, chunk_overlap=None, separators=None
+) -> List[Document]:
     """Generate doc chunks."""
     if chunk_size is None:
         chunk_size = ns.chunk_size
     if chunk_overlap is None:
         chunk_overlap = ns.chunk_overlap
+    if separators is None:
+        # \u3000 is a space
+        separators = ["\n\n"] + list("\n。.!!??”】],, \u3000") + [""]
 
     # text_splitter = CharacterTextSplitter(
     text_splitter = RecursiveCharacterTextSplitter(
         # separator="\n",
-        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
+        separators=separators,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
-        length_function=len
+        length_function=len,
     )
     # chunks = text_splitter.split_text(text)
     chunks = text_splitter.split_documents(doc)
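The point of the wider separator list: RecursiveCharacterTextSplitter tries separators in order, so adding CJK sentence punctuation (。!?, full-width commas, closing quotes/brackets, and the ideographic space \u3000) lets Chinese text break at sentence boundaries instead of falling straight through to character-level splits. A small sketch of the effect; chunk_size here is illustrative, not the app's ns.chunk_size:

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    separators = ["\n\n"] + list("\n。.!!??”】],, \u3000") + [""]
    splitter = RecursiveCharacterTextSplitter(
        separators=separators,
        chunk_size=30,  # illustrative only
        chunk_overlap=0,
        length_function=len,
    )
    # chunks now end at 。/!/? instead of mid-sentence
    print(splitter.split_text("这是第一句。这是第二句!最后是英文 sentence? 结束。"))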
@@ -260,7 +265,7 @@ def get_vectorstore(
     vectorstore=None,
     model_name=None,
     persist=True,
-    persist_directory=None
+    persist_directory=None,
 ):
     """Gne vectorstore."""
     # embedding = OpenAIEmbeddings()
@@ -300,7 +305,9 @@ def get_vectorstore(
         )
     else:
         # vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embedding)
-        vectorstore = Chroma.from_documents(documents=doc_chunks, embedding=embedding)
+        vectorstore = Chroma.from_documents(
+            documents=doc_chunks, embedding=embedding
+        )
 
     logger.info(
         # "Done vectorstore Chroma.from_texts(texts=text_chunks, embedding=embedding)"
@@ -454,7 +461,8 @@ def embed_files(progress=gr.Progress()):
 
     # ns.qa = load_qa()
 
-    llm = OpenAI(temperature=0, max_tokens=1024)  # type: ignore
+    # client=None to make pyright happy
+    llm = OpenAI(temperature=0, max_tokens=1024, client=None)
     retriever = ns.db.as_retriever()
     ns.qa = RetrievalQA.from_chain_type(
         llm=llm,
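On client=None: langchain's OpenAI wrapper declares client as a pydantic field that pyright treats as required, so passing None explicitly satisfies the checker without the blanket # type: ignore. The surrounding wiring for reference, with chain_type assumed ("stuff") since the hunk cuts off before it:

    from langchain.chains import RetrievalQA
    from langchain.llms import OpenAI

    llm = OpenAI(temperature=0, max_tokens=1024, client=None)  # appease pyright
    retriever = ns.db.as_retriever()  # ns.db: the app's vectorstore
    ns.qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # assumption: not shown in this hunk
        retriever=retriever,
    )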
@@ -690,6 +698,7 @@ def load_qa(device=None, model_name: str = MODEL_NAME):
     # _ = """
     # llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
 
+    # model=gpt-3.5-turbo-16k
     llm = OpenAI(temperature=0, max_tokens=1024)  # type: ignore
     qa = RetrievalQA.from_chain_type(
         llm=llm,
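The model=gpt-3.5-turbo-16k note points at a roomier option than the "opanai max 4097" limit recorded earlier. Since it is a chat model, langchain of this vintage would route it through ChatOpenAI rather than the completion-style OpenAI class; a hedged sketch:

    from langchain.chat_models import ChatOpenAI

    # 16k-token context vs ~4k for the default completion model
    llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, max_tokens=1024)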
 