Spaces: Runtime error
ffreemt committed
Commit 3431b72 · 1 Parent(s): 77e7c34
Update better separator for get_doc_chunks

app.py CHANGED
@@ -85,9 +85,8 @@ from langchain.embeddings import (
 from langchain.llms import HuggingFacePipeline, OpenAI
 from langchain.memory import ConversationBufferMemory
 from langchain.text_splitter import (
-    # CharacterTextSplitter,
     RecursiveCharacterTextSplitter,
-)
+)  # CharacterTextSplitter,
 from langchain.vectorstores import FAISS, Chroma
 from loguru import logger
 from PyPDF2 import PdfReader
@@ -134,6 +133,7 @@ CHROMA_SETTINGS = Settings(
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 MODEL_NAME = "paraphrase-multilingual-mpnet-base-v2"  # 1.11G
+# 'max_seq_length': 128
 
 # openai max 4097
 # retriever default k = 4, query length about CHUNK_SIZE
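The new "# 'max_seq_length': 128" note matches the input limit of this sentence-transformers model: anything past 128 tokens is truncated before embedding, which is worth keeping in mind when picking CHUNK_SIZE. A minimal sketch to confirm the limit (assumes the sentence-transformers package is installed; the download is the 1.11G noted above):

# Check the embedding model's token limit; longer inputs are silently truncated.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
print(model.max_seq_length)  # -> 128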
@@ -233,20 +233,25 @@ def get_pdf_text(pdf_docs):
 
 
 # def get_text_chunks(text, chunk_size=None, chunk_overlap=None):
-def get_doc_chunks(
+def get_doc_chunks(
+    doc: List[Document], chunk_size=None, chunk_overlap=None, separators=None
+) -> List[Document]:
     """Generate doc chunks."""
     if chunk_size is None:
         chunk_size = ns.chunk_size
     if chunk_overlap is None:
         chunk_overlap = ns.chunk_overlap
+    if separators is None:
+        # \u3000 is a space
+        separators = ["\n\n"] + list("\n。.!!??”】],, \u3000") + [""]
 
     # text_splitter = CharacterTextSplitter(
     text_splitter = RecursiveCharacterTextSplitter(
         # separator="\n",
-        separators=
+        separators=separators,
         chunk_size=chunk_size,
         chunk_overlap=chunk_overlap,
-        length_function=len
+        length_function=len,
     )
     # chunks = text_splitter.split_text(text)
     chunks = text_splitter.split_documents(doc)
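With the new default, the recursive splitter prefers paragraph breaks first, then CJK and Latin sentence punctuation (。 . ! ! ? ?), closing quotes and brackets, commas and spaces (including the full-width space U+3000), and only falls back to a hard character cut (""). A rough, self-contained sketch of the effect on mixed Chinese/English text (the sample text and tiny chunk_size are illustrative only):

# Demo of the commit's separator preference; a small chunk_size forces splits.
from langchain.text_splitter import RecursiveCharacterTextSplitter

separators = ["\n\n"] + list("\n。.!!??”】],, \u3000") + [""]
splitter = RecursiveCharacterTextSplitter(
    separators=separators,
    chunk_size=40,        # tiny, so the demo visibly splits
    chunk_overlap=0,
    length_function=len,  # plain character count, as in the commit
)
text = "第一句话。第二句话稍微长一点。Third sentence, in English. Fourth one!"
for chunk in splitter.split_text(text):
    print(repr(chunk))
# Cuts land after 。 ! ? etc. rather than mid-sentence.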
@@ -260,7 +265,7 @@ def get_vectorstore(
     vectorstore=None,
     model_name=None,
     persist=True,
-    persist_directory=None
+    persist_directory=None,
 ):
     """Gen vectorstore."""
     # embedding = OpenAIEmbeddings()
@@ -300,7 +305,9 @@ def get_vectorstore(
         )
     else:
         # vectorstore = Chroma.from_texts(texts=text_chunks, embedding=embedding)
-        vectorstore = Chroma.from_documents(
+        vectorstore = Chroma.from_documents(
+            documents=doc_chunks, embedding=embedding
+        )
 
         logger.info(
             # "Done vectorstore Chroma.from_texts(texts=text_chunks, embedding=embedding)"
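For context, the rewritten call feeds the chunks from get_doc_chunks straight into Chroma, and the persist_directory parameter added to get_vectorstore above would be forwarded here to keep the index on disk. A hedged, self-contained sketch (the embeddings class and the "chroma_db" path are illustrative assumptions, not taken from this diff):

# Sketch: embed doc chunks into a Chroma store and query it.
from langchain.docstore.document import Document
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

embedding = SentenceTransformerEmbeddings(model_name="paraphrase-multilingual-mpnet-base-v2")
doc_chunks = [Document(page_content="第一句话。"), Document(page_content="A second chunk.")]
vectorstore = Chroma.from_documents(
    documents=doc_chunks,
    embedding=embedding,
    # persist_directory="chroma_db",  # illustrative: pass this to persist the index
)
hits = vectorstore.similarity_search("query", k=4)  # retriever default k = 4, per the comment above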
@@ -454,7 +461,8 @@ def embed_files(progress=gr.Progress()):
 
     # ns.qa = load_qa()
 
-
+    # client=None to make pyright happy
+    llm = OpenAI(temperature=0, max_tokens=1024, client=None)
     retriever = ns.db.as_retriever()
     ns.qa = RetrievalQA.from_chain_type(
         llm=llm,
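The two added lines define the llm that the existing RetrievalQA.from_chain_type call just below consumes (client=None only placates pyright; the LangChain wrapper builds its own OpenAI client). A sketch of the assembled retrieval pipeline, continuing from the Chroma sketch above; chain_type="stuff" is an assumption, since that argument sits outside this hunk:

# Sketch: wire the vector store, retriever, and LLM into a QA chain.
# Requires OPENAI_API_KEY in the environment.
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

llm = OpenAI(temperature=0, max_tokens=1024, client=None)  # client=None to quiet pyright
retriever = vectorstore.as_retriever()  # the app uses ns.db.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # assumption: the actual chain_type is not shown in this hunk
    retriever=retriever,
)
print(qa.run("What is the document about?"))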
@@ -690,6 +698,7 @@ def load_qa(device=None, model_name: str = MODEL_NAME):
     # _ = """
     # llm = gen_local_llm()  # "TheBloke/vicuna-7B-1.1-HF" 12G?
 
+    # model=gpt-3.5-turbo-16k
     llm = OpenAI(temperature=0, max_tokens=1024)  # type: ignore
     qa = RetrievalQA.from_chain_type(
         llm=llm,
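The new "# model=gpt-3.5-turbo-16k" note appears to flag a candidate swap for the default completion model: an earlier comment in the file notes the roughly 4097-token OpenAI limit, and the 16k variant quadruples the context window. Since gpt-3.5-turbo-16k is a chat model, in LangChain of this vintage the swap would presumably go through ChatOpenAI rather than the completion-style OpenAI wrapper:

# Hypothetical swap suggested by the comment; not part of this commit.
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, max_tokens=1024)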