Update app.py
app.py CHANGED

@@ -9,6 +9,8 @@ from datasets import load_dataset
 from nltk.tokenize import sent_tokenize
 import nltk
 from langchain.docstore.document import Document
+from tqdm import tqdm
+import os
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
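
Note: sent_tokenize relies on NLTK's punkt tokenizer data being present at runtime. If the Space does not already fetch it elsewhere, a small guard like the sketch below (an assumption, not part of this commit) avoids a LookupError on first use:

import nltk

# Hypothetical guard: download the punkt sentence tokenizer once, if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')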

@@ -35,7 +37,6 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
 embedding_model.client.to(device)
 
-# Chunking function
 def chunk_documents_semantic(documents, max_chunk_size=500):
     chunks = []
     for doc in documents:
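
For context, the body of chunk_documents_semantic is untouched by this commit apart from the deleted comment. A minimal sketch of the sentence-packing pattern the visible fragments suggest (illustrative only, not the Space's exact code) could look like:

from nltk.tokenize import sent_tokenize

def chunk_text_by_sentences(text, max_chunk_size=500):
    # Pack whole sentences into chunks of at most ~max_chunk_size characters.
    chunks, current_chunk = [], ""
    for sentence in sent_tokenize(text):
        if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks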

@@ -64,27 +65,41 @@ def chunk_documents_semantic(documents, max_chunk_size=500):
         chunks.append(current_chunk.strip())
     return chunks
 
-# Process documents
+# Process documents in batches
+batch_size = 1000
 documents = []
-for dataset_name in ragbench.keys():
+total_processed = 0
+
+for dataset_name in tqdm(ragbench.keys(), desc="Processing datasets"):
     for split in ragbench[dataset_name].keys():
         original_documents = ragbench[dataset_name][split]['documents']
-        chunked_documents = chunk_documents_semantic(original_documents)
-        documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+        for i in range(0, len(original_documents), batch_size):
+            batch = original_documents[i:i + batch_size]
+            chunked_documents = chunk_documents_semantic(batch)
+            documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+            if len(documents) >= batch_size:
+                vectordb = Chroma.from_documents(
+                    documents=documents,
+                    embedding=embedding_model,
+                    persist_directory=f'./docs/chroma_{total_processed}'
+                )
+                vectordb.persist()
+                total_processed += len(documents)
+                documents = []
 
-# Create vector store
-vectordb = Chroma.from_documents(
-    documents=documents,
-    embedding=embedding_model,
-    persist_directory='./docs/chroma/'
+# Final vector store
+final_vectordb = Chroma(
+    persist_directory='./docs/chroma_final/',
+    embedding_function=embedding_model
 )
-vectordb.persist()
 
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
 
-        relevant_docs = vectordb.max_marginal_relevance_search(
+        relevant_docs = final_vectordb.max_marginal_relevance_search(
             query,
             k=5,
             fetch_k=10
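
One caveat with the batched version as committed: each flush persists to a fresh ./docs/chroma_{total_processed} directory, while final_vectordb opens ./docs/chroma_final/, a directory no batch is ever written to, so the MMR search in process_query would run against an empty store. A sketch of one way to keep everything in a single persisted collection (an assumption about the intent, using only standard LangChain Chroma calls) is:

# Open (or create) one persisted store and append every batch to it.
vectordb = Chroma(
    persist_directory='./docs/chroma_final/',
    embedding_function=embedding_model,
)
for i in range(0, len(original_documents), batch_size):
    batch = original_documents[i:i + batch_size]
    chunks = chunk_documents_semantic(batch)
    vectordb.add_documents([Document(page_content=c) for c in chunks])
vectordb.persist()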
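
For reference, max_marginal_relevance_search first fetches fetch_k candidates by embedding similarity and then picks k of them for diversity, so the call above returns 5 documents drawn from the 10 nearest neighbours. A usage sketch (the query string is made up):

# Retrieve 5 diverse documents out of the 10 nearest neighbours.
docs = final_vectordb.max_marginal_relevance_search(
    "What are the side effects of ibuprofen?",  # hypothetical query
    k=5,
    fetch_k=10,
)
for doc in docs:
    print(doc.page_content[:80])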