Update app.py
app.py CHANGED

@@ -9,6 +9,8 @@ from datasets import load_dataset
 from nltk.tokenize import sent_tokenize
 import nltk
 from langchain.docstore.document import Document
+from tqdm import tqdm
+import os
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
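
Note: sent_tokenize relies on NLTK's punkt tokenizer data being present at runtime. If the Space does not already fetch it elsewhere, a small guard like the sketch below (an assumption, not part of this commit) avoids a LookupError on first use:

import nltk

# Hypothetical guard: download the punkt sentence tokenizer once, if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')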

@@ -35,7 +37,6 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
 embedding_model.client.to(device)
 
-# Chunking function
 def chunk_documents_semantic(documents, max_chunk_size=500):
     chunks = []
     for doc in documents:
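
For context, the body of chunk_documents_semantic is untouched by this commit apart from the deleted comment. A minimal sketch of the sentence-packing pattern the visible fragments suggest (illustrative only, not the Space's exact code) could look like:

from nltk.tokenize import sent_tokenize

def chunk_text_by_sentences(text, max_chunk_size=500):
    # Pack whole sentences into chunks of at most ~max_chunk_size characters.
    chunks, current_chunk = [], ""
    for sentence in sent_tokenize(text):
        if current_chunk and len(current_chunk) + len(sentence) + 1 > max_chunk_size:
            chunks.append(current_chunk.strip())
            current_chunk = ""
        current_chunk += sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks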

@@ -64,27 +65,41 @@ def chunk_documents_semantic(documents, max_chunk_size=500):
         chunks.append(current_chunk.strip())
     return chunks
 
-# Process documents
+# Process documents in batches
+batch_size = 1000
 documents = []
-for dataset_name in ragbench.keys():
+total_processed = 0
+
+for dataset_name in tqdm(ragbench.keys(), desc="Processing datasets"):
     for split in ragbench[dataset_name].keys():
         original_documents = ragbench[dataset_name][split]['documents']
-        chunked_documents = chunk_documents_semantic(original_documents)
-        documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+        for i in range(0, len(original_documents), batch_size):
+            batch = original_documents[i:i + batch_size]
+            chunked_documents = chunk_documents_semantic(batch)
+            documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+            if len(documents) >= batch_size:
+                vectordb = Chroma.from_documents(
+                    documents=documents,
+                    embedding=embedding_model,
+                    persist_directory=f'./docs/chroma_{total_processed}'
+                )
+                vectordb.persist()
+                total_processed += len(documents)
+                documents = []
 
-# Create vector store
-vectordb = Chroma.from_documents(
-    documents=documents,
-    embedding=embedding_model,
-    persist_directory='./docs/chroma/'
+# Final vector store
+final_vectordb = Chroma(
+    persist_directory='./docs/chroma_final/',
+    embedding_function=embedding_model
 )
-vectordb.persist()
 
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
 
-        relevant_docs = vectordb.max_marginal_relevance_search(
+        relevant_docs = final_vectordb.max_marginal_relevance_search(
             query,
             k=5,
             fetch_k=10
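
One caveat with the batched version as committed: each flush persists to a fresh ./docs/chroma_{total_processed} directory, while final_vectordb opens ./docs/chroma_final/, a directory no batch is ever written to, so the MMR search in process_query would run against an empty store. A sketch of one way to keep everything in a single persisted collection (an assumption about the intent, using only standard LangChain Chroma calls) is:

# Open (or create) one persisted store and append every batch to it.
vectordb = Chroma(
    persist_directory='./docs/chroma_final/',
    embedding_function=embedding_model,
)
for i in range(0, len(original_documents), batch_size):
    batch = original_documents[i:i + batch_size]
    chunks = chunk_documents_semantic(batch)
    vectordb.add_documents([Document(page_content=c) for c in chunks])
vectordb.persist()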
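
For reference, max_marginal_relevance_search first fetches fetch_k candidates by embedding similarity and then picks k of them for diversity, so the call above returns 5 documents drawn from the 10 nearest neighbours. A usage sketch (the query string is made up):

# Retrieve 5 diverse documents out of the 10 nearest neighbours.
docs = final_vectordb.max_marginal_relevance_search(
    "What are the side effects of ibuprofen?",  # hypothetical query
    k=5,
    fetch_k=10,
)
for doc in docs:
    print(doc.page_content[:80])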