ajalisatgi committed on
Commit
636e240
·
verified ·
1 Parent(s): 9ead98b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -12
app.py CHANGED
@@ -9,6 +9,8 @@ from datasets import load_dataset
9
  from nltk.tokenize import sent_tokenize
10
  import nltk
11
  from langchain.docstore.document import Document
 
 
12
 
13
  # Set up logging
14
  logging.basicConfig(level=logging.INFO)
@@ -35,7 +37,6 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
36
  embedding_model.client.to(device)
37
 
38
- # Chunking function
39
  def chunk_documents_semantic(documents, max_chunk_size=500):
40
  chunks = []
41
  for doc in documents:
@@ -64,27 +65,41 @@ def chunk_documents_semantic(documents, max_chunk_size=500):
64
  chunks.append(current_chunk.strip())
65
  return chunks
66
 
67
- # Process documents and create vectordb
 
68
  documents = []
69
- for dataset_name in ragbench.keys():
 
 
70
  for split in ragbench[dataset_name].keys():
71
  original_documents = ragbench[dataset_name][split]['documents']
72
- chunked_documents = chunk_documents_semantic(original_documents)
73
- documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- # Initialize vectordb with processed documents
76
- vectordb = Chroma.from_documents(
77
- documents=documents,
78
- embedding=embedding_model,
79
- persist_directory='./docs/chroma/'
80
  )
81
- vectordb.persist()
82
 
83
  def process_query(query, dataset_choice):
84
  try:
85
  logger.info(f"Processing query for {dataset_choice}: {query}")
86
 
87
- relevant_docs = vectordb.max_marginal_relevance_search(
88
  query,
89
  k=5,
90
  fetch_k=10
 
9
  from nltk.tokenize import sent_tokenize
10
  import nltk
11
  from langchain.docstore.document import Document
12
+ from tqdm import tqdm
13
+ import os
14
 
15
  # Set up logging
16
  logging.basicConfig(level=logging.INFO)
 
37
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
38
  embedding_model.client.to(device)
39
 
 
40
  def chunk_documents_semantic(documents, max_chunk_size=500):
41
  chunks = []
42
  for doc in documents:
 
65
  chunks.append(current_chunk.strip())
66
  return chunks
67
 
68
+ # Process documents in batches
69
+ batch_size = 1000
70
  documents = []
71
+ total_processed = 0
72
+
73
+ for dataset_name in tqdm(ragbench.keys(), desc="Processing datasets"):
74
  for split in ragbench[dataset_name].keys():
75
  original_documents = ragbench[dataset_name][split]['documents']
76
+
77
+ for i in range(0, len(original_documents), batch_size):
78
+ batch = original_documents[i:i + batch_size]
79
+ chunked_documents = chunk_documents_semantic(batch)
80
+ documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
81
+
82
+ if len(documents) >= batch_size:
83
+ vectordb = Chroma.from_documents(
84
+ documents=documents,
85
+ embedding=embedding_model,
86
+ persist_directory=f'./docs/chroma_{total_processed}'
87
+ )
88
+ vectordb.persist()
89
+ total_processed += len(documents)
90
+ documents = []
91
 
92
+ # Final vector store
93
+ final_vectordb = Chroma(
94
+ persist_directory='./docs/chroma_final/',
95
+ embedding_function=embedding_model
 
96
  )
 
97
 
98
  def process_query(query, dataset_choice):
99
  try:
100
  logger.info(f"Processing query for {dataset_choice}: {query}")
101
 
102
+ relevant_docs = final_vectordb.max_marginal_relevance_search(
103
  query,
104
  k=5,
105
  fetch_k=10