Spaces:

ajalisatgi
/

Gradio

Sleeping

App Files Files Community

ajalisatgi committed on Feb 18

Commit

73ab43d

verified ·

1 Parent(s): 636e240

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -35

app.py CHANGED Viewed

@@ -16,22 +16,31 @@ import os
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Download all required NLTK data upfront
 nltk.download('punkt')
 nltk.download('punkt_tab')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('stopwords')
 # Initialize OpenAI API key
-openai.api_key = '<REDACTED>'  # Replace with your API key
-# Load the ragbench datasets
 ragbench = {}
-for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']:
-    ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
-    logger.info(f"Loaded {dataset}")
-# Initialize with a stronger model
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
@@ -65,41 +74,28 @@ def chunk_documents_semantic(documents, max_chunk_size=500):
                 chunks.append(current_chunk.strip())
     return chunks
-# Process documents in batches
-batch_size = 1000
 documents = []
-total_processed = 0
-for dataset_name in tqdm(ragbench.keys(), desc="Processing datasets"):
-    for split in ragbench[dataset_name].keys():
-        original_documents = ragbench[dataset_name][split]['documents']
-        for i in range(0, len(original_documents), batch_size):
-            batch = original_documents[i:i + batch_size]
-            chunked_documents = chunk_documents_semantic(batch)
-            documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
-            if len(documents) >= batch_size:
-                vectordb = Chroma.from_documents(
-                    documents=documents,
-                    embedding=embedding_model,
-                    persist_directory=f'./docs/chroma_{total_processed}'
-                )
-                vectordb.persist()
-                total_processed += len(documents)
-                documents = []
-# Final vector store
-final_vectordb = Chroma(
-    persist_directory='./docs/chroma_final/',
-    embedding_function=embedding_model
 )
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
-        relevant_docs = final_vectordb.max_marginal_relevance_search(
             query,
             k=5,
             fetch_k=10
@@ -123,7 +119,7 @@ def process_query(query, dataset_choice):
         logger.error(f"Error processing query: {str(e)}")
         return f"Error: {str(e)}"
-# Create Gradio interface with dataset selection
 demo = gr.Interface(
     fn=process_query,
     inputs=[

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Download NLTK data
 nltk.download('punkt')
 nltk.download('punkt_tab')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('stopwords')
 # Initialize OpenAI API key
+openai.api_key = os.environ.get("OPENAI_API_KEY")  # read from the environment (e.g. a Space secret); never commit keys to source
+# Load selected datasets
+logger.info("Starting dataset loading...")
 ragbench = {}
+datasets_to_load = ['covidqa', 'hotpotqa', 'pubmedqa']
+for dataset in datasets_to_load:
+    try:
+        ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset, split='train')
+        logger.info(f"Successfully loaded {dataset}")
+    except Exception as e:
+        logger.error(f"Failed to load {dataset}: {e}")
+        continue
+print(f"Loaded {len(ragbench)} datasets successfully")
+# Initialize embedding model
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
                 chunks.append(current_chunk.strip())
     return chunks
+# Process documents
 documents = []
+for dataset_name, dataset in ragbench.items():
+    logger.info(f"Processing {dataset_name}")
+    original_documents = dataset['documents']
+    chunked_documents = chunk_documents_semantic(original_documents)
+    documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+    logger.info(f"Processed {len(chunked_documents)} chunks from {dataset_name}")
+# Initialize vectordb
+vectordb = Chroma.from_documents(
+    documents=documents,
+    embedding=embedding_model,
+    persist_directory='./docs/chroma/'
 )
+vectordb.persist()
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
+        relevant_docs = vectordb.max_marginal_relevance_search(
             query,
             k=5,
             fetch_k=10
         logger.error(f"Error processing query: {str(e)}")
         return f"Error: {str(e)}"
+# Create Gradio interface
 demo = gr.Interface(
     fn=process_query,
     inputs=[