Update app.py

app.py CHANGED
```diff
@@ -8,38 +8,89 @@ import logging
 from datasets import load_dataset
 from nltk.tokenize import sent_tokenize
 import nltk
+from langchain.docstore.document import Document
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Initialize OpenAI API key
+openai.api_key = 'sk-proj-...'  # Replace with your API key
+
+# Download NLTK data
+nltk.download('punkt')
+
 # Load the ragbench datasets
 ragbench = {}
 for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']:
     ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
     logger.info(f"Loaded {dataset}")
 
-# Initialize with a stronger model
+# Initialize with a stronger model
 model_name = 'sentence-transformers/all-mpnet-base-v2'
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 embedding_model = HuggingFaceEmbeddings(model_name=model_name)
 embedding_model.client.to(device)
 
+# Chunking function
+def chunk_documents_semantic(documents, max_chunk_size=500):
+    chunks = []
+    for doc in documents:
+        if isinstance(doc, list):
+            for passage in doc:
+                sentences = sent_tokenize(passage)
+                current_chunk = ""
+                for sentence in sentences:
+                    if len(current_chunk) + len(sentence) <= max_chunk_size:
+                        current_chunk += sentence + " "
+                    else:
+                        chunks.append(current_chunk.strip())
+                        current_chunk = sentence + " "
+                if current_chunk:
+                    chunks.append(current_chunk.strip())
+        else:
+            sentences = sent_tokenize(doc)
+            current_chunk = ""
+            for sentence in sentences:
+                if len(current_chunk) + len(sentence) <= max_chunk_size:
+                    current_chunk += sentence + " "
+                else:
+                    chunks.append(current_chunk.strip())
+                    current_chunk = sentence + " "
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+    return chunks
+
+# Process documents and create vectordb
+documents = []
+for dataset_name in ragbench.keys():
+    for split in ragbench[dataset_name].keys():
+        original_documents = ragbench[dataset_name][split]['documents']
+        chunked_documents = chunk_documents_semantic(original_documents)
+        documents.extend([Document(page_content=chunk) for chunk in chunked_documents])
+
+# Initialize vectordb with processed documents
+vectordb = Chroma.from_documents(
+    documents=documents,
+    embedding=embedding_model,
+    persist_directory='./docs/chroma/'
+)
+vectordb.persist()
+
 def process_query(query, dataset_choice):
     try:
         logger.info(f"Processing query for {dataset_choice}: {query}")
 
-        # Get relevant documents specific to the chosen dataset
         relevant_docs = vectordb.max_marginal_relevance_search(
             query,
-            k=5,
-            fetch_k=10
+            k=5,
+            fetch_k=10
         )
 
         context = " ".join([doc.page_content for doc in relevant_docs])
 
         response = openai.chat.completions.create(
-            model="gpt-
+            model="gpt-3.5-turbo",
             messages=[
                 {"role": "system", "content": "You are a specialized assistant for the RagBench dataset. Provide precise answers based solely on the given context."},
                 {"role": "user", "content": f"Dataset: {dataset_choice}\nContext: {context}\nQuestion: {query}\n\nProvide a detailed answer using only the information from the context above."}
```