Update app.py
app.py CHANGED
@@ -111,9 +111,9 @@ def update_vectors(files, parser):
         return "Please upload at least one PDF file.", display_documents()
 
     embed = get_embeddings()
-    total_chunks = 0
-
     all_data = []
+    seen_contents = set()
+
     for file in files:
         logging.info(f"Processing file: {file.name}")
         try:
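This first hunk replaces the total_chunks counter with a seen_contents set that the next hunk uses for deduplication; the chunk count reported to the user now comes from len(all_data) (see the return statement later in update_vectors), so it only counts chunks that survive the dedupe.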
@@ -122,8 +122,14 @@ def update_vectors(files, parser):
                 logging.warning(f"No chunks loaded from {file.name}")
                 continue
             logging.info(f"Loaded {len(data)} chunks from {file.name}")
-            all_data.extend(data)
-            total_chunks += len(data)
+
+            for chunk in data:
+                if chunk.page_content not in seen_contents:
+                    all_data.append(chunk)
+                    seen_contents.add(chunk.page_content)
+                else:
+                    logging.warning(f"Duplicate content detected in {file.name}, skipping...")
+
             if not any(doc["name"] == file.name for doc in uploaded_documents):
                 uploaded_documents.append({"name": file.name, "selected": True})
                 logging.info(f"Added new document to uploaded_documents: {file.name}")
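The loop added here deduplicates by exact page_content equality, so it only catches byte-identical chunks; near-duplicates (the same paragraph with different whitespace, say) still pass through. A minimal standalone sketch of the same technique, assuming LangChain-style Document objects (langchain.schema.Document is an assumed import; the app's own loader may produce a different but compatible type):

    from langchain.schema import Document

    def dedupe_chunks(chunks):
        # Keep the first occurrence of each distinct page_content.
        seen_contents = set()
        unique = []
        for chunk in chunks:
            if chunk.page_content not in seen_contents:
                unique.append(chunk)
                seen_contents.add(chunk.page_content)
        return unique

    chunks = [Document(page_content="Item 8.01 Other Events"),
              Document(page_content="Item 8.01 Other Events"),
              Document(page_content="Item 9.01 Financial Statements")]
    assert len(dedupe_chunks(chunks)) == 2

Because seen_contents is created once before the file loop, a chunk that appears in two different PDFs is also dropped from the second file, which may or may not be intended.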
@@ -132,8 +138,6 @@ def update_vectors(files, parser):
         except Exception as e:
             logging.error(f"Error processing file {file.name}: {str(e)}")
 
-    logging.info(f"Total chunks processed: {total_chunks}")
-
     if not all_data:
         logging.warning("No valid data extracted from uploaded files")
         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
@@ -153,6 +157,17 @@ def update_vectors(files, parser):
 
         database.save_local("faiss_database")
         logging.info("FAISS database saved")
+
+        # Check the database after updating
+        check_faiss_database()
+
+        # Analyze document similarity
+        analyze_document_similarity()
+
+        # Test document retrieval
+        test_document_retrieval("Tell me about the contents of the 8K filing")
+        test_document_retrieval("What information is in the 10Q report?")
+
     except Exception as e:
         logging.error(f"Error updating FAISS database: {str(e)}")
         return f"Error updating vector store: {str(e)}", display_documents()
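check_faiss_database() and test_document_retrieval() are called here but not defined anywhere in this diff, so the Space will raise NameError on upload unless they exist elsewhere in app.py. A hypothetical sketch of what such helpers might look like (names and bodies are assumptions, not part of this commit):

    def check_faiss_database():
        # Assumed helper: report how many vectors the saved index holds.
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
        logging.info(f"FAISS index contains {database.index.ntotal} vectors")

    def test_document_retrieval(query):
        # Assumed helper: log the top matches for a probe query.
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
        for doc in database.similarity_search(query, k=3):
            logging.info(f"{query!r} -> {doc.metadata.get('source')}: {doc.page_content[:80]}")

Note also that the hard-coded probe queries ("8K filing", "10Q report") run on every upload, adding latency even for users whose documents have nothing to do with SEC filings.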
@@ -160,7 +175,28 @@ def update_vectors(files, parser):
     save_documents(uploaded_documents)
     logging.info(f"Updated documents saved. Total documents: {len(uploaded_documents)}")
 
-    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
+    return f"Vector store updated successfully. Processed {len(all_data)} chunks from {len(files)} files using {parser}.", display_documents()
+
+from sklearn.metrics.pairwise import cosine_similarity
+
+def analyze_document_similarity():
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+
+    docs = list(database.docstore.docs.values())
+    embeddings = [database.embedding_function(doc.page_content) for doc in docs]
+
+    similarity_matrix = cosine_similarity(embeddings)
+
+    for i in range(len(docs)):
+        for j in range(i+1, len(docs)):
+            similarity = similarity_matrix[i][j]
+            logging.info(f"Similarity between {docs[i].metadata['source']} and {docs[j].metadata['source']}: {similarity}")
+            if similarity > 0.9:  # Adjust this threshold as needed
+                logging.warning(f"High similarity detected between {docs[i].metadata['source']} and {docs[j].metadata['source']}")
+
+# Call this after updating the vector store
+analyze_document_similarity()
 
 def delete_documents(selected_docs):
     global uploaded_documents
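Two issues are worth flagging in the new analyze_document_similarity(). First, the module-level call at the bottom runs at import time, before any vector store exists, so loading faiss_database will fail on a fresh Space; it is also redundant, since update_vectors already calls the function. Second, with LangChain's InMemoryDocstore the documents live in the private _dict attribute rather than docstore.docs, and embeddings are normally recomputed through the embeddings object rather than database.embedding_function. A sketch of a version along those lines (attribute names vary across LangChain releases, so verify against the pinned version):

    from sklearn.metrics.pairwise import cosine_similarity

    def analyze_document_similarity(threshold=0.9):
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)

        # InMemoryDocstore keeps its documents in the private _dict mapping.
        docs = list(database.docstore._dict.values())
        embeddings = [embed.embed_query(doc.page_content) for doc in docs]

        similarity_matrix = cosine_similarity(embeddings)
        for i in range(len(docs)):
            for j in range(i + 1, len(docs)):
                if similarity_matrix[i][j] > threshold:
                    logging.warning(
                        f"High similarity ({similarity_matrix[i][j]:.2f}) between "
                        f"{docs[i].metadata['source']} and {docs[j].metadata['source']}"
                    )

The pairwise loop is O(n^2) in the number of chunks and re-embeds everything, so it will get slow once more than a handful of documents are indexed.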
@@ -490,13 +526,17 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
         return
 
     try:
-        retriever = database.as_retriever()
+        retriever = database.as_retriever(search_kwargs={"k": 10})  # Increase k to retrieve more documents
         logging.info(f"Retrieving relevant documents for query: {query}")
         relevant_docs = retriever.get_relevant_documents(query)
         logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
 
-        # Filter relevant_docs based on selected documents
-        filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
+        for i, doc in enumerate(relevant_docs):
+            logging.info(f"Relevant document {i+1}: {doc.metadata['source']}, Score: {doc.metadata.get('score', 'N/A')}")
+            logging.info(f"Relevant document {i+1} content preview: {doc.page_content[:100]}...")
+
+        # Filter relevant_docs based on selected documents, but keep original order
+        filtered_docs = [doc for doc in relevant_docs if any(selected_doc in doc.metadata["source"] for selected_doc in selected_docs)]
         logging.info(f"Number of filtered documents: {len(filtered_docs)}")
 
         if not filtered_docs:
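Raising k to 10 widens the net, but because filtering happens after retrieval, the top 10 hits can still all come from unselected documents, leaving filtered_docs empty. A sketch of an alternative that queries per selected document so no single file can crowd the others out (this assumes the installed LangChain version supports FAISS metadata filtering via the filter argument, and that metadata["source"] matches the names in selected_docs exactly):

    def retrieve_balanced(database, query, selected_docs, k_per_doc=3):
        # Query the index once per selected document, restricted to that source.
        results = []
        for name in selected_docs:
            results.extend(
                database.similarity_search(query, k=k_per_doc, filter={"source": name})
            )
        return results

Also note that doc.metadata.get('score', 'N/A') in the new logging will always print N/A: retrievers do not write scores into metadata. similarity_search_with_score() is the usual way to obtain distances if they are wanted in the logs.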
@@ -505,24 +545,28 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
             return
 
         for i, doc in enumerate(filtered_docs):
-            logging.info(f"Document {i+1} source: {doc.metadata['source']}")
-            logging.info(f"Document {i+1} content preview: {doc.page_content[:100]}...")
+            logging.info(f"Filtered document {i+1} source: {doc.metadata['source']}")
+            logging.info(f"Filtered document {i+1} content preview: {doc.page_content[:100]}...")
 
-        context_str = "\n".join([doc.page_content for doc in filtered_docs])
+        context_str = "\n\n".join([f"Document: {doc.metadata['source']}\n{doc.page_content}" for doc in filtered_docs])
         logging.info(f"Total context length: {len(context_str)}")
 
+        prompt = f"""You are analyzing multiple financial documents. The following documents have been selected: {', '.join(selected_docs)}
+
+Using the following context from the selected PDF documents:
+
+{context_str}
+
+Please provide a detailed and complete response that answers the following user question, making sure to consider information from all selected documents: '{query}'
+
+If the information is not found in the provided context, please state that clearly."""
+
         if model == "@cf/meta/llama-3.1-8b-instruct":
             logging.info("Using Cloudflare API")
-            # Use Cloudflare API
-            for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+            for response in get_response_from_cloudflare(prompt=prompt, context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
                 yield response
         else:
             logging.info("Using Hugging Face API")
-            # Use Hugging Face API
-            prompt = f"""Using the following context from the PDF documents:
-{context_str}
-Write a detailed and complete response that answers the following user question: '{query}'"""
-
             client = InferenceClient(model, token=huggingface_token)
 
             response = ""
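The prompt is now built once, before the model branch, and passed as prompt=prompt to get_response_from_cloudflare instead of the previous empty string. Since context=context_str is still passed separately, the context may end up in the request twice depending on how that function assembles its messages, which is worth verifying. There is also no cap on context size: ten retrieved chunks with per-document headers can exceed the model's context window. A hypothetical guard, inserted before the prompt is formatted (the 8000-character budget is an assumption, not something this commit defines):

    MAX_CONTEXT_CHARS = 8000  # assumed budget; tune to the model's context window
    if len(context_str) > MAX_CONTEXT_CHARS:
        logging.warning(f"Context truncated from {len(context_str)} to {MAX_CONTEXT_CHARS} chars")
        context_str = context_str[:MAX_CONTEXT_CHARS]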