SearchGPT

Running

App Files Files Community

Shreyas094 committed on Jul 29, 2024

Commit

ce5bb9f

verified ·

1 Parent(s): eed06df

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -6

app.py CHANGED Viewed

@@ -67,7 +67,10 @@ def get_embeddings():
 def update_vectors(files, parser):
     global uploaded_documents
     if not files:
         return "Please upload at least one PDF file.", gr.CheckboxGroup(
             choices=[doc["name"] for doc in uploaded_documents],
             value=[doc["name"] for doc in uploaded_documents if doc["selected"]],
@@ -79,20 +82,33 @@ def update_vectors(files, parser):
     all_data = []
     for file in files:
-        data = load_document(file, parser)
-        all_data.extend(data)
-        total_chunks += len(data)
-        # Append new documents instead of replacing
-        if not any(doc["name"] == file.name for doc in uploaded_documents):
-            uploaded_documents.append({"name": file.name, "selected": True})
     if os.path.exists("faiss_database"):
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
         database.add_documents(all_data)
     else:
         database = FAISS.from_documents(all_data, embed)
     database.save_local("faiss_database")
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", gr.CheckboxGroup(
         choices=[doc["name"] for doc in uploaded_documents],
@@ -369,30 +385,45 @@ After writing the document, please provide a list of sources used in your respon
                     yield main_content, ""  # Yield partial main content without sources
 def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
     embed = get_embeddings()
     if os.path.exists("faiss_database"):
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
     else:
         yield "No documents available. Please upload PDF documents to answer questions."
         return
     retriever = database.as_retriever()
     relevant_docs = retriever.get_relevant_documents(query)
     # Filter relevant_docs based on selected documents
     filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
     if not filtered_docs:
         yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
         return
     context_str = "\n".join([doc.page_content for doc in filtered_docs])
     if model == "@cf/meta/llama-3.1-8b-instruct":
         # Use Cloudflare API with the retrieved context
         for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
             yield response
     else:
         # Use Hugging Face API
         prompt = f"""Using the following context from the PDF documents:
 {context_str}
@@ -402,6 +433,7 @@ Write a detailed and complete response that answers the following user question:
         response = ""
         for i in range(num_calls):
             for message in client.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=1000,
@@ -412,6 +444,8 @@ Write a detailed and complete response that answers the following user question:
                     chunk = message.choices[0].delta.content
                     response += chunk
                     yield response  # Yield partial response
 def vote(data: gr.LikeData):
     if data.liked:

 def update_vectors(files, parser):
     global uploaded_documents
+    logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
     if not files:
+        logging.warning("No files provided for update_vectors")
         return "Please upload at least one PDF file.", gr.CheckboxGroup(
             choices=[doc["name"] for doc in uploaded_documents],
             value=[doc["name"] for doc in uploaded_documents if doc["selected"]],
     all_data = []
     for file in files:
+        logging.info(f"Processing file: {file.name}")
+        try:
+            data = load_document(file, parser)
+            logging.info(f"Loaded {len(data)} chunks from {file.name}")
+            all_data.extend(data)
+            total_chunks += len(data)
+            # Append new documents instead of replacing
+            if not any(doc["name"] == file.name for doc in uploaded_documents):
+                uploaded_documents.append({"name": file.name, "selected": True})
+                logging.info(f"Added new document to uploaded_documents: {file.name}")
+            else:
+                logging.info(f"Document already exists in uploaded_documents: {file.name}")
+        except Exception as e:
+            logging.error(f"Error processing file {file.name}: {str(e)}")
+    logging.info(f"Total chunks processed: {total_chunks}")
     if os.path.exists("faiss_database"):
+        logging.info("Updating existing FAISS database")
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
         database.add_documents(all_data)
     else:
+        logging.info("Creating new FAISS database")
         database = FAISS.from_documents(all_data, embed)
     database.save_local("faiss_database")
+    logging.info("FAISS database saved")
     return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", gr.CheckboxGroup(
         choices=[doc["name"] for doc in uploaded_documents],
                     yield main_content, ""  # Yield partial main content without sources
 def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
+    logging.info(f"Entering get_response_from_pdf with query: {query}, model: {model}, selected_docs: {selected_docs}")
     embed = get_embeddings()
     if os.path.exists("faiss_database"):
+        logging.info("Loading FAISS database")
         database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
     else:
+        logging.warning("No FAISS database found")
         yield "No documents available. Please upload PDF documents to answer questions."
         return
     retriever = database.as_retriever()
+    logging.info(f"Retrieving relevant documents for query: {query}")
     relevant_docs = retriever.get_relevant_documents(query)
+    logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
     # Filter relevant_docs based on selected documents
     filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
+    logging.info(f"Number of filtered documents: {len(filtered_docs)}")
     if not filtered_docs:
+        logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
         yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
         return
+    for doc in filtered_docs:
+        logging.info(f"Document source: {doc.metadata['source']}")
+        logging.info(f"Document content preview: {doc.page_content[:100]}...")  # Log first 100 characters of each document
     context_str = "\n".join([doc.page_content for doc in filtered_docs])
+    logging.info(f"Total context length: {len(context_str)}")
     if model == "@cf/meta/llama-3.1-8b-instruct":
+        logging.info("Using Cloudflare API")
         # Use Cloudflare API with the retrieved context
         for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
             yield response
     else:
+        logging.info("Using Hugging Face API")
         # Use Hugging Face API
         prompt = f"""Using the following context from the PDF documents:
 {context_str}
         response = ""
         for i in range(num_calls):
+            logging.info(f"API call {i+1}/{num_calls}")
             for message in client.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=1000,
                     chunk = message.choices[0].delta.content
                     response += chunk
                     yield response  # Yield partial response
+        logging.info("Finished generating response")
 def vote(data: gr.LikeData):
     if data.liked: