Update app.py
app.py CHANGED
@@ -111,9 +111,9 @@ def update_vectors(files, parser):
         return "Please upload at least one PDF file.", display_documents()
 
     embed = get_embeddings()
-    total_chunks = 0
-
     all_data = []
+    seen_contents = set()
+
     for file in files:
         logging.info(f"Processing file: {file.name}")
         try:
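This first hunk replaces the total_chunks counter with a seen_contents set that the next hunk uses for deduplication; the chunk count reported to the user now comes from len(all_data) (see the return statement later in update_vectors), so it only counts chunks that survive the dedupe.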
@@ -122,8 +122,14 @@ def update_vectors(files, parser):
                 logging.warning(f"No chunks loaded from {file.name}")
                 continue
             logging.info(f"Loaded {len(data)} chunks from {file.name}")
-            all_data.extend(data)
-            total_chunks += len(data)
+
+            for chunk in data:
+                if chunk.page_content not in seen_contents:
+                    all_data.append(chunk)
+                    seen_contents.add(chunk.page_content)
+                else:
+                    logging.warning(f"Duplicate content detected in {file.name}, skipping...")
+
             if not any(doc["name"] == file.name for doc in uploaded_documents):
                 uploaded_documents.append({"name": file.name, "selected": True})
                 logging.info(f"Added new document to uploaded_documents: {file.name}")
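The loop added here deduplicates by exact page_content equality, so it only catches byte-identical chunks; near-duplicates (the same paragraph with different whitespace, say) still pass through. A minimal standalone sketch of the same technique, assuming LangChain-style Document objects (langchain.schema.Document is an assumed import; the app's own loader may produce a different but compatible type):

    from langchain.schema import Document

    def dedupe_chunks(chunks):
        # Keep the first occurrence of each distinct page_content.
        seen_contents = set()
        unique = []
        for chunk in chunks:
            if chunk.page_content not in seen_contents:
                unique.append(chunk)
                seen_contents.add(chunk.page_content)
        return unique

    chunks = [Document(page_content="Item 8.01 Other Events"),
              Document(page_content="Item 8.01 Other Events"),
              Document(page_content="Item 9.01 Financial Statements")]
    assert len(dedupe_chunks(chunks)) == 2

Because seen_contents is created once before the file loop, a chunk that appears in two different PDFs is also dropped from the second file, which may or may not be intended.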
@@ -132,8 +138,6 @@ def update_vectors(files, parser):
         except Exception as e:
             logging.error(f"Error processing file {file.name}: {str(e)}")
 
-    logging.info(f"Total chunks processed: {total_chunks}")
-
     if not all_data:
         logging.warning("No valid data extracted from uploaded files")
         return "No valid data could be extracted from the uploaded files. Please check the file contents and try again.", display_documents()
@@ -153,6 +157,17 @@ def update_vectors(files, parser):
 
         database.save_local("faiss_database")
         logging.info("FAISS database saved")
+
+        # Check the database after updating
+        check_faiss_database()
+
+        # Analyze document similarity
+        analyze_document_similarity()
+
+        # Test document retrieval
+        test_document_retrieval("Tell me about the contents of the 8K filing")
+        test_document_retrieval("What information is in the 10Q report?")
+
     except Exception as e:
         logging.error(f"Error updating FAISS database: {str(e)}")
         return f"Error updating vector store: {str(e)}", display_documents()
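check_faiss_database() and test_document_retrieval() are called here but not defined anywhere in this diff, so the Space will raise NameError on upload unless they exist elsewhere in app.py. A hypothetical sketch of what such helpers might look like (names and bodies are assumptions, not part of this commit):

    def check_faiss_database():
        # Assumed helper: report how many vectors the saved index holds.
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
        logging.info(f"FAISS index contains {database.index.ntotal} vectors")

    def test_document_retrieval(query):
        # Assumed helper: log the top matches for a probe query.
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
        for doc in database.similarity_search(query, k=3):
            logging.info(f"{query!r} -> {doc.metadata.get('source')}: {doc.page_content[:80]}")

Note also that the hard-coded probe queries ("8K filing", "10Q report") run on every upload, adding latency even for users whose documents have nothing to do with SEC filings.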
@@ -160,7 +175,28 @@ def update_vectors(files, parser):
     save_documents(uploaded_documents)
     logging.info(f"Updated documents saved. Total documents: {len(uploaded_documents)}")
 
-    return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", display_documents()
+    return f"Vector store updated successfully. Processed {len(all_data)} chunks from {len(files)} files using {parser}.", display_documents()
+
+from sklearn.metrics.pairwise import cosine_similarity
+
+def analyze_document_similarity():
+    embed = get_embeddings()
+    database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
+
+    docs = list(database.docstore.docs.values())
+    embeddings = [database.embedding_function(doc.page_content) for doc in docs]
+
+    similarity_matrix = cosine_similarity(embeddings)
+
+    for i in range(len(docs)):
+        for j in range(i+1, len(docs)):
+            similarity = similarity_matrix[i][j]
+            logging.info(f"Similarity between {docs[i].metadata['source']} and {docs[j].metadata['source']}: {similarity}")
+            if similarity > 0.9:  # Adjust this threshold as needed
+                logging.warning(f"High similarity detected between {docs[i].metadata['source']} and {docs[j].metadata['source']}")
+
+# Call this after updating the vector store
+analyze_document_similarity()
 
 def delete_documents(selected_docs):
     global uploaded_documents
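Two issues are worth flagging in the new analyze_document_similarity(). First, the module-level call at the bottom runs at import time, before any vector store exists, so loading faiss_database will fail on a fresh Space; it is also redundant, since update_vectors already calls the function. Second, with LangChain's InMemoryDocstore the documents live in the private _dict attribute rather than docstore.docs, and embeddings are normally recomputed through the embeddings object rather than database.embedding_function. A sketch of a version along those lines (attribute names vary across LangChain releases, so verify against the pinned version):

    from sklearn.metrics.pairwise import cosine_similarity

    def analyze_document_similarity(threshold=0.9):
        embed = get_embeddings()
        database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)

        # InMemoryDocstore keeps its documents in the private _dict mapping.
        docs = list(database.docstore._dict.values())
        embeddings = [embed.embed_query(doc.page_content) for doc in docs]

        similarity_matrix = cosine_similarity(embeddings)
        for i in range(len(docs)):
            for j in range(i + 1, len(docs)):
                if similarity_matrix[i][j] > threshold:
                    logging.warning(
                        f"High similarity ({similarity_matrix[i][j]:.2f}) between "
                        f"{docs[i].metadata['source']} and {docs[j].metadata['source']}"
                    )

The pairwise loop is O(n^2) in the number of chunks and re-embeds everything, so it will get slow once more than a handful of documents are indexed.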
@@ -490,13 +526,17 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
         return
 
     try:
-        retriever = database.as_retriever()
+        retriever = database.as_retriever(search_kwargs={"k": 10})  # Increase k to retrieve more documents
         logging.info(f"Retrieving relevant documents for query: {query}")
         relevant_docs = retriever.get_relevant_documents(query)
         logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
 
-        # Filter relevant_docs based on selected documents
-        filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
+        for i, doc in enumerate(relevant_docs):
+            logging.info(f"Relevant document {i+1}: {doc.metadata['source']}, Score: {doc.metadata.get('score', 'N/A')}")
+            logging.info(f"Relevant document {i+1} content preview: {doc.page_content[:100]}...")
+
+        # Filter relevant_docs based on selected documents, but keep original order
+        filtered_docs = [doc for doc in relevant_docs if any(selected_doc in doc.metadata["source"] for selected_doc in selected_docs)]
         logging.info(f"Number of filtered documents: {len(filtered_docs)}")
 
         if not filtered_docs:
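Raising k to 10 widens the net, but because filtering happens after retrieval, the top 10 hits can still all come from unselected documents, leaving filtered_docs empty. A sketch of an alternative that queries per selected document so no single file can crowd the others out (this assumes the installed LangChain version supports FAISS metadata filtering via the filter argument, and that metadata["source"] matches the names in selected_docs exactly):

    def retrieve_balanced(database, query, selected_docs, k_per_doc=3):
        # Query the index once per selected document, restricted to that source.
        results = []
        for name in selected_docs:
            results.extend(
                database.similarity_search(query, k=k_per_doc, filter={"source": name})
            )
        return results

Also note that doc.metadata.get('score', 'N/A') in the new logging will always print N/A: retrievers do not write scores into metadata. similarity_search_with_score() is the usual way to obtain distances if they are wanted in the logs.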
@@ -505,24 +545,28 @@ def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=
             return
 
         for i, doc in enumerate(filtered_docs):
-            logging.info(f"Document {i+1} source: {doc.metadata['source']}")
-            logging.info(f"Document {i+1} content preview: {doc.page_content[:100]}...")
+            logging.info(f"Filtered document {i+1} source: {doc.metadata['source']}")
+            logging.info(f"Filtered document {i+1} content preview: {doc.page_content[:100]}...")
 
-        context_str = "\n".join([doc.page_content for doc in filtered_docs])
+        context_str = "\n\n".join([f"Document: {doc.metadata['source']}\n{doc.page_content}" for doc in filtered_docs])
         logging.info(f"Total context length: {len(context_str)}")
 
+        prompt = f"""You are analyzing multiple financial documents. The following documents have been selected: {', '.join(selected_docs)}
+
+Using the following context from the selected PDF documents:
+
+{context_str}
+
+Please provide a detailed and complete response that answers the following user question, making sure to consider information from all selected documents: '{query}'
+
+If the information is not found in the provided context, please state that clearly."""
+
         if model == "@cf/meta/llama-3.1-8b-instruct":
             logging.info("Using Cloudflare API")
-            # Use Cloudflare API
-            for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
+            for response in get_response_from_cloudflare(prompt=prompt, context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
                 yield response
         else:
             logging.info("Using Hugging Face API")
-            # Use Hugging Face API
-            prompt = f"""Using the following context from the PDF documents:
-{context_str}
-Write a detailed and complete response that answers the following user question: '{query}'"""
-
             client = InferenceClient(model, token=huggingface_token)
 
             response = ""
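The prompt is now built once, before the model branch, and passed as prompt=prompt to get_response_from_cloudflare instead of the previous empty string. Since context=context_str is still passed separately, the context may end up in the request twice depending on how that function assembles its messages, which is worth verifying. There is also no cap on context size: ten retrieved chunks with per-document headers can exceed the model's context window. A hypothetical guard, inserted before the prompt is formatted (the 8000-character budget is an assumption, not something this commit defines):

    MAX_CONTEXT_CHARS = 8000  # assumed budget; tune to the model's context window
    if len(context_str) > MAX_CONTEXT_CHARS:
        logging.warning(f"Context truncated from {len(context_str)} to {MAX_CONTEXT_CHARS} chars")
        context_str = context_str[:MAX_CONTEXT_CHARS]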