Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -67,7 +67,10 @@ def get_embeddings():
|
|
67 |
|
68 |
def update_vectors(files, parser):
|
69 |
global uploaded_documents
|
|
|
|
|
70 |
if not files:
|
|
|
71 |
return "Please upload at least one PDF file.", gr.CheckboxGroup(
|
72 |
choices=[doc["name"] for doc in uploaded_documents],
|
73 |
value=[doc["name"] for doc in uploaded_documents if doc["selected"]],
|
@@ -79,20 +82,33 @@ def update_vectors(files, parser):
|
|
79 |
|
80 |
all_data = []
|
81 |
for file in files:
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
if os.path.exists("faiss_database"):
|
|
|
90 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
91 |
database.add_documents(all_data)
|
92 |
else:
|
|
|
93 |
database = FAISS.from_documents(all_data, embed)
|
94 |
|
95 |
database.save_local("faiss_database")
|
|
|
96 |
|
97 |
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", gr.CheckboxGroup(
|
98 |
choices=[doc["name"] for doc in uploaded_documents],
|
@@ -369,30 +385,45 @@ After writing the document, please provide a list of sources used in your respon
|
|
369 |
yield main_content, "" # Yield partial main content without sources
|
370 |
|
371 |
def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
|
|
|
|
|
372 |
embed = get_embeddings()
|
373 |
if os.path.exists("faiss_database"):
|
|
|
374 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
375 |
else:
|
|
|
376 |
yield "No documents available. Please upload PDF documents to answer questions."
|
377 |
return
|
378 |
|
379 |
retriever = database.as_retriever()
|
|
|
380 |
relevant_docs = retriever.get_relevant_documents(query)
|
|
|
381 |
|
382 |
# Filter relevant_docs based on selected documents
|
383 |
filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
|
|
|
384 |
|
385 |
if not filtered_docs:
|
|
|
386 |
yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
|
387 |
return
|
388 |
|
|
|
|
|
|
|
|
|
389 |
context_str = "\n".join([doc.page_content for doc in filtered_docs])
|
|
|
390 |
|
391 |
if model == "@cf/meta/llama-3.1-8b-instruct":
|
|
|
392 |
# Use Cloudflare API with the retrieved context
|
393 |
for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
|
394 |
yield response
|
395 |
else:
|
|
|
396 |
# Use Hugging Face API
|
397 |
prompt = f"""Using the following context from the PDF documents:
|
398 |
{context_str}
|
@@ -402,6 +433,7 @@ Write a detailed and complete response that answers the following user question:
|
|
402 |
|
403 |
response = ""
|
404 |
for i in range(num_calls):
|
|
|
405 |
for message in client.chat_completion(
|
406 |
messages=[{"role": "user", "content": prompt}],
|
407 |
max_tokens=1000,
|
@@ -412,6 +444,8 @@ Write a detailed and complete response that answers the following user question:
|
|
412 |
chunk = message.choices[0].delta.content
|
413 |
response += chunk
|
414 |
yield response # Yield partial response
|
|
|
|
|
415 |
|
416 |
def vote(data: gr.LikeData):
|
417 |
if data.liked:
|
|
|
67 |
|
68 |
def update_vectors(files, parser):
|
69 |
global uploaded_documents
|
70 |
+
logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
|
71 |
+
|
72 |
if not files:
|
73 |
+
logging.warning("No files provided for update_vectors")
|
74 |
return "Please upload at least one PDF file.", gr.CheckboxGroup(
|
75 |
choices=[doc["name"] for doc in uploaded_documents],
|
76 |
value=[doc["name"] for doc in uploaded_documents if doc["selected"]],
|
|
|
82 |
|
83 |
all_data = []
|
84 |
for file in files:
|
85 |
+
logging.info(f"Processing file: {file.name}")
|
86 |
+
try:
|
87 |
+
data = load_document(file, parser)
|
88 |
+
logging.info(f"Loaded {len(data)} chunks from {file.name}")
|
89 |
+
all_data.extend(data)
|
90 |
+
total_chunks += len(data)
|
91 |
+
# Append new documents instead of replacing
|
92 |
+
if not any(doc["name"] == file.name for doc in uploaded_documents):
|
93 |
+
uploaded_documents.append({"name": file.name, "selected": True})
|
94 |
+
logging.info(f"Added new document to uploaded_documents: {file.name}")
|
95 |
+
else:
|
96 |
+
logging.info(f"Document already exists in uploaded_documents: {file.name}")
|
97 |
+
except Exception as e:
|
98 |
+
logging.error(f"Error processing file {file.name}: {str(e)}")
|
99 |
+
|
100 |
+
logging.info(f"Total chunks processed: {total_chunks}")
|
101 |
|
102 |
if os.path.exists("faiss_database"):
|
103 |
+
logging.info("Updating existing FAISS database")
|
104 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
105 |
database.add_documents(all_data)
|
106 |
else:
|
107 |
+
logging.info("Creating new FAISS database")
|
108 |
database = FAISS.from_documents(all_data, embed)
|
109 |
|
110 |
database.save_local("faiss_database")
|
111 |
+
logging.info("FAISS database saved")
|
112 |
|
113 |
return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", gr.CheckboxGroup(
|
114 |
choices=[doc["name"] for doc in uploaded_documents],
|
|
|
385 |
yield main_content, "" # Yield partial main content without sources
|
386 |
|
387 |
def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
|
388 |
+
logging.info(f"Entering get_response_from_pdf with query: {query}, model: {model}, selected_docs: {selected_docs}")
|
389 |
+
|
390 |
embed = get_embeddings()
|
391 |
if os.path.exists("faiss_database"):
|
392 |
+
logging.info("Loading FAISS database")
|
393 |
database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
|
394 |
else:
|
395 |
+
logging.warning("No FAISS database found")
|
396 |
yield "No documents available. Please upload PDF documents to answer questions."
|
397 |
return
|
398 |
|
399 |
retriever = database.as_retriever()
|
400 |
+
logging.info(f"Retrieving relevant documents for query: {query}")
|
401 |
relevant_docs = retriever.get_relevant_documents(query)
|
402 |
+
logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
|
403 |
|
404 |
# Filter relevant_docs based on selected documents
|
405 |
filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
|
406 |
+
logging.info(f"Number of filtered documents: {len(filtered_docs)}")
|
407 |
|
408 |
if not filtered_docs:
|
409 |
+
logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
|
410 |
yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
|
411 |
return
|
412 |
|
413 |
+
for doc in filtered_docs:
|
414 |
+
logging.info(f"Document source: {doc.metadata['source']}")
|
415 |
+
logging.info(f"Document content preview: {doc.page_content[:100]}...") # Log first 100 characters of each document
|
416 |
+
|
417 |
context_str = "\n".join([doc.page_content for doc in filtered_docs])
|
418 |
+
logging.info(f"Total context length: {len(context_str)}")
|
419 |
|
420 |
if model == "@cf/meta/llama-3.1-8b-instruct":
|
421 |
+
logging.info("Using Cloudflare API")
|
422 |
# Use Cloudflare API with the retrieved context
|
423 |
for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
|
424 |
yield response
|
425 |
else:
|
426 |
+
logging.info("Using Hugging Face API")
|
427 |
# Use Hugging Face API
|
428 |
prompt = f"""Using the following context from the PDF documents:
|
429 |
{context_str}
|
|
|
433 |
|
434 |
response = ""
|
435 |
for i in range(num_calls):
|
436 |
+
logging.info(f"API call {i+1}/{num_calls}")
|
437 |
for message in client.chat_completion(
|
438 |
messages=[{"role": "user", "content": prompt}],
|
439 |
max_tokens=1000,
|
|
|
444 |
chunk = message.choices[0].delta.content
|
445 |
response += chunk
|
446 |
yield response # Yield partial response
|
447 |
+
|
448 |
+
logging.info("Finished generating response")
|
449 |
|
450 |
def vote(data: gr.LikeData):
|
451 |
if data.liked:
|