Shreyas094 committed on
Commit
ce5bb9f
·
verified ·
1 Parent(s): eed06df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -6
app.py CHANGED
@@ -67,7 +67,10 @@ def get_embeddings():
67
 
68
  def update_vectors(files, parser):
69
  global uploaded_documents
 
 
70
  if not files:
 
71
  return "Please upload at least one PDF file.", gr.CheckboxGroup(
72
  choices=[doc["name"] for doc in uploaded_documents],
73
  value=[doc["name"] for doc in uploaded_documents if doc["selected"]],
@@ -79,20 +82,33 @@ def update_vectors(files, parser):
79
 
80
  all_data = []
81
  for file in files:
82
- data = load_document(file, parser)
83
- all_data.extend(data)
84
- total_chunks += len(data)
85
- # Append new documents instead of replacing
86
- if not any(doc["name"] == file.name for doc in uploaded_documents):
87
- uploaded_documents.append({"name": file.name, "selected": True})
 
 
 
 
 
 
 
 
 
 
88
 
89
  if os.path.exists("faiss_database"):
 
90
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
91
  database.add_documents(all_data)
92
  else:
 
93
  database = FAISS.from_documents(all_data, embed)
94
 
95
  database.save_local("faiss_database")
 
96
 
97
  return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", gr.CheckboxGroup(
98
  choices=[doc["name"] for doc in uploaded_documents],
@@ -369,30 +385,45 @@ After writing the document, please provide a list of sources used in your respon
369
  yield main_content, "" # Yield partial main content without sources
370
 
371
  def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
 
 
372
  embed = get_embeddings()
373
  if os.path.exists("faiss_database"):
 
374
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
375
  else:
 
376
  yield "No documents available. Please upload PDF documents to answer questions."
377
  return
378
 
379
  retriever = database.as_retriever()
 
380
  relevant_docs = retriever.get_relevant_documents(query)
 
381
 
382
  # Filter relevant_docs based on selected documents
383
  filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
 
384
 
385
  if not filtered_docs:
 
386
  yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
387
  return
388
 
 
 
 
 
389
  context_str = "\n".join([doc.page_content for doc in filtered_docs])
 
390
 
391
  if model == "@cf/meta/llama-3.1-8b-instruct":
 
392
  # Use Cloudflare API with the retrieved context
393
  for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
394
  yield response
395
  else:
 
396
  # Use Hugging Face API
397
  prompt = f"""Using the following context from the PDF documents:
398
  {context_str}
@@ -402,6 +433,7 @@ Write a detailed and complete response that answers the following user question:
402
 
403
  response = ""
404
  for i in range(num_calls):
 
405
  for message in client.chat_completion(
406
  messages=[{"role": "user", "content": prompt}],
407
  max_tokens=1000,
@@ -412,6 +444,8 @@ Write a detailed and complete response that answers the following user question:
412
  chunk = message.choices[0].delta.content
413
  response += chunk
414
  yield response # Yield partial response
 
 
415
 
416
  def vote(data: gr.LikeData):
417
  if data.liked:
 
67
 
68
  def update_vectors(files, parser):
69
  global uploaded_documents
70
+ logging.info(f"Entering update_vectors with {len(files)} files and parser: {parser}")
71
+
72
  if not files:
73
+ logging.warning("No files provided for update_vectors")
74
  return "Please upload at least one PDF file.", gr.CheckboxGroup(
75
  choices=[doc["name"] for doc in uploaded_documents],
76
  value=[doc["name"] for doc in uploaded_documents if doc["selected"]],
 
82
 
83
  all_data = []
84
  for file in files:
85
+ logging.info(f"Processing file: {file.name}")
86
+ try:
87
+ data = load_document(file, parser)
88
+ logging.info(f"Loaded {len(data)} chunks from {file.name}")
89
+ all_data.extend(data)
90
+ total_chunks += len(data)
91
+ # Append new documents instead of replacing
92
+ if not any(doc["name"] == file.name for doc in uploaded_documents):
93
+ uploaded_documents.append({"name": file.name, "selected": True})
94
+ logging.info(f"Added new document to uploaded_documents: {file.name}")
95
+ else:
96
+ logging.info(f"Document already exists in uploaded_documents: {file.name}")
97
+ except Exception as e:
98
+ logging.error(f"Error processing file {file.name}: {str(e)}")
99
+
100
+ logging.info(f"Total chunks processed: {total_chunks}")
101
 
102
  if os.path.exists("faiss_database"):
103
+ logging.info("Updating existing FAISS database")
104
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
105
  database.add_documents(all_data)
106
  else:
107
+ logging.info("Creating new FAISS database")
108
  database = FAISS.from_documents(all_data, embed)
109
 
110
  database.save_local("faiss_database")
111
+ logging.info("FAISS database saved")
112
 
113
  return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files using {parser}.", gr.CheckboxGroup(
114
  choices=[doc["name"] for doc in uploaded_documents],
 
385
  yield main_content, "" # Yield partial main content without sources
386
 
387
  def get_response_from_pdf(query, model, selected_docs, num_calls=3, temperature=0.2):
388
+ logging.info(f"Entering get_response_from_pdf with query: {query}, model: {model}, selected_docs: {selected_docs}")
389
+
390
  embed = get_embeddings()
391
  if os.path.exists("faiss_database"):
392
+ logging.info("Loading FAISS database")
393
  database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
394
  else:
395
+ logging.warning("No FAISS database found")
396
  yield "No documents available. Please upload PDF documents to answer questions."
397
  return
398
 
399
  retriever = database.as_retriever()
400
+ logging.info(f"Retrieving relevant documents for query: {query}")
401
  relevant_docs = retriever.get_relevant_documents(query)
402
+ logging.info(f"Number of relevant documents retrieved: {len(relevant_docs)}")
403
 
404
  # Filter relevant_docs based on selected documents
405
  filtered_docs = [doc for doc in relevant_docs if doc.metadata["source"] in selected_docs]
406
+ logging.info(f"Number of filtered documents: {len(filtered_docs)}")
407
 
408
  if not filtered_docs:
409
+ logging.warning(f"No relevant information found in the selected documents: {selected_docs}")
410
  yield "No relevant information found in the selected documents. Please try selecting different documents or rephrasing your query."
411
  return
412
 
413
+ for doc in filtered_docs:
414
+ logging.info(f"Document source: {doc.metadata['source']}")
415
+ logging.info(f"Document content preview: {doc.page_content[:100]}...") # Log first 100 characters of each document
416
+
417
  context_str = "\n".join([doc.page_content for doc in filtered_docs])
418
+ logging.info(f"Total context length: {len(context_str)}")
419
 
420
  if model == "@cf/meta/llama-3.1-8b-instruct":
421
+ logging.info("Using Cloudflare API")
422
  # Use Cloudflare API with the retrieved context
423
  for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
424
  yield response
425
  else:
426
+ logging.info("Using Hugging Face API")
427
  # Use Hugging Face API
428
  prompt = f"""Using the following context from the PDF documents:
429
  {context_str}
 
433
 
434
  response = ""
435
  for i in range(num_calls):
436
+ logging.info(f"API call {i+1}/{num_calls}")
437
  for message in client.chat_completion(
438
  messages=[{"role": "user", "content": prompt}],
439
  max_tokens=1000,
 
444
  chunk = message.choices[0].delta.content
445
  response += chunk
446
  yield response # Yield partial response
447
+
448
+ logging.info("Finished generating response")
449
 
450
  def vote(data: gr.LikeData):
451
  if data.liked: