Shreyas094 committed on
Commit
bb706d3
·
verified ·
1 Parent(s): 459b8b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -26
app.py CHANGED
@@ -2,8 +2,7 @@ import os
2
  import json
3
  import gradio as gr
4
  import pandas as pd
5
- import tempfile
6
- from typing import List
7
 
8
  from langchain_core.prompts import ChatPromptTemplate
9
  from langchain_community.vectorstores import FAISS
@@ -11,31 +10,25 @@ from langchain_community.document_loaders import PyPDFLoader
11
  from langchain_core.output_parsers import StrOutputParser
12
  from langchain_community.embeddings import HuggingFaceEmbeddings
13
  from langchain_community.llms import HuggingFaceHub
14
- from langchain_text_splitters import RecursiveCharacterTextSplitter
15
  from langchain_core.runnables import RunnableParallel, RunnablePassthrough
16
- from langchain_core.documents import Document
17
 
18
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
19
 
20
- def load_and_split_document(file: tempfile._TemporaryFileWrapper) -> List[Document]:
21
- """Loads and splits the document into chunks."""
22
  loader = PyPDFLoader(file.name)
23
- pages = loader.load()
24
-
25
- text_splitter = RecursiveCharacterTextSplitter(
26
- chunk_size=1000,
27
- chunk_overlap=200,
28
- length_function=len,
29
- )
30
-
31
- chunks = text_splitter.split_documents(pages)
32
- return chunks
33
 
34
  def get_embeddings():
35
  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
36
 
37
- def create_database(data: List[Document], embeddings):
38
- db = FAISS.from_documents(data, embeddings)
 
 
 
 
39
  db.save_local("faiss_database")
40
 
41
  prompt = """
@@ -74,13 +67,19 @@ def response(database, model, question):
74
  ans = generate_chunked_response(model, formatted_prompt)
75
  return ans
76
 
77
- def update_vectors(file):
78
- if file is None:
79
- return "Please upload a PDF file."
80
- data = load_and_split_document(file)
81
  embed = get_embeddings()
82
- create_database(data, embed)
83
- return f"Vector store updated successfully. Processed {len(data)} chunks."
 
 
 
 
 
 
84
 
85
  def ask_question(question):
86
  if not question:
@@ -98,7 +97,7 @@ def extract_db_to_excel():
98
  data = [{"page_content": doc.page_content, "metadata": json.dumps(doc.metadata)} for doc in documents]
99
  df = pd.DataFrame(data)
100
 
101
- with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
102
  excel_path = tmp.name
103
  df.to_excel(excel_path, index=False)
104
 
@@ -109,7 +108,7 @@ with gr.Blocks() as demo:
109
  gr.Markdown("# Chat with your PDF documents")
110
 
111
  with gr.Row():
112
- file_input = gr.File(label="Upload your PDF document", file_types=[".pdf"])
113
  update_button = gr.Button("Update Vector Store")
114
 
115
  update_output = gr.Textbox(label="Update Status")
 
2
  import json
3
  import gradio as gr
4
  import pandas as pd
5
+ from tempfile import NamedTemporaryFile
 
6
 
7
  from langchain_core.prompts import ChatPromptTemplate
8
  from langchain_community.vectorstores import FAISS
 
10
  from langchain_core.output_parsers import StrOutputParser
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
  from langchain_community.llms import HuggingFaceHub
 
13
  from langchain_core.runnables import RunnableParallel, RunnablePassthrough
 
14
 
15
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
16
 
17
+ def load_and_split_document(file):
18
+ """Loads and splits the document into pages."""
19
  loader = PyPDFLoader(file.name)
20
+ data = loader.load_and_split()
21
+ return data
 
 
 
 
 
 
 
 
22
 
23
  def get_embeddings():
24
  return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
25
 
26
+ def create_or_update_database(data, embeddings):
27
+ if os.path.exists("faiss_database"):
28
+ db = FAISS.load_local("faiss_database", embeddings)
29
+ db.add_documents(data)
30
+ else:
31
+ db = FAISS.from_documents(data, embeddings)
32
  db.save_local("faiss_database")
33
 
34
  prompt = """
 
67
  ans = generate_chunked_response(model, formatted_prompt)
68
  return ans
69
 
70
+ def update_vectors(files):
71
+ if not files:
72
+ return "Please upload at least one PDF file."
73
+
74
  embed = get_embeddings()
75
+ total_chunks = 0
76
+
77
+ for file in files:
78
+ data = load_and_split_document(file)
79
+ create_or_update_database(data, embed)
80
+ total_chunks += len(data)
81
+
82
+ return f"Vector store updated successfully. Processed {total_chunks} chunks from {len(files)} files."
83
 
84
  def ask_question(question):
85
  if not question:
 
97
  data = [{"page_content": doc.page_content, "metadata": json.dumps(doc.metadata)} for doc in documents]
98
  df = pd.DataFrame(data)
99
 
100
+ with NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
101
  excel_path = tmp.name
102
  df.to_excel(excel_path, index=False)
103
 
 
108
  gr.Markdown("# Chat with your PDF documents")
109
 
110
  with gr.Row():
111
+ file_input = gr.File(label="Upload your PDF documents", file_types=[".pdf"], multiple=True)
112
  update_button = gr.Button("Update Vector Store")
113
 
114
  update_output = gr.Textbox(label="Update Status")