Spaces:

abhinavyadav11
/

RAG_Enhanced_Chatbot

Sleeping

App Files Files Community

abhinavyadav11 commited on Dec 31, 2024

Commit

4fe3a6c

verified ·

1 Parent(s): 5ebd830

Upload 2 files

Browse files

Files changed (2) hide show

app.py +113 -0
main.py +64 -0

app.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import streamlit as st
+import os
+from dotenv import load_dotenv
+from transformers import pipeline
+from io import BytesIO
+from pypdf import PdfReader
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from main import get_index_for_pdf  # Assuming 'main.py' contains this function
+# Initialize session state for the app
+if "vectordb" not in st.session_state:
+    st.session_state["vectordb"] = None
+if "prompt" not in st.session_state:
+    st.session_state["prompt"] = [{"role": "system", "content": "none"}]
+# Set the title for the Streamlit app
+st.title("RAG Enhance Chatbot")
+# Hugging Face API Key (avoid hardcoding for production)
+load_dotenv()
+HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
+# st.title('Model Configuration')
+# model_name = st.sidebar.selectbox(
+#     "Choose a Hugging Face Model",
+#     [
+#         "sentence-transformers/all-mpnet-base-v2",
+#         "sentence-transformers/all-MiniLM-L6-v2",
+#         "msmarco-distilbert-base-tas-b",
+#         "deepset/roberta-large-squad2",
+#         "facebook/dpr-ctx_encoder-single-nq-base"
+#     ],
+#     index=0  # Default model
+# )
+# Define the QA pipeline
+qa_pipeline = pipeline(
+    "question-answering",
+    model="deepset/roberta-base-squad2",  # Replace with your desired model
+    use_auth_token=HUGGINGFACE_API_KEY
+)
+# Define a prompt template for the assistant
+prompt_template = """
+    You are a helpful Assistant who answers users' questions based on PDF extracts.
+    Keep your answer lengthy and if long make points.
+    Context information includes 'filename' and 'page'. Always reference these in your responses.
+    If the text is irrelevant or insufficient to answer, respond with "Not applicable."
+    The provided PDF content is:
+    {pdf_extract}
+"""
+# Cached function to create a vector database for the provided PDF files
+@st.cache_data
+def create_vectordb(files, filenames, huggingface_model_name):
+    # Show a spinner while creating the vector database
+    with st.spinner("Creating Vector Database..."):
+        vectordb = get_index_for_pdf(
+            [file.getvalue() for file in files], filenames, huggingface_model_name
+        )
+    return vectordb
+# Upload PDF files using Streamlit file uploader
+pdf_files = st.file_uploader("Upload your PDFs", type="pdf", accept_multiple_files=True)
+# If PDF files are uploaded, create the vector database and store it in the session state
+if pdf_files:
+    pdf_file_names = [file.name for file in pdf_files]
+    huggingface_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Correct model name
+    st.session_state["vectordb"] = create_vectordb(pdf_files, pdf_file_names, huggingface_model_name)
+# Display previous chat messages
+for message in st.session_state["prompt"]:
+    if message["role"] != "system":
+        with st.chat_message(message["role"]):
+            st.write(message["content"])
+# Get the user's question using Streamlit chat input
+question = st.chat_input("Ask anything")
+# Handle the user's question
+if question:
+    vectordb = st.session_state.get("vectordb", None)
+    if not vectordb:
+        with st.chat_message("assistant"):
+            st.write("You need to upload a PDF first.")
+            st.stop()
+    # Search the vector database for similar content to the user's question
+    search_results = vectordb.similarity_search(question, k=3)
+    pdf_extract = "\n".join(
+        [
+            f"{result.page_content} (Filename: {result.metadata['filename']}, Page: {result.metadata['page']})"
+            for result in search_results
+        ]
+    )
+    # Use the QA pipeline with the context
+    response = qa_pipeline(question=question, context=pdf_extract)
+    # Update the assistant's response
+    with st.chat_message("assistant"):
+        st.write(response["answer"])
+    # Update the session state prompt
+    st.session_state["prompt"].append({"role": "user", "content": question})
+    st.session_state["prompt"].append({"role": "assistant", "content": response["answer"]})

main.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import re
+from io import BytesIO
+from typing import Tuple, List
+import pickle
+from langchain.docstore.document import Document
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores.faiss import FAISS
+from pypdf import PdfReader
+import faiss
+def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
+    pdf = PdfReader(file)
+    output = []
+    for page in pdf.pages:
+        text = page.extract_text()
+        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
+        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
+        text = re.sub(r"\n\s*\n", "\n\n", text)
+        output.append(text)
+    return output, filename
+def text_to_docs(text: List[str], filename: str) -> List[Document]:
+    if isinstance(text, str):
+        text = [text]
+    page_docs = [Document(page_content=page) for page in text]
+    for i, doc in enumerate(page_docs):
+        doc.metadata["page"] = i + 1
+    doc_chunks = []
+    for doc in page_docs:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=4000,
+            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
+            chunk_overlap=0,
+        )
+        chunks = text_splitter.split_text(doc.page_content)
+        for i, chunk in enumerate(chunks):
+            doc = Document(
+                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
+            )
+            doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
+            doc.metadata["filename"] = filename  # Add filename to metadata
+            doc_chunks.append(doc)
+    return doc_chunks
+def docs_to_index(docs, huggingface_model_name):
+    # Using Hugging Face embeddings
+    embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+    index = FAISS.from_documents(docs, embedding_model)
+    return index
+def get_index_for_pdf(pdf_files, pdf_names, huggingface_model_name):
+    documents = []
+    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
+        text, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
+        documents = documents + text_to_docs(text, filename)
+    index = docs_to_index(documents, huggingface_model_name)
+    return index