import streamlit as st
import os
from dotenv import load_dotenv
from transformers import pipeline
from io import BytesIO
from pypdf import PdfReader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# main.py is expected to provide get_index_for_pdf; fall back to an inline
# sketch if it is missing so the app stays runnable on its own.
try:
    from main import get_index_for_pdf
except ImportError:
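    # Fallback sketch (assumption): this mirrors what get_index_for_pdf is
    # believed to do -- extract text per page, attach 'filename' and 'page'
    # metadata, embed the chunks, and build a FAISS store. The real
    # implementation in main.py may differ.
    def get_index_for_pdf(pdf_bytes_list, filenames, model_name):
        docs, metadatas = [], []
        for pdf_bytes, filename in zip(pdf_bytes_list, filenames):
            reader = PdfReader(BytesIO(pdf_bytes))
            for page_number, page in enumerate(reader.pages, start=1):
                text = page.extract_text() or ""
                if text.strip():
                    docs.append(text)
                    metadatas.append({"filename": filename, "page": page_number})
        embeddings = HuggingFaceEmbeddings(model_name=model_name)
        return FAISS.from_texts(docs, embeddings, metadatas=metadatas)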
# Initialize session state for the app
if "vectordb" not in st.session_state:
st.session_state["vectordb"] = None
if "prompt" not in st.session_state:
st.session_state["prompt"] = [{"role": "system", "content": "none"}]
# Set the title for the Streamlit app
st.title("RAG Enhance Chatbot")
# Hugging Face API Key (avoid hardcoding for production)
load_dotenv()
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")
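# The .env file is expected to define this key, e.g. (placeholder value):
#   HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxx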
# Optional sidebar model picker (currently disabled):
# model_name = st.sidebar.selectbox(
# "Choose a Hugging Face Model",
# [
# "sentence-transformers/all-mpnet-base-v2",
# "sentence-transformers/all-MiniLM-L6-v2",
# "msmarco-distilbert-base-tas-b",
# "deepset/roberta-large-squad2",
# "facebook/dpr-ctx_encoder-single-nq-base"
# ],
# index=0 # Default model
# )
# Define the extractive QA pipeline
qa_pipeline = pipeline(
    "question-answering",
    model="deepset/roberta-base-squad2",  # replace with your desired QA model
    token=HUGGINGFACE_API_KEY,  # 'use_auth_token' is deprecated in recent transformers
)
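# The QA pipeline returns a dict with 'answer', 'score', 'start', and 'end';
# only 'answer' is surfaced in the chat below.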
# Define a prompt template for the assistant
prompt_template = """
You are a helpful Assistant who answers users' questions based on PDF extracts.
Keep your answers detailed; if an answer is long, break it into bullet points.
Context information includes 'filename' and 'page'. Always reference these in your responses.
If the text is irrelevant or insufficient to answer, respond with "Not applicable."
The provided PDF content is:
{pdf_extract}
"""
# Cached function to create a vector database for the provided PDF files.
# cache_resource (rather than cache_data) is used because a FAISS store is
# not reliably picklable; the cache is keyed on the uploaded files, their
# names, and the embedding model name.
@st.cache_resource
def create_vectordb(files, filenames, huggingface_model_name):
    # Show a spinner while creating the vector database
    with st.spinner("Creating Vector Database..."):
        vectordb = get_index_for_pdf(
            [file.getvalue() for file in files], filenames, huggingface_model_name
        )
    return vectordb
# Upload PDF files using Streamlit file uploader
pdf_files = st.file_uploader("Upload your PDFs", type="pdf", accept_multiple_files=True)
# If PDF files are uploaded, create the vector database and store it in the session state
if pdf_files:
    pdf_file_names = [file.name for file in pdf_files]
    huggingface_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # embedding model
    st.session_state["vectordb"] = create_vectordb(pdf_files, pdf_file_names, huggingface_model_name)
# Display previous chat messages
for message in st.session_state["prompt"]:
if message["role"] != "system":
with st.chat_message(message["role"]):
st.write(message["content"])
# Get the user's question using Streamlit chat input
question = st.chat_input("Ask anything")
# Handle the user's question
if question:
    vectordb = st.session_state.get("vectordb", None)
    if not vectordb:
        with st.chat_message("assistant"):
            st.write("You need to upload a PDF first.")
        st.stop()

    # Echo the user's question in the chat
    with st.chat_message("user"):
        st.write(question)

    # Search the vector database for content similar to the user's question
    search_results = vectordb.similarity_search(question, k=3)
    pdf_extract = "\n".join(
        [
            f"{result.page_content} (Filename: {result.metadata['filename']}, Page: {result.metadata['page']})"
            for result in search_results
        ]
    )

    # Run the extractive QA pipeline over the retrieved context
    response = qa_pipeline(question=question, context=pdf_extract)

    # Display the assistant's response
    with st.chat_message("assistant"):
        st.write(response["answer"])

    # Record the exchange in the session-state chat history
    st.session_state["prompt"].append({"role": "user", "content": question})
    st.session_state["prompt"].append({"role": "assistant", "content": response["answer"]})