Spaces:

DrishtiSharma
/

docqa-with-deepseek-r1

Build error

File size: 6,698 Bytes

8a82b65

import os
import requests
import streamlit as st
import pickle
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.document_loaders import PDFPlumberLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from prompts import rag_prompt, relevancy_prompt, relevant_context_picker_prompt, response_synth

# Set API Keys
# Export the Groq key from Streamlit secrets only when one is actually
# configured. Writing the "" fallback unconditionally would overwrite a
# GROQ_API_KEY that is already present in the process environment.
groq_api_key = st.secrets.get("GROQ_API_KEY", "")
if groq_api_key:
    os.environ["GROQ_API_KEY"] = groq_api_key

# Load LLM models
# Two separate Groq chat models are kept: one bound to `llm_judge`, one to
# `rag_llm`. Both have verbose output switched on.
llm_judge = ChatGroq(model="deepseek-r1-distill-llama-70b")
rag_llm = ChatGroq(model="mixtral-8x7b-32768")

llm_judge.verbose = True
rag_llm.verbose = True

# On-disk locations for the persisted Chroma collection and the pickled chunks.
VECTOR_DB_PATH = "/tmp/chroma_db"
CHUNKS_FILE = "/tmp/chunks.pkl"

# Session State Initialization
# Seed every pipeline key with its default on the first run of a session;
# keys that already exist (from an earlier rerun) are left untouched.
_SESSION_DEFAULTS = {
    "vector_store": None,
    "documents": None,
    "pdf_path": None,
    "pdf_loaded": False,
    "chunked": False,
    "vector_created": False,
}
for state_key, default_value in _SESSION_DEFAULTS.items():
    if state_key not in st.session_state:
        st.session_state[state_key] = default_value

st.title("Blah-2")

# Step 1: Choose PDF Source
pdf_source = st.radio("Upload or provide a link to a PDF:", ["Upload a PDF file", "Enter a PDF URL"], index=0, horizontal=True)

# Streamlit re-executes this script on every widget interaction. To keep the
# later "only once" steps meaningful, the identity of the PDF currently written
# to disk is remembered under `pdf_source_id`, and the file is rewritten (and
# the pipeline flags reset) only when the user supplies a different source.
if pdf_source == "Upload a PDF file":
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        upload_id = ("upload", uploaded_file.name, len(uploaded_file.getbuffer()))
        if st.session_state.get("pdf_source_id") != upload_id:
            st.session_state.pdf_path = "temp.pdf"
            with open(st.session_state.pdf_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            st.session_state.pdf_source_id = upload_id
            st.session_state.pdf_loaded = False
            st.session_state.chunked = False
            st.session_state.vector_created = False

elif pdf_source == "Enter a PDF URL":
    pdf_url = st.text_input("Enter PDF URL:")
    url_id = ("url", pdf_url)
    # Compare against the remembered source rather than `pdf_path`, so a new
    # URL is honoured even after another PDF has already been processed.
    if pdf_url and st.session_state.get("pdf_source_id") != url_id:
        with st.spinner("Downloading PDF..."):
            try:
                # A timeout keeps an unresponsive host from hanging the app.
                response = requests.get(pdf_url, timeout=30)
                if response.status_code == 200:
                    st.session_state.pdf_path = "temp.pdf"
                    with open(st.session_state.pdf_path, "wb") as f:
                        f.write(response.content)
                    st.session_state.pdf_source_id = url_id
                    st.session_state.pdf_loaded = False
                    st.session_state.chunked = False
                    st.session_state.vector_created = False
                    st.success("✅ PDF Downloaded Successfully!")
                else:
                    st.error("❌ Failed to download PDF. Check the URL.")
            except Exception as e:
                # UI boundary: report any download/write failure instead of crashing.
                st.error(f"❌ Error downloading PDF: {e}")

# Step 2: Load & Process PDF (Only Once)
# Runs while a PDF path is known but its pages have not yet been stored in
# session state; a failure is reported in the UI and leaves the flag unset.
pdf_pending = st.session_state.pdf_path and not st.session_state.pdf_loaded
if pdf_pending:
    with st.spinner("Loading PDF..."):
        try:
            pages = PDFPlumberLoader(st.session_state.pdf_path).load()
            st.session_state.documents = pages
            st.session_state.pdf_loaded = True
            st.success(f"✅ **PDF Loaded!** Total Pages: {len(pages)}")
        except Exception as e:
            st.error(f"❌ Error processing PDF: {e}")

# Load Cached Chunks if Available
def load_chunks():
    """Return the object pickled at CHUNKS_FILE, or None if it is unusable.

    Returns:
        The unpickled cache contents, or None when the cache file does not
        exist or cannot be unpickled (for example a truncated or corrupt
        file), so the caller can fall back to re-chunking.
    """
    # NOTE(security): pickle.load can execute code embedded in the file, so
    # CHUNKS_FILE must only ever be written by this application.
    try:
        with open(CHUNKS_FILE, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        # No cache yet; opening directly avoids an exists-then-open race.
        return None
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError, IndexError):
        # Unreadable cache: treat it as missing instead of crashing the app.
        return None

# Restore previously pickled chunks, but only while this session has not
# chunked anything yet, so the cache is consulted at most until it succeeds.
# NOTE(review): the cache file is a single fixed path and is not tied to a
# particular PDF, so chunks cached for an earlier document would be restored
# here as well — confirm this is intended.
restored_chunks = None if st.session_state.chunked else load_chunks()
if restored_chunks:
    st.session_state.documents = restored_chunks
    st.session_state.chunked = True

# Step 3: Chunking (Only Happens Once)
# Splits the loaded pages with a SemanticChunker driven by a CPU-hosted
# HuggingFace embedding model, stores the chunks in session state, and pickles
# them to CHUNKS_FILE. Any failure is shown in the UI.
ready_to_chunk = st.session_state.pdf_loaded and not st.session_state.chunked
if ready_to_chunk:
    with st.spinner("Chunking the document..."):
        try:
            embedder = HuggingFaceEmbeddings(
                model_name="nomic-ai/modernbert-embed-base",
                model_kwargs={"device": "cpu"},
            )
            splitter = SemanticChunker(embedder)

            loaded_pages = st.session_state.documents
            if loaded_pages:
                chunks = splitter.split_documents(loaded_pages)
                st.session_state.documents = chunks
                st.session_state.chunked = True

                # Save chunks for persistence
                with open(CHUNKS_FILE, "wb") as cache_file:
                    pickle.dump(chunks, cache_file)

                st.success(f"✅ **Document Chunked!** Total Chunks: {len(chunks)}")
        except Exception as e:
            st.error(f"❌ Error chunking document: {e}")

# Step 4: Setup Vectorstore
def load_vector_store():
    """Build the Chroma store for the "deepseek_collection" collection.

    The store is pointed at VECTOR_DB_PATH as its persist directory and uses a
    HuggingFace embedding model ("nomic-ai/modernbert-embed-base") as its
    embedding function.
    """
    embedder = HuggingFaceEmbeddings(model_name="nomic-ai/modernbert-embed-base")
    return Chroma(
        persist_directory=VECTOR_DB_PATH,
        collection_name="deepseek_collection",
        embedding_function=embedder,
    )

# Populate the vector store once chunks exist and the store has not yet been
# marked as created in this session.
needs_vector_store = st.session_state.chunked and not st.session_state.vector_created
if needs_vector_store:
    with st.spinner("Creating vector store..."):
        try:
            store = st.session_state.vector_store
            if store is None:  # Reuse the store kept in session state when there is one
                store = load_vector_store()
                st.session_state.vector_store = store

            # Only insert into an empty collection, to avoid duplicate entries.
            already_stored = len(store.get()["documents"])
            if already_stored == 0:
                store.add_documents(st.session_state.documents)

            # Count again so the message reflects any documents just added.
            num_documents = len(store.get()["documents"])
            st.session_state.vector_created = True
            st.success(f"✅ **Vector Store Created!** Total documents stored: {num_documents}")
        except Exception as e:
            st.error(f"❌ Error creating vector store: {e}")

# Debugging Logs
# Echo the three pipeline flags so the current stage is visible in the UI.
_status_lines = (
    ("📄 **PDF Loaded:**", st.session_state.pdf_loaded),
    ("🔹 **Chunked:**", st.session_state.chunked),
    ("📂 **Vector Store Created:**", st.session_state.vector_created),
)
for status_label, status_flag in _status_lines:
    st.write(status_label, status_flag)


# ----------------- Query Input -----------------
query = st.text_input("🔍 Ask a question about the document:")
if query and st.session_state.vector_store is None:
    # The vector store stays None until a document has been indexed; calling
    # .as_retriever on it would raise AttributeError. Tell the user and drop
    # the query for this run so the retrieval code below is skipped.
    st.warning("⚠️ Please load a PDF and wait for the vector store to be created before asking a question.")
    query = ""
if query:
    with st.spinner("🔄 Retrieving relevant context..."):
        # Fetch the 5 most similar chunks for the question.
        retriever = st.session_state.vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})
        contexts = retriever.invoke(query)
        # Debugging: Check what was retrieved
        st.write("Retrieved Contexts:", contexts)
        st.write("Number of Contexts:", len(contexts))
        
        # Keep only the text of each retrieved document.
        context = [d.page_content for d in contexts]
        # Debugging: Check extracted context
        st.write("Extracted Context (page_content):", context)
        st.write("Number of Extracted Contexts:", len(context))


        ------