"""Streamlit app for asking questions about an uploaded PDF.

The uploaded PDF is read with PyPDF2, split into overlapping chunks, embedded
with a Hugging Face sentence-transformer model and stored in an AstraDB
(Cassandra) vector table. Questions are answered by a Hugging Face Hub LLM
through a LangChain ``VectorStoreIndexWrapper``.

Streamlit re-executes this script on every widget interaction, so everything
built while processing the PDF is kept in ``st.session_state``; otherwise it
would be gone on the rerun triggered by typing a question.
"""

import os

import cassio
import streamlit as st
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.cassandra import Cassandra
from PyPDF2 import PdfReader

load_dotenv()

ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Only the first MAX_CHUNKS chunks of a PDF are embedded and stored.
MAX_CHUNKS = 50

# st.session_state key holding the (vector_store, index, llm) tuple built by
# the most recent successful "Process PDF" click.
_PIPELINE_KEY = "qa_pipeline"


def _missing_credentials():
    """Return the names of required environment variables that are unset or empty."""
    required = {
        "ASTRA_DB_APPLICATION_TOKEN": ASTRA_DB_APPLICATION_TOKEN,
        "ASTRA_DB_ID": ASTRA_DB_ID,
        "HUGGINGFACEHUB_API_TOKEN": HUGGINGFACEHUB_API_TOKEN,
    }
    return [name for name, value in required.items() if not value]


def _extract_pdf_text(pdf_file):
    """Return the concatenated text of every page of *pdf_file*.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string, so the result may be ``""``.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)


def _split_into_chunks(raw_text):
    """Split *raw_text* on newlines into chunks of up to 800 chars, overlapping by 200."""
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,
        chunk_overlap=200,
        length_function=len,
    )
    return text_splitter.split_text(raw_text)


def _build_qa_pipeline(chunks):
    """Store *chunks* in AstraDB and return ``(vector_store, index, llm)``.

    Initializes the cassio connection, creates the embedding model and the
    Hugging Face Hub LLM, and adds *chunks* to the ``qa_mini_demo`` table.
    """
    # Initialize AstraDB
    cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

    # === Embeddings ===
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # === Hugging Face LLM ===
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
    )

    # === Create vector store and index ===
    # session/keyspace are None so the values set up by cassio.init are used.
    vector_store = Cassandra(
        embedding=embedding,
        table_name="qa_mini_demo",
        session=None,
        keyspace=None,
    )
    # NOTE(review): the table name is fixed and texts are only ever added, so
    # chunks from earlier runs presumably remain retrievable -- confirm
    # whether the table should be cleared before adding a new document.
    vector_store.add_texts(chunks)

    astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)
    return vector_store, astra_vector_index, llm


def _process_pdf(pdf_file):
    """Index *pdf_file* and remember the resulting pipeline in the session.

    Reports missing credentials or a PDF without extractable text in the UI
    and returns without touching any previously stored pipeline.
    """
    missing = _missing_credentials()
    if missing:
        st.error(
            "🚨 Missing required environment variable(s): " + ", ".join(missing)
        )
        return

    texts = _split_into_chunks(_extract_pdf_text(pdf_file))
    if not texts:
        st.warning("⚠️ No extractable text was found in this PDF.")
        return

    chunks = texts[:MAX_CHUNKS]
    st.session_state[_PIPELINE_KEY] = _build_qa_pipeline(chunks)
    st.success(f"📚 {len(chunks)} chunks embedded and stored in AstraDB.")
    if len(texts) > len(chunks):
        st.info(
            f"Only the first {MAX_CHUNKS} of {len(texts)} chunks were stored."
        )


def _render_question_section(vector_store, astra_vector_index, llm):
    """Render the question box and, when a question is entered, the answer."""
    st.header("🤖 Ask a question about your PDF")
    user_question = st.text_input("💬 Type your question here")
    if not user_question:
        return

    with st.spinner("🧠 Thinking..."):
        # Broad catch on purpose: this is the UI boundary, and any backend
        # failure (database, embedding model, LLM API) should surface as a
        # message in the page rather than a traceback.
        try:
            # Show what documents are retrieved before sending to the LLM.
            retrieved_docs = vector_store.similarity_search(user_question, k=4)
            if not retrieved_docs:
                st.warning("⚠️ No relevant text chunks found for this question. Try a different question.")
                return

            st.markdown("### 🔍 Top Relevant Chunks (raw):")
            for doc in retrieved_docs:
                st.code(doc.page_content[:300], language="markdown")

            answer = astra_vector_index.query(user_question, llm=llm).strip()
            if answer:
                st.markdown("### 🧠 Answer:")
                st.write(answer)
            else:
                st.warning("⚠️ The model returned an empty response. Try rephrasing the question or check your model/API key.")
        except Exception as e:
            st.error(f"🚨 Error while generating response:\n\n{e}")


# === Streamlit UI Setup ===
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("📄💬 Query PDF using LangChain + AstraDB (Free Hugging Face Models)")

# === File Upload ===
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("✅ PDF uploaded successfully!")

    # st.button is True only on the rerun caused by the click itself, so the
    # pipeline it builds is stored in st.session_state rather than in locals.
    if st.button("🔄 Process PDF"):
        _process_pdf(uploaded_file)

    # === Ask Questions ===
    # Rendered from session state so it survives the rerun that happens when
    # the user submits a question.
    if _PIPELINE_KEY in st.session_state:
        _render_question_section(*st.session_state[_PIPELINE_KEY])