# app.py — Streamlit app: upload a PDF, embed its text into AstraDB (Cassandra)
# with Hugging Face sentence-transformer embeddings, and answer questions about
# it via a Hugging Face Hub LLM through LangChain.
import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
import cassio
from dotenv import load_dotenv
import os
# --- Configuration: credentials come from .env / process environment ---
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Cap on chunks embedded per upload — keeps the free-tier demo fast and cheap.
MAX_CHUNKS = 50

# === Streamlit UI Setup ===
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("📄💬 Query PDF using LangChain + AstraDB (Free Hugging Face Models)")

# Fail fast with a readable message instead of an opaque cassio/HF error later.
if not (ASTRA_DB_APPLICATION_TOKEN and ASTRA_DB_ID and HUGGINGFACEHUB_API_TOKEN):
    st.error(
        "🚨 Missing credentials: set ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID "
        "and HUGGINGFACEHUB_API_TOKEN in your environment or .env file."
    )
    st.stop()

# === File Upload ===
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("✅ PDF uploaded successfully!")
    process_button = st.button("🔄 Process PDF")

    if process_button:
        # Initialize AstraDB (registers a global session/keyspace for cassio).
        cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

        # Read PDF contents; extract_text() may return None for image-only pages.
        pdf_reader = PdfReader(uploaded_file)
        raw_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        # Split text into overlapping chunks for embedding.
        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
        )
        texts = text_splitter.split_text(raw_text)

        # === Embeddings (free, local sentence-transformers model) ===
        embedding = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # === Hugging Face LLM ===
        llm = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
            model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
        )

        # === Create vector store and index ===
        vector_store = Cassandra(
            embedding=embedding,
            table_name="qa_mini_demo",
            session=None,   # None -> use the global session from cassio.init()
            keyspace=None,  # None -> use cassio's default keyspace
        )
        chunks = texts[:MAX_CHUNKS]
        vector_store.add_texts(chunks)
        st.success(f"📚 {len(chunks)} chunks embedded and stored in AstraDB.")

        # BUGFIX: persist across reruns. Submitting a question below triggers a
        # Streamlit rerun in which process_button is False; without
        # session_state the store/index would be rebuilt-or-lost and the Q&A
        # section could never answer.
        st.session_state["vector_store"] = vector_store
        st.session_state["astra_vector_index"] = VectorStoreIndexWrapper(
            vectorstore=vector_store
        )
        st.session_state["llm"] = llm

# === Ask Questions (available once a PDF has been processed this session) ===
if "astra_vector_index" in st.session_state:
    st.header("🤖 Ask a question about your PDF")
    user_question = st.text_input("💬 Type your question here")

    if user_question:
        with st.spinner("🧠 Thinking..."):
            try:
                # Optional: show what documents are retrieved before sending to LLM
                retrieved_docs = st.session_state["vector_store"].similarity_search(
                    user_question, k=4
                )
                if not retrieved_docs:
                    st.warning(
                        "⚠️ No relevant text chunks found for this question. "
                        "Try a different question."
                    )
                else:
                    st.markdown("### 🔍 Top Relevant Chunks (raw):")
                    for i, doc in enumerate(retrieved_docs, 1):
                        st.code(doc.page_content[:300], language="markdown")

                    answer = st.session_state["astra_vector_index"].query(
                        user_question, llm=st.session_state["llm"]
                    )
                    if answer.strip():
                        st.markdown("### 🧠 Answer:")
                        st.write(answer.strip())
                    else:
                        st.warning(
                            "⚠️ The model returned an empty response. Try "
                            "rephrasing the question or check your model/API key."
                        )
            except Exception as e:
                # Broad catch is deliberate: surface any retrieval/LLM failure
                # in the UI rather than crashing the app.
                st.error(f"🚨 Error while generating response:\n\n{str(e)}")