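# app.py
# Hugging Face Hub page header captured along with the file (not part of the
# program): "kpawargi's picture" / "Update app.py" / commit 3904554 (verified)
# / raw / history / blame / 3.15 kB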
import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
import cassio
from dotenv import load_dotenv
import os
# Credentials are taken from the process environment. load_dotenv() runs first
# (presumably merging a local .env file into the environment -- see the
# python-dotenv docs); any variable that is not set ends up as None.
load_dotenv()
(
    ASTRA_DB_APPLICATION_TOKEN,
    ASTRA_DB_ID,
    HUGGINGFACEHUB_API_TOKEN,
) = (
    os.environ.get(variable)
    for variable in (
        "ASTRA_DB_APPLICATION_TOKEN",
        "ASTRA_DB_ID",
        "HUGGINGFACEHUB_API_TOKEN",
    )
)
# === Streamlit UI Setup ===
# Configure the browser tab title and a wide layout, then render the page
# heading. The emoji in the heading were mis-encoded (mojibake) and are
# restored here to the intended characters.
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("📄💬 Query PDF using LangChain + AstraDB (Free Hugging Face Models)")
# === File Upload / Processing / Question Answering ===
# Streamlit re-executes this script on every widget interaction, and
# st.button() is True only during the run triggered by the click itself.
# The vector store and LLM built while processing are therefore kept in
# st.session_state, so the question box is still rendered (and can be
# answered) on the rerun caused by typing a question.

MAX_CHUNKS = 50  # upper bound on how many chunks are embedded per processed PDF


def _extract_pdf_text(pdf_file):
    """Return the extracted text of every page of *pdf_file* as one string.

    Pages for which ``extract_text()`` returns nothing are skipped.
    """
    extracted = (page.extract_text() for page in PdfReader(pdf_file).pages)
    return "".join(content for content in extracted if content)


def _split_into_chunks(raw_text):
    """Split *raw_text* on newlines into chunks of up to 800 characters
    with a 200-character overlap, returning the list of chunk strings."""
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
    )
    return text_splitter.split_text(raw_text)


def _index_chunks(texts):
    """Store *texts* in the ``qa_mini_demo`` table and return ``(vector_store, llm)``.

    Initializes the AstraDB connection, builds the embedding model and the
    Hugging Face Hub LLM, then adds the given chunks to the vector store.
    """
    # Initialize AstraDB
    cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

    # === Embeddings ===
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    # === Hugging Face LLM ===
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
    )

    # === Create vector store ===
    # session/keyspace are left as None; presumably the connection set up by
    # cassio.init() above is used instead -- confirm against the cassio docs.
    vector_store = Cassandra(
        embedding=embedding,
        table_name="qa_mini_demo",
        session=None,
        keyspace=None,
    )
    vector_store.add_texts(texts)
    return vector_store, llm


uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
if uploaded_file:
    st.success("✅ PDF uploaded successfully!")
    process_button = st.button("🔄 Process PDF")

    if process_button:
        chunks = _split_into_chunks(_extract_pdf_text(uploaded_file))[:MAX_CHUNKS]
        if not chunks:
            # e.g. a scanned/image-only PDF: nothing to embed, so do not
            # touch the database or overwrite a previously built index.
            st.error("No extractable text was found in this PDF.")
        else:
            vector_store, llm = _index_chunks(chunks)
            # Persist across reruns; see the note at the top of this section.
            st.session_state["vector_store"] = vector_store
            st.session_state["llm"] = llm
            st.success(f"📚 {len(chunks)} chunks embedded and stored in AstraDB.")

    # === Ask Questions ===
    # Rendered whenever a processed index exists for this session, not only
    # on the single run in which the "Process PDF" button was clicked.
    if "vector_store" in st.session_state:
        vector_store = st.session_state["vector_store"]
        llm = st.session_state["llm"]
        astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)

        st.header("🤖 Ask a question about your PDF")
        user_question = st.text_input("💬 Type your question here")
        if user_question:
            with st.spinner("Thinking..."):
                answer = astra_vector_index.query(user_question, llm=llm).strip()
            st.markdown(f"### 🧠 Answer:\n{answer}")

            st.markdown("### 🔍 Top Relevant Chunks")
            docs = vector_store.similarity_search_with_score(user_question, k=4)
            for i, (doc, score) in enumerate(docs, 1):
                st.markdown(f"**Chunk {i}** — Relevance Score: `{score:.4f}`")
                # Show only the first 500 characters of each chunk.
                st.code(doc.page_content[:500], language="markdown")