Spaces:
Sleeping
Sleeping
import streamlit as st | |
from PyPDF2 import PdfReader | |
from langchain.vectorstores.cassandra import Cassandra | |
from langchain.indexes.vectorstore import VectorStoreIndexWrapper | |
from langchain.embeddings import HuggingFaceEmbeddings | |
from langchain.llms import HuggingFaceHub | |
from langchain.text_splitter import CharacterTextSplitter | |
import cassio | |
import os | |
import uuid | |
from dotenv import load_dotenv | |
# Load secrets from the environment. load_dotenv() reads a local .env file when
# one exists; on a hosted deployment the same variables are expected to be
# provided by the platform's secret store.
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Streamlit UI setup. The page configuration is kept as the very first
# Streamlit call so that the error message below can be rendered safely.
st.set_page_config(page_title="Query PDF with LangChain", layout="wide")
st.title("ππ¬ Query PDF using LangChain + AstraDB (Hugging Face Models)")

# Fail fast with a readable message instead of handing None credentials to the
# database / LLM clients and surfacing an opaque error much later.
_missing_secrets = [
    name
    for name, value in (
        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
        ("ASTRA_DB_ID", ASTRA_DB_ID),
        ("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACEHUB_API_TOKEN),
    )
    if not value
]
if _missing_secrets:
    st.error(
        "Missing required environment variable(s): "
        + ", ".join(_missing_secrets)
    )
    st.stop()

# Initialize the AstraDB connection used by the Cassandra vector store.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)
# Keys under which the indexed PDF is kept in st.session_state. Streamlit
# re-executes this script on every widget interaction and st.button() is only
# True for the single run triggered by the click, so anything built inside the
# "Process" branch must be persisted to still exist when the user types a
# question afterwards.
_STORE_KEY = "pdf_vector_store"
_LLM_KEY = "pdf_llm"
_FILE_KEY = "pdf_indexed_file"

# Upper bound on the number of chunks embedded per PDF (same limit as before,
# now named and reported to the user when it actually truncates the document).
MAX_CHUNKS = 50

# PDF upload
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("β PDF uploaded successfully.")

    # Identifies the current upload so a stale index from a previously
    # processed PDF is never queried for a different file.
    current_file = (uploaded_file.name, uploaded_file.size)

    process_button = st.button("π Process PDF")
    if process_button:
        # Read the PDF; pages without extractable text contribute nothing.
        pdf_reader = PdfReader(uploaded_file)
        raw_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

        # Split into overlapping chunks.
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=800,
            chunk_overlap=200,
            length_function=len,
        )
        texts = text_splitter.split_text(raw_text)

        if not texts:
            # e.g. a scanned/image-only PDF: nothing to embed, so do not
            # create an empty table or offer the question box.
            st.warning("No extractable text was found in this PDF.")
        else:
            # Embeddings
            embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")

            # LLM
            llm = HuggingFaceHub(
                repo_id="mistralai/Mistral-7B-Instruct-v0.1",
                huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
                model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
            )

            # Unique table name for each processed PDF.
            table_name = "qa_" + str(uuid.uuid4()).replace("-", "_")

            # Vector store; session/keyspace of None defer to the globals
            # configured through cassio.init().
            vector_store = Cassandra(
                embedding=embedding,
                table_name=table_name,
                session=None,
                keyspace=None,
            )

            indexed_texts = texts[:MAX_CHUNKS]
            vector_store.add_texts(indexed_texts)
            st.success(f"π {len(indexed_texts)} chunks embedded and stored in AstraDB.")
            if len(texts) > MAX_CHUNKS:
                st.info(
                    f"Only the first {MAX_CHUNKS} of {len(texts)} chunks were "
                    "indexed; later parts of the document are not searchable."
                )

            # Persist for the reruns triggered by the question input.
            st.session_state[_STORE_KEY] = vector_store
            st.session_state[_LLM_KEY] = llm
            st.session_state[_FILE_KEY] = current_file

    # Ask questions: available on every rerun once this upload was processed.
    if st.session_state.get(_FILE_KEY) == current_file:
        vector_store = st.session_state[_STORE_KEY]
        llm = st.session_state[_LLM_KEY]
        astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)

        st.header("π€ Ask a question about your PDF")
        user_question = st.text_input("π¬ Type your question here")

        if user_question:
            with st.spinner("π§ Thinking..."):
                # Top-level UI boundary: any backend failure is shown to the
                # user instead of crashing the app.
                try:
                    # Used only as a guard for "nothing relevant stored";
                    # the index wrapper performs its own retrieval below.
                    retrieved_docs = vector_store.similarity_search(
                        user_question, k=8
                    )
                    if not retrieved_docs:
                        st.warning("β οΈ No relevant text found. Try rephrasing your question.")
                    else:
                        answer = astra_vector_index.query(
                            user_question, llm=llm
                        ).strip()
                        if answer:
                            st.markdown("### π§ Answer:")
                            st.write(answer)
                        else:
                            st.warning("β οΈ Model returned an empty response.")
                except Exception as e:
                    st.error(f"π¨ Error: {e}")