Spaces:
Sleeping
Sleeping
File size: 3,567 Bytes
3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 3904554 2bc5b24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
import cassio
import os
import uuid
from dotenv import load_dotenv
# Load secrets from the environment (Hugging Face Spaces uses HF Secrets);
# load_dotenv() additionally picks up a .env file when one is present.
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Streamlit UI setup. set_page_config has to be the first Streamlit command
# of the script, so it runs before any message the checks below may emit.
st.set_page_config(page_title="Query PDF with LangChain", layout="wide")
st.title("ππ¬ Query PDF using LangChain + AstraDB (Hugging Face Models)")

# Fail early with a readable message instead of letting a missing secret
# surface later as an opaque error from the database or model client.
_missing_secrets = [
    secret_name
    for secret_name, secret_value in (
        ("ASTRA_DB_APPLICATION_TOKEN", ASTRA_DB_APPLICATION_TOKEN),
        ("ASTRA_DB_ID", ASTRA_DB_ID),
        ("HUGGINGFACEHUB_API_TOKEN", HUGGINGFACEHUB_API_TOKEN),
    )
    if not secret_value
]
if _missing_secrets:
    st.error("Missing required secrets: " + ", ".join(_missing_secrets))
    st.stop()

# Initialize the AstraDB connection used by the Cassandra vector store.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

# PDF upload widget; the value is None until the user provides a file.
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
# Upper bound on the number of chunks embedded per PDF, so that embedding
# time and database writes stay bounded for large documents.
MAX_CHUNKS = 50

# st.session_state key holding the resources built for the processed PDF.
_QA_STATE_KEY = "pdf_qa_state"


def _extract_pdf_text(pdf_reader):
    """Return the concatenated text of all pages, skipping pages without text."""
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)


def _build_qa_state(pdf_file):
    """Embed the text of *pdf_file* into a fresh table and return QA resources.

    Returns a dict with the keys ``llm``, ``vector_store``, ``index``,
    ``stored`` (chunks written) and ``total`` (chunks produced by the
    splitter), or ``None`` when no text could be extracted from the PDF.
    """
    raw_text = _extract_pdf_text(PdfReader(pdf_file))

    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
    )
    texts = text_splitter.split_text(raw_text)
    if not texts:
        # Nothing to embed (e.g. image-only PDF); do not create a table.
        return None
    chunks = texts[:MAX_CHUNKS]

    embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
    )

    # A unique table per processed PDF keeps documents from mixing.
    table_name = "qa_" + str(uuid.uuid4()).replace("-", "_")
    vector_store = Cassandra(
        embedding=embedding,
        table_name=table_name,
        session=None,
        keyspace=None,
    )
    vector_store.add_texts(chunks)

    return {
        "llm": llm,
        "vector_store": vector_store,
        "index": VectorStoreIndexWrapper(vectorstore=vector_store),
        "stored": len(chunks),
        "total": len(texts),
    }


if uploaded_file:
    st.success("✅ PDF uploaded successfully.")
    # Identifies the current upload so answers never come from a stale PDF.
    file_key = (uploaded_file.name, uploaded_file.size)

    if st.button("π Process PDF"):
        qa_state = _build_qa_state(uploaded_file)
        if qa_state is None:
            st.session_state.pop(_QA_STATE_KEY, None)
            st.warning("No extractable text was found in this PDF.")
        else:
            qa_state["file_key"] = file_key
            # st.button is True only on the run triggered by the click, so
            # everything the question step needs must outlive this run.
            st.session_state[_QA_STATE_KEY] = qa_state
            stored_count = qa_state["stored"]
            total_count = qa_state["total"]
            st.success(f"π {stored_count} chunks embedded and stored in AstraDB.")
            if total_count > stored_count:
                st.info(
                    f"Only the first {stored_count} of {total_count} chunks "
                    "were embedded; later parts of the PDF are not searchable."
                )

    qa_state = st.session_state.get(_QA_STATE_KEY)
    if qa_state is not None and qa_state["file_key"] == file_key:
        st.header("π€ Ask a question about your PDF")
        user_question = st.text_input("π¬ Type your question here")
        if user_question:
            with st.spinner("π§ Thinking..."):
                # UI boundary: report any backend failure to the user
                # instead of crashing the app.
                try:
                    # Retrieval here only guards against an empty result;
                    # the index performs its own retrieval when queried.
                    retrieved_docs = qa_state["vector_store"].similarity_search(
                        user_question, k=8
                    )
                    if not retrieved_docs:
                        st.warning("β οΈ No relevant text found. Try rephrasing your question.")
                    else:
                        answer = qa_state["index"].query(
                            user_question, llm=qa_state["llm"]
                        ).strip()
                        if answer:
                            st.markdown("### π§ Answer:")
                            st.write(answer)
                        else:
                            st.warning("β οΈ Model returned an empty response.")
                except Exception as e:
                    st.error(f"π¨ Error: {e}")
|