# app.py — Streamlit app: upload a PDF, embed its text into AstraDB (Cassandra)
# with Hugging Face sentence-transformer embeddings, and answer questions about
# it via a Hugging Face Hub LLM through LangChain.
import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
import cassio
from dotenv import load_dotenv
import os
# --- Configuration: credentials come from .env / process environment ---
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Cap on chunks embedded per upload — keeps the free-tier demo fast and cheap.
MAX_CHUNKS = 50

# === Streamlit UI Setup ===
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("📄💬 Query PDF using LangChain + AstraDB (Free Hugging Face Models)")

# Fail fast with a readable message instead of an opaque cassio/HF error later.
if not (ASTRA_DB_APPLICATION_TOKEN and ASTRA_DB_ID and HUGGINGFACEHUB_API_TOKEN):
    st.error(
        "🚨 Missing credentials: set ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID "
        "and HUGGINGFACEHUB_API_TOKEN in your environment or .env file."
    )
    st.stop()

# === File Upload ===
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])

if uploaded_file:
    st.success("✅ PDF uploaded successfully!")
    process_button = st.button("🔄 Process PDF")

    if process_button:
        # Initialize AstraDB (registers a global session/keyspace for cassio).
        cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

        # Read PDF contents; extract_text() may return None for image-only pages.
        pdf_reader = PdfReader(uploaded_file)
        raw_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

        # Split text into overlapping chunks for embedding.
        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
        )
        texts = text_splitter.split_text(raw_text)

        # === Embeddings (free, local sentence-transformers model) ===
        embedding = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # === Hugging Face LLM ===
        llm = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
            model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
        )

        # === Create vector store and index ===
        vector_store = Cassandra(
            embedding=embedding,
            table_name="qa_mini_demo",
            session=None,   # None -> use the global session from cassio.init()
            keyspace=None,  # None -> use cassio's default keyspace
        )
        chunks = texts[:MAX_CHUNKS]
        vector_store.add_texts(chunks)
        st.success(f"📚 {len(chunks)} chunks embedded and stored in AstraDB.")

        # BUGFIX: persist across reruns. Submitting a question below triggers a
        # Streamlit rerun in which process_button is False; without
        # session_state the store/index would be rebuilt-or-lost and the Q&A
        # section could never answer.
        st.session_state["vector_store"] = vector_store
        st.session_state["astra_vector_index"] = VectorStoreIndexWrapper(
            vectorstore=vector_store
        )
        st.session_state["llm"] = llm

# === Ask Questions (available once a PDF has been processed this session) ===
if "astra_vector_index" in st.session_state:
    st.header("🤖 Ask a question about your PDF")
    user_question = st.text_input("💬 Type your question here")

    if user_question:
        with st.spinner("🧠 Thinking..."):
            try:
                # Optional: show what documents are retrieved before sending to LLM
                retrieved_docs = st.session_state["vector_store"].similarity_search(
                    user_question, k=4
                )
                if not retrieved_docs:
                    st.warning(
                        "⚠️ No relevant text chunks found for this question. "
                        "Try a different question."
                    )
                else:
                    st.markdown("### 🔍 Top Relevant Chunks (raw):")
                    for i, doc in enumerate(retrieved_docs, 1):
                        st.code(doc.page_content[:300], language="markdown")

                    answer = st.session_state["astra_vector_index"].query(
                        user_question, llm=st.session_state["llm"]
                    )
                    if answer.strip():
                        st.markdown("### 🧠 Answer:")
                        st.write(answer.strip())
                    else:
                        st.warning(
                            "⚠️ The model returned an empty response. Try "
                            "rephrasing the question or check your model/API key."
                        )
            except Exception as e:
                # Broad catch is deliberate: surface any retrieval/LLM failure
                # in the UI rather than crashing the app.
                st.error(f"🚨 Error while generating response:\n\n{str(e)}")