Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from PyPDF2 import PdfReader | |
| from langchain.vectorstores.cassandra import Cassandra | |
| from langchain.indexes.vectorstore import VectorStoreIndexWrapper | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.llms import HuggingFaceHub | |
| from langchain.text_splitter import CharacterTextSplitter | |
| import cassio | |
| import os | |
| import uuid | |
| from dotenv import load_dotenv | |
# Read credentials from the environment. load_dotenv() is called first so
# values from a .env file are visible to os.getenv (on Hugging Face Spaces
# the values are expected to come from the Space's secrets).
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN, ASTRA_DB_ID, HUGGINGFACEHUB_API_TOKEN = (
    os.getenv(variable_name)
    for variable_name in (
        "ASTRA_DB_APPLICATION_TOKEN",
        "ASTRA_DB_ID",
        "HUGGINGFACEHUB_API_TOKEN",
    )
)

# Initialize the AstraDB connection used by the Cassandra vector store.
cassio.init(database_id=ASTRA_DB_ID, token=ASTRA_DB_APPLICATION_TOKEN)
# Streamlit UI setup
st.set_page_config(page_title="Query PDF with LangChain", layout="wide")
st.title("📄💬 Query PDF using LangChain + AstraDB (Hugging Face Models)")

# Only the first MAX_CHUNKS chunks of a PDF are embedded and stored.
MAX_CHUNKS = 50
# st.session_state key under which the objects built for the most recently
# processed PDF are kept, so they survive Streamlit's script reruns.
_QA_STATE_KEY = "qa_state"


def _extract_pdf_text(pdf_file):
    """Return the text of every page of *pdf_file*, concatenated in page order.

    Pages for which ``extract_text()`` returns an empty/None result
    contribute nothing to the output.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)


def _build_qa_state(pdf_file):
    """Chunk, embed and store *pdf_file*, returning what is needed to query it.

    Returns:
        ``None`` when the PDF yields no text chunks. Otherwise a dict with:
        ``vector_store`` (the Cassandra vector store), ``index`` (a
        ``VectorStoreIndexWrapper`` around it), ``llm`` (the HuggingFaceHub
        model), ``stored_chunks`` (number of chunks written) and
        ``total_chunks`` (number of chunks the splitter produced).

    Raises:
        Whatever the PDF reader, embedding model, LLM client or vector store
        raise; the caller reports these to the user.
    """
    raw_text = _extract_pdf_text(pdf_file)

    # Split into chunks
    text_splitter = CharacterTextSplitter(
        separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
    )
    all_chunks = text_splitter.split_text(raw_text)
    chunks = all_chunks[:MAX_CHUNKS]
    if not chunks:
        return None

    # Embeddings
    embedding = HuggingFaceEmbeddings(model_name="intfloat/e5-base-v2")

    # LLM
    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
        model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
    )

    # Unique table name for each processed PDF
    table_name = "qa_" + str(uuid.uuid4()).replace("-", "_")

    # Vector store setup (session/keyspace are left as None, as before)
    vector_store = Cassandra(
        embedding=embedding,
        table_name=table_name,
        session=None,
        keyspace=None,
    )
    vector_store.add_texts(chunks)

    return {
        "vector_store": vector_store,
        "index": VectorStoreIndexWrapper(vectorstore=vector_store),
        "llm": llm,
        "stored_chunks": len(chunks),
        "total_chunks": len(all_chunks),
    }


# PDF upload
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
if uploaded_file:
    st.success("✅ PDF uploaded successfully.")
    # Identifies which upload the stored index belongs to, so an index built
    # for a previous file is not used to answer questions about a new one.
    pdf_key = (uploaded_file.name, uploaded_file.size)

    process_button = st.button("🚀 Process PDF")
    if process_button:
        # Drop any earlier index first so a failed run cannot leave a stale one.
        st.session_state.pop(_QA_STATE_KEY, None)
        try:
            qa_state = _build_qa_state(uploaded_file)
        except Exception as e:  # top-level UI boundary: report, don't crash
            st.error(f"🚨 Error: {e}")
        else:
            if qa_state is None:
                st.warning("⚠️ No extractable text found in this PDF.")
            else:
                qa_state["pdf_key"] = pdf_key
                st.session_state[_QA_STATE_KEY] = qa_state
                st.success(
                    f"🎉 {qa_state['stored_chunks']} chunks embedded and stored in AstraDB."
                )
                if qa_state["total_chunks"] > qa_state["stored_chunks"]:
                    st.info(
                        f"Only the first {MAX_CHUNKS} of "
                        f"{qa_state['total_chunks']} chunks were indexed."
                    )

    # st.button() is True only on the run triggered by the click; entering a
    # question reruns the script with it False. The index is therefore read
    # back from session state instead of relying on the button branch.
    qa_state = st.session_state.get(_QA_STATE_KEY)
    if qa_state and qa_state["pdf_key"] == pdf_key:
        vector_store = qa_state["vector_store"]
        astra_vector_index = qa_state["index"]
        llm = qa_state["llm"]

        # Ask questions
        st.header("🤖 Ask a question about your PDF")
        user_question = st.text_input("💬 Type your question here")
        if user_question:
            with st.spinner("🧠 Thinking..."):
                try:
                    # Retrieve relevant context (used only as a guard, not displayed)
                    retrieved_docs = vector_store.similarity_search(user_question, k=8)
                    if not retrieved_docs:
                        st.warning("⚠️ No relevant text found. Try rephrasing your question.")
                    else:
                        answer = astra_vector_index.query(user_question, llm=llm).strip()
                        if answer:
                            st.markdown("### 🧠 Answer:")
                            st.write(answer)
                        else:
                            st.warning("⚠️ Model returned an empty response.")
                except Exception as e:
                    st.error(f"🚨 Error: {e}")