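"""Streamlit app: upload a PDF, split and embed it into an AstraDB vector store,
and answer questions about it with a free Hugging Face model via LangChain."""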
import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.text_splitter import CharacterTextSplitter
import cassio
from dotenv import load_dotenv
import os
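# Load AstraDB and Hugging Face credentials from a local .env file, if present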
load_dotenv()
ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# === Streamlit UI Setup ===
st.set_page_config(page_title="Query PDF with Free Hugging Face Models", layout="wide")
st.title("Query PDF using LangChain + AstraDB (Free Hugging Face Models)")
# === File Upload ===
uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
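# st.file_uploader returns an in-memory, file-like object that PdfReader can read directly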
if uploaded_file:
st.success("β
PDF uploaded successfully!")
process_button = st.button("π Process PDF")
    if process_button:
        # Initialize AstraDB
        cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

        # Read PDF contents
        pdf_reader = PdfReader(uploaded_file)
        raw_text = ""
        for page in pdf_reader.pages:
            content = page.extract_text()
            if content:
                raw_text += content

        # Split text into chunks
        text_splitter = CharacterTextSplitter(
            separator="\n", chunk_size=800, chunk_overlap=200, length_function=len
        )
        texts = text_splitter.split_text(raw_text)
        # === Embeddings ===
        embedding = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )

        # === Hugging Face LLM ===
        llm = HuggingFaceHub(
            repo_id="mistralai/Mistral-7B-Instruct-v0.1",
            huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
            model_kwargs={"temperature": 0.5, "max_new_tokens": 512},
        )
        # === Create vector store and index ===
        vector_store = Cassandra(
            embedding=embedding,
            table_name="qa_mini_demo",
            session=None,   # resolved from the cassio.init() connection
            keyspace=None,  # resolved from the cassio.init() connection
        )
        vector_store.add_texts(texts[:50])  # embed only the first 50 chunks to keep the demo fast
        st.success(f"{len(texts[:50])} chunks embedded and stored in AstraDB.")
        astra_vector_index = VectorStoreIndexWrapper(vectorstore=vector_store)

        # Persist the processed objects across Streamlit reruns; st.button only
        # returns True on the run right after the click, so later question
        # submissions would otherwise lose access to the index.
        st.session_state["vector_store"] = vector_store
        st.session_state["vector_index"] = astra_vector_index
        st.session_state["llm"] = llm
# === Ask Questions ===
if "vector_index" in st.session_state:
    st.header("Ask a question about your PDF")
    user_question = st.text_input("Type your question here")

    if user_question:
        with st.spinner("Thinking..."):
            answer = st.session_state["vector_index"].query(
                user_question, llm=st.session_state["llm"]
            ).strip()
st.markdown(f"### π§ Answer:\n{answer}")
st.markdown("### π Top Relevant Chunks")
docs = vector_store.similarity_search_with_score(user_question, k=4)
for i, (doc, score) in enumerate(docs, 1):
st.markdown(f"**Chunk {i}** β Relevance Score: `{score:.4f}`")
st.code(doc.page_content[:500], language="markdown")
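# Run locally with `streamlit run <this file>` after setting ASTRA_DB_APPLICATION_TOKEN,
# ASTRA_DB_ID, and HUGGINGFACEHUB_API_TOKEN (e.g. in a .env file).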