Spaces:
Sleeping
Sleeping
File size: 3,204 Bytes
7e364b6 48010b4 7e364b6 eeb3be6 0dcfd6e 7e364b6 1788a8d 7e364b6 1788a8d d051bce 7e364b6 d051bce 7e364b6 d051bce f4c2b4e b3e0053 f4c2b4e d051bce f4c2b4e 7e364b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import streamlit as st
import fitz # PyMuPDF for PDF parsing
# ChromaDB settings describing an on-disk, SQLite-backed store.
# NOTE(review): nothing in the visible code passes `config` to a client;
# setup_chromadb() gives the path directly to PersistentClient. Confirm
# whether this object is still needed.
config = Settings(persist_directory="./chromadb_data", chroma_db_impl="sqlite")
# Persistent ChromaDB client plus the collection that holds the PDF lines
def setup_chromadb():
    """Open the on-disk ChromaDB store and return ``(client, collection)``.

    The client persists under ``./chromadb_data``. The ``pdf_data``
    collection is fetched, or created when it does not exist yet, and is
    configured with a SentenceTransformer embedding function using the
    ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    client = chromadb.PersistentClient(path="./chromadb_data")
    embedder_cls = (
        chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction
    )
    embedder = embedder_cls(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )
    collection = client.get_or_create_collection(
        name="pdf_data",
        embedding_function=embedder,
    )
    return client, collection
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of an uploaded PDF, concatenated.

    ``uploaded_file`` must expose ``read()``; its full contents are handed
    to PyMuPDF as an in-memory PDF stream. Page texts are joined in page
    order with no separator added between them.
    """
    pdf_bytes = uploaded_file.read()
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)
def add_pdf_text_to_db(collection, pdf_text, *, batch_size=500):
    """Store every non-blank line of ``pdf_text`` in ``collection``.

    Each kept line becomes one document whose ID is ``pdf_text_<n>``, where
    ``n`` is the line's index in the newline-split text (blank lines are
    skipped but still counted, so indices match the original text). The
    metadata for each document records that index and the line itself.

    Args:
        collection: Object exposing a Chroma-style
            ``upsert(ids=..., documents=..., metadatas=...)`` method.
        pdf_text: Full extracted text; it is split on newline characters.
        batch_size: Maximum number of lines sent per ``upsert`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Pair each kept line with its original index so IDs and the
    # line_number metadata stay aligned with the source text.
    records = [
        (idx, line)
        for idx, line in enumerate(pdf_text.split("\n"))
        if line.strip()  # Avoid empty lines
    ]

    # Send lines in bounded batches instead of one call per line: far fewer
    # round trips, while each request stays small (Chroma is documented to
    # cap the size of a single batch).
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        # upsert rather than add: IDs depend only on the line index, so a
        # rerun or a second upload reuses IDs that may already exist in the
        # persistent collection; those entries must be replaced.
        collection.upsert(
            ids=[f"pdf_text_{idx}" for idx, _ in batch],
            documents=[line for _, line in batch],
            metadatas=[
                {"line_number": idx, "text": line} for idx, line in batch
            ],
        )
def query_pdf_data(collection, query, retriever_model, n_results=3):
    """Retrieve the closest stored lines for ``query`` and generate an answer.

    Args:
        collection: Object exposing a Chroma-style
            ``query(query_texts=..., n_results=...)`` method whose result
            has ``"documents"`` and ``"metadatas"`` entries, each holding
            one inner list per query text.
        query: The user's question.
        retriever_model: Callable that takes the assembled prompt string and
            returns the generated answer.
        n_results: How many stored lines to retrieve as context
            (defaults to 3, the previously hard-coded value).

    Returns:
        A ``(answer, metadatas)`` tuple: ``answer`` is whatever
        ``retriever_model`` returned, and ``metadatas`` is the query
        result's full ``"metadatas"`` entry (callers index ``[0]`` for the
        matches of this single query).
    """
    results = collection.query(query_texts=[query], n_results=n_results)
    # Exactly one query text was sent, so the first inner list holds all of
    # its matching documents.
    context = " ".join(results["documents"][0])
    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]
# Streamlit Interface
def _load_generator():
    """Build the text2text-generation pipeline used to phrase answers."""
    return pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM


def main():
    """Render the app: upload a PDF, index its lines, answer questions.

    Each stage (text extraction, database insert, query) reports its own
    failure through ``st.error`` and stops the current run, so the message
    shown matches the step that actually failed.
    """
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")

    # Initialize components
    _, collection = setup_chromadb()
    # Streamlit re-executes this script on every widget interaction;
    # cache_resource keeps one loaded model instead of rebuilding it on
    # each rerun.
    retriever_model = st.cache_resource(_load_generator)()

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if not uploaded_file:
        return

    try:
        pdf_text = extract_text_from_pdf(uploaded_file)
    except Exception as e:
        st.error(f"Error extracting text: {e}")
        return
    st.success("Text extracted successfully!")
    st.text_area("Extracted Text:", pdf_text, height=300)

    # Index only when the text differs from what this session last stored,
    # so submitting a query (which reruns the script) does not re-insert
    # the whole document.
    if st.session_state.get("indexed_pdf_text") != pdf_text:
        try:
            add_pdf_text_to_db(collection, pdf_text)
        except Exception as e:
            st.error(f"Error adding PDF text to the database: {e}")
            return
        st.session_state["indexed_pdf_text"] = pdf_text
    st.success("PDF text has been added to the database. You can now query it!")

    query = st.text_input("Enter your query about the PDF:")
    if not query:
        return

    try:
        answer, metadata = query_pdf_data(collection, query, retriever_model)
        st.subheader("Answer:")
        st.write(answer[0]['generated_text'])
        st.subheader("Retrieved Context:")
        # metadata holds one list per query text; only one query was sent.
        for meta in metadata[0]:
            st.write(meta)
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|