# NOTE(review): removed non-source artifacts left over from a web-page scrape
# (file-size banner, git commit hashes, and a duplicated line-number gutter).
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import streamlit as st
import fitz  # PyMuPDF for PDF parsing



# Configure ChromaDB with a persistent on-disk database.
# NOTE(review): `chroma_db_impl` is a deprecated setting that current chromadb
# releases reject, so it has been dropped. `config` is not passed to the client
# below (setup_chromadb uses PersistentClient directly) — it is kept only in
# case other modules import it; consider removing it entirely. TODO confirm.
config = Settings(
    persist_directory="./chromadb_data",
)

# Initialize persistent client with SQLite
def setup_chromadb():
    """Create (or reopen) the persistent ChromaDB client and the PDF collection.

    Returns:
        tuple: ``(client, collection)`` where ``collection`` embeds documents
        with the ``all-MiniLM-L6-v2`` sentence-transformer model.
    """
    client = chromadb.PersistentClient(path="./chromadb_data")
    # Use the `embedding_functions` alias already imported at the top of the
    # file instead of re-spelling the full chromadb.utils path.
    collection = client.get_or_create_collection(
        name="pdf_data",
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )
    return client, collection

def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page from an uploaded PDF.

    Args:
        uploaded_file: file-like object (e.g. a Streamlit ``UploadedFile``)
            whose ``.read()`` returns the raw PDF bytes.

    Returns:
        str: the concatenated text of all pages, in page order.
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        # Building the string with += in a loop is quadratic in the worst
        # case; a single join is linear and clearer.
        return "".join(page.get_text() for page in doc)

def add_pdf_text_to_db(collection, pdf_text):
    """Index the PDF text into the collection, one document per non-empty line.

    Args:
        collection: object exposing ``.add(ids=, documents=, metadatas=)``
            (a ChromaDB collection).
        pdf_text: full extracted text; split on newlines for granularity.
            Line numbers are preserved in both the ids and the metadata.
    """
    lines = pdf_text.split("\n")  # Split text into lines for granularity
    # Keep the original line index for stable ids, skipping blank lines.
    entries = [(idx, line) for idx, line in enumerate(lines) if line.strip()]
    if not entries:
        return  # nothing to index (empty or whitespace-only text)
    # Issue ONE batched add() instead of one call (and one embedding
    # round-trip) per line — same stored data, far fewer client calls.
    collection.add(
        ids=[f"pdf_text_{idx}" for idx, _ in entries],
        documents=[line for _, line in entries],
        # metadatas must be a list parallel to `documents`, not a bare dict.
        metadatas=[{"line_number": idx, "text": line} for idx, line in entries],
    )

def query_pdf_data(collection, query, retriever_model):
    """Answer ``query`` using the top-3 most similar documents as context.

    Args:
        collection: ChromaDB collection supporting ``.query()``.
        query: the user's natural-language question.
        retriever_model: callable (e.g. a transformers pipeline) mapping a
            prompt string to a generation result.

    Returns:
        tuple: ``(answer, metadatas)`` — the raw model output and the
        query's metadata lists, for display by the caller.
    """
    results = collection.query(
        query_texts=[query],
        n_results=3,
    )
    # results["documents"][0] is already a list of strings; join it directly
    # rather than copying it through a redundant list comprehension.
    context = " ".join(results["documents"][0])
    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]

# Streamlit Interface
# Streamlit Interface
def main():
    """Streamlit entry point: upload a PDF, index it, then answer queries.

    Flow: set up the persistent vector store and the flan-t5-small generation
    pipeline, extract text from the uploaded PDF, index it once, and answer
    free-form questions against the retrieved context.
    """
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")

    # Initialize components.
    # NOTE(review): these are rebuilt on every Streamlit rerun; wrapping them
    # in st.cache_resource would avoid reloading the model each interaction.
    client, collection = setup_chromadb()
    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        try:
            pdf_text = extract_text_from_pdf(uploaded_file)
            st.success("Text extracted successfully!")
            st.text_area("Extracted Text:", pdf_text, height=300)
            # Streamlit reruns this whole script on every widget interaction
            # (including each query), so guard the slow ingestion step with
            # session_state to avoid re-indexing the same PDF on every rerun.
            if st.session_state.get("indexed_file") != uploaded_file.name:
                add_pdf_text_to_db(collection, pdf_text)
                st.session_state["indexed_file"] = uploaded_file.name
            st.success("PDF text has been added to the database. You can now query it!")
            query = st.text_input("Enter your query about the PDF:")
            if query:
                try:
                    answer, metadata = query_pdf_data(collection, query, retriever_model)
                    st.subheader("Answer:")
                    st.write(answer[0]['generated_text'])
                    st.subheader("Retrieved Context:")
                    for meta in metadata[0]:
                        st.write(meta)
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
        except Exception as e:
            st.error(f"Error extracting text: {e}")


if __name__ == "__main__":
    main()