# NOTE(review): removed non-source artifacts left over from a web-page scrape
# (file-size banner, git commit hashes, and a duplicated line-number gutter).
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import streamlit as st
import fitz  # PyMuPDF for PDF parsing



# Configure ChromaDB with a persistent on-disk database.
# NOTE(review): `chroma_db_impl` is a deprecated setting that current chromadb
# releases reject, so it has been dropped. `config` is not passed to the client
# below (setup_chromadb uses PersistentClient directly) — it is kept only in
# case other modules import it; consider removing it entirely. TODO confirm.
config = Settings(
    persist_directory="./chromadb_data",
)

# Initialize persistent client with SQLite
def setup_chromadb():
    """Create (or reopen) the persistent ChromaDB client and the PDF collection.

    Returns:
        tuple: ``(client, collection)`` where ``collection`` embeds documents
        with the ``all-MiniLM-L6-v2`` sentence-transformer model.
    """
    client = chromadb.PersistentClient(path="./chromadb_data")
    # Use the `embedding_functions` alias already imported at the top of the
    # file instead of re-spelling the full chromadb.utils path.
    collection = client.get_or_create_collection(
        name="pdf_data",
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )
    return client, collection

def extract_text_from_pdf(uploaded_file):
    """Extract the text of every page from an uploaded PDF.

    Args:
        uploaded_file: file-like object (e.g. a Streamlit ``UploadedFile``)
            whose ``.read()`` returns the raw PDF bytes.

    Returns:
        str: the concatenated text of all pages, in page order.
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        # Building the string with += in a loop is quadratic in the worst
        # case; a single join is linear and clearer.
        return "".join(page.get_text() for page in doc)

def add_pdf_text_to_db(collection, pdf_text):
    """Index the PDF text into the collection, one document per non-empty line.

    Args:
        collection: object exposing ``.add(ids=, documents=, metadatas=)``
            (a ChromaDB collection).
        pdf_text: full extracted text; split on newlines for granularity.
            Line numbers are preserved in both the ids and the metadata.
    """
    lines = pdf_text.split("\n")  # Split text into lines for granularity
    # Keep the original line index for stable ids, skipping blank lines.
    entries = [(idx, line) for idx, line in enumerate(lines) if line.strip()]
    if not entries:
        return  # nothing to index (empty or whitespace-only text)
    # Issue ONE batched add() instead of one call (and one embedding
    # round-trip) per line — same stored data, far fewer client calls.
    collection.add(
        ids=[f"pdf_text_{idx}" for idx, _ in entries],
        documents=[line for _, line in entries],
        # metadatas must be a list parallel to `documents`, not a bare dict.
        metadatas=[{"line_number": idx, "text": line} for idx, line in entries],
    )

def query_pdf_data(collection, query, retriever_model):
    """Answer ``query`` using the top-3 most similar documents as context.

    Args:
        collection: ChromaDB collection supporting ``.query()``.
        query: the user's natural-language question.
        retriever_model: callable (e.g. a transformers pipeline) mapping a
            prompt string to a generation result.

    Returns:
        tuple: ``(answer, metadatas)`` — the raw model output and the
        query's metadata lists, for display by the caller.
    """
    results = collection.query(
        query_texts=[query],
        n_results=3,
    )
    # results["documents"][0] is already a list of strings; join it directly
    # rather than copying it through a redundant list comprehension.
    context = " ".join(results["documents"][0])
    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]

# Streamlit Interface
# Streamlit Interface
def main():
    """Streamlit entry point: upload a PDF, index it, then answer queries.

    Flow: set up the persistent vector store and the flan-t5-small generation
    pipeline, extract text from the uploaded PDF, index it once, and answer
    free-form questions against the retrieved context.
    """
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")

    # Initialize components.
    # NOTE(review): these are rebuilt on every Streamlit rerun; wrapping them
    # in st.cache_resource would avoid reloading the model each interaction.
    client, collection = setup_chromadb()
    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        try:
            pdf_text = extract_text_from_pdf(uploaded_file)
            st.success("Text extracted successfully!")
            st.text_area("Extracted Text:", pdf_text, height=300)
            # Streamlit reruns this whole script on every widget interaction
            # (including each query), so guard the slow ingestion step with
            # session_state to avoid re-indexing the same PDF on every rerun.
            if st.session_state.get("indexed_file") != uploaded_file.name:
                add_pdf_text_to_db(collection, pdf_text)
                st.session_state["indexed_file"] = uploaded_file.name
            st.success("PDF text has been added to the database. You can now query it!")
            query = st.text_input("Enter your query about the PDF:")
            if query:
                try:
                    answer, metadata = query_pdf_data(collection, query, retriever_model)
                    st.subheader("Answer:")
                    st.write(answer[0]['generated_text'])
                    st.subheader("Retrieved Context:")
                    for meta in metadata[0]:
                        st.write(meta)
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
        except Exception as e:
            st.error(f"Error extracting text: {e}")


if __name__ == "__main__":
    main()