import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import streamlit as st
import fitz  # PyMuPDF for PDF parsing

# NOTE(review): the old module-level `Settings(chroma_db_impl="sqlite")` was
# removed — it was unused, and `chroma_db_impl` is rejected by chromadb >= 0.4.
# Persistence is handled entirely by chromadb.PersistentClient below.


def setup_chromadb():
    """Create (or reopen) the persistent ChromaDB client and PDF collection.

    Returns:
        tuple: ``(client, collection)`` — a persistent client rooted at
        ``./chromadb_data`` and a collection that embeds documents with
        the all-MiniLM-L6-v2 sentence-transformer.
    """
    client = chromadb.PersistentClient(path="./chromadb_data")
    collection = client.get_or_create_collection(
        name="pdf_data",
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )
    return client, collection


def clear_collection(collection):
    """Remove every entry from *collection* so a new PDF starts fresh.

    BUG FIX: ``collection.delete(where={})`` raises on recent chromadb
    versions (an empty filter is rejected); deleting by the explicit id
    list is portable across versions.
    """
    existing_ids = collection.get()["ids"]
    if existing_ids:
        collection.delete(ids=existing_ids)


def extract_text_from_pdf(uploaded_file):
    """Return the concatenated text of every page of the uploaded PDF.

    Args:
        uploaded_file: a Streamlit ``UploadedFile`` (file-like, binary).

    Returns:
        str: all page text joined in page order.
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        # Page-by-page extraction; the context manager closes the document.
        return "".join(page.get_text() for page in doc)


def add_pdf_text_to_db(collection, pdf_text):
    """Store each non-empty line of *pdf_text* as one document in *collection*.

    Line numbers are preserved in ids (``pdf_text_<n>``) and metadata so
    retrieved chunks can be traced back to their position in the PDF.
    """
    ids, documents, metadatas = [], [], []
    for idx, line in enumerate(pdf_text.split("\n")):  # line granularity
        if line.strip():  # skip empty lines
            ids.append(f"pdf_text_{idx}")
            documents.append(line)
            # BUG FIX: metadatas must be a LIST of dicts (one per id) when
            # ids/documents are lists, not a single dict.
            metadatas.append({"line_number": idx, "text": line})
    if ids:
        # Single batched add instead of one call (and one embedding round
        # trip) per line.
        collection.add(ids=ids, documents=documents, metadatas=metadatas)


def query_pdf_data(collection, query, retriever_model):
    """Retrieve the top-3 matching chunks and generate an answer.

    Args:
        collection: the ChromaDB collection holding the PDF lines.
        query (str): the user's question.
        retriever_model: a HF ``text2text-generation`` pipeline.

    Returns:
        tuple: ``(answer, metadatas)`` — the pipeline output list and the
        raw ``metadatas`` field of the query result.
    """
    results = collection.query(query_texts=[query], n_results=3)
    context = " ".join(results["documents"][0])
    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]


@st.cache_resource
def _load_retriever():
    """Load the generation model once and cache it across Streamlit reruns."""
    return pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM


# Streamlit Interface
def main():
    """Run the Streamlit app: upload a PDF, ingest it, answer queries."""
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")

    # Initialize components
    client, collection = setup_chromadb()
    retriever_model = _load_retriever()

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        # BUG FIX: Streamlit reruns this whole script on every widget
        # interaction; without this guard the database was wiped and the PDF
        # re-embedded on every single query. Re-ingest only for a new file.
        if st.session_state.get("ingested_file") != uploaded_file.name:
            try:
                # Clear existing data
                clear_collection(collection)
                st.info("Existing data cleared from the database.")
                # Extract and add new data
                pdf_text = extract_text_from_pdf(uploaded_file)
                st.success("Text extracted successfully!")
                st.text_area("Extracted Text:", pdf_text, height=300)
                add_pdf_text_to_db(collection, pdf_text)
                st.session_state["ingested_file"] = uploaded_file.name
                st.success("PDF text has been added to the database. You can now query it!")
            except Exception as e:
                st.error(f"Error extracting text: {e}")

    query = st.text_input("Enter your query about the PDF:")
    if query:
        try:
            answer, metadata = query_pdf_data(collection, query, retriever_model)
            st.subheader("Answer:")
            st.write(answer[0]['generated_text'])
            st.subheader("Retrieved Context:")
            for meta in metadata[0]:
                st.write(meta)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()