Spaces:

ashok2216
/

pdf-chatbot

Sleeping

App Files Files Community

ashok2216 committed on Nov 19, 2024

Commit

eeb3be6

verified ·

1 Parent(s): ed310d6

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -81

app.py CHANGED Viewed

@@ -1,88 +1,103 @@
-import chromadb
-from chromadb.utils import embedding_functions
-from sentence_transformers import SentenceTransformer
-from transformers import pipeline
 import streamlit as st
-import fitz  # PyMuPDF for PDF parsing
-# Step 1: Setup ChromaDB
-def setup_chromadb():
-    # Initialize ChromaDB in-memory instance
-    client = chromadb.Client()
-    try:
-        client.delete_collection("pdf_data")
-        print("Existing collection 'pdf_data' deleted.")
-    except:
-        print("Collection 'pdf_data' not found, creating a new one.")
-    # Create a new collection with the embedding function
-    ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    collection = client.create_collection("pdf_data", embedding_function=ef)
-    return client, collection
-# Step 2: Extract Text from PDF
-def extract_text_from_pdf(pdf_path):
-    pdf_text = ""
-    with fitz.open(pdf_path) as doc:
-        for page in doc:
-            pdf_text += page.get_text()
-    return pdf_text
-# Step 3: Add Extracted Text to Vector Database
-def add_pdf_text_to_db(collection, pdf_text):
-    sentences = pdf_text.split("\n")  # Split text into lines for granularity
-    for idx, sentence in enumerate(sentences):
-        if sentence.strip():  # Avoid empty lines
-            collection.add(
-                ids=[f"pdf_text_{idx}"],
-                documents=[sentence],
-                metadatas={"line_number": idx, "text": sentence}
-            )
-# Step 4: Query Function
-def query_pdf_data(collection, query, retriever_model):
-    results = collection.query(
-        query_texts=[query],
-        n_results=3
-    )
-    context = " ".join([doc for doc in results["documents"][0]])
-    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
-    return answer, results["metadatas"]
-# Streamlit Interface
-def main():
-    st.title("PDF Chatbot with Retrieval-Augmented Generation")
-    st.write("Upload a PDF, and ask questions about its content!")
-    # Initialize components
-    client, collection = setup_chromadb()
-    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM
-    # File upload
-    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
-    if uploaded_file:
-        st.write("Extracting text and populating the database...")
-        pdf_text = extract_text_from_pdf(uploaded_file)
-        add_pdf_text_to_db(collection, pdf_text)
-        st.success("PDF text has been added to the database. You can now query it!")
-        # Query Input
-        query = st.text_input("Enter your query about the PDF:")
-        if query:
-            try:
-                answer, metadata = query_pdf_data(collection, query, retriever_model)
-                st.subheader("Answer:")
-                st.write(answer[0]['generated_text'])
-                st.subheader("Retrieved Context:")
-                for meta in metadata[0]:
-                    st.write(meta)
-            except Exception as e:
-                st.error(f"An error occurred: {str(e)}")
-if __name__ == "__main__":
-    main()

+# import chromadb
+# from chromadb.utils import embedding_functions
+# from sentence_transformers import SentenceTransformer
+# from transformers import pipeline
+# import streamlit as st
+# import fitz  # PyMuPDF for PDF parsing
+# # Step 1: Setup ChromaDB
+# def setup_chromadb():
+#     # Initialize ChromaDB in-memory instance
+#     client = chromadb.Client()
+#     try:
+#         client.delete_collection("pdf_data")
+#         print("Existing collection 'pdf_data' deleted.")
+#     except:
+#         print("Collection 'pdf_data' not found, creating a new one.")
+#     # Create a new collection with the embedding function
+#     ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
+#     collection = client.create_collection("pdf_data", embedding_function=ef)
+#     return client, collection
+# # Step 2: Extract Text from PDF
+# def extract_text_from_pdf(pdf_path):
+#     pdf_text = ""
+#     with fitz.open(pdf_path) as doc:
+#         for page in doc:
+#             pdf_text += page.get_text()
+#     return pdf_text
+# # Step 3: Add Extracted Text to Vector Database
+# def add_pdf_text_to_db(collection, pdf_text):
+#     sentences = pdf_text.split("\n")  # Split text into lines for granularity
+#     for idx, sentence in enumerate(sentences):
+#         if sentence.strip():  # Avoid empty lines
+#             collection.add(
+#                 ids=[f"pdf_text_{idx}"],
+#                 documents=[sentence],
+#                 metadatas={"line_number": idx, "text": sentence}
+#             )
+# # Step 4: Query Function
+# def query_pdf_data(collection, query, retriever_model):
+#     results = collection.query(
+#         query_texts=[query],
+#         n_results=3
+#     )
+#     context = " ".join([doc for doc in results["documents"][0]])
+#     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
+#     return answer, results["metadatas"]
+# # Streamlit Interface
+# def main():
+#     st.title("PDF Chatbot with Retrieval-Augmented Generation")
+#     st.write("Upload a PDF, and ask questions about its content!")
+#     # Initialize components
+#     client, collection = setup_chromadb()
+#     retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM
+#     # File upload
+#     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
+#     if uploaded_file:
+#         st.write("Extracting text and populating the database...")
+#         pdf_text = extract_text_from_pdf(uploaded_file)
+#         add_pdf_text_to_db(collection, pdf_text)
+#         st.success("PDF text has been added to the database. You can now query it!")
+#         # Query Input
+#         query = st.text_input("Enter your query about the PDF:")
+#         if query:
+#             try:
+#                 answer, metadata = query_pdf_data(collection, query, retriever_model)
+#                 st.subheader("Answer:")
+#                 st.write(answer[0]['generated_text'])
+#                 st.subheader("Retrieved Context:")
+#                 for meta in metadata[0]:
+#                     st.write(meta)
+#             except Exception as e:
+#                 st.error(f"An error occurred: {str(e)}")
+# if __name__ == "__main__":
+#     main()
 import streamlit as st
+from streamlit_chromadb_connection.chromadb_connection import ChromadbConnection
+configuration = {  # settings forwarded to st.connection as keyword arguments
+    "client": "PersistentClient",
+    "path": "/tmp/.chroma"
+}
+collection_name = "documents_collection"  # collection whose contents are displayed
+conn = st.connection("chromadb",
+                     type=ChromadbConnection,  # spelled exactly as imported; "ChromaDBConnection" is undefined (NameError)
+                     **configuration)
+documents_collection_df = conn.get_collection_data(collection_name)  # presumably a DataFrame -- confirm against the connection's docs
+st.dataframe(documents_collection_df)  # render the fetched collection data in the app