# NOTE: the original file began with stray "Spaces: / Sleeping / Sleeping"
# banner text (Hugging Face Spaces status residue), which is not Python code.
# (A fully commented-out earlier draft of this app -- identical to the live
# code below except that it lacked clear_collection -- was removed here.)
import chromadb
from chromadb.utils import embedding_functions
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import streamlit as st
import fitz  # PyMuPDF for PDF parsing

# NOTE(review): the legacy `Settings(chroma_db_impl=...)` configuration is
# rejected by chromadb >= 0.4 (it raises a "deprecated configuration" error
# at construction time), and `config` was never used anywhere -- persistence
# is handled by chromadb.PersistentClient in setup_chromadb() below.
# Kept commented out as documentation of the original intent:
# config = Settings(persist_directory="./chromadb_data", chroma_db_impl="sqlite")
# Initialize persistent client with SQLite
def setup_chromadb():
    """Create (or open) the persistent ChromaDB client and the PDF collection.

    Returns:
        tuple: ``(client, collection)`` where ``collection`` is the
        "pdf_data" collection, embedding documents with the
        all-MiniLM-L6-v2 sentence-transformer model.
    """
    client = chromadb.PersistentClient(path="./chromadb_data")
    # Use the `embedding_functions` module imported at the top of the file
    # instead of re-traversing the chromadb.utils attribute chain.
    collection = client.get_or_create_collection(
        name="pdf_data",
        embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        ),
    )
    return client, collection
# Clear the collection
def clear_collection(collection):
    """Delete every entry stored in *collection*.

    ChromaDB rejects ``delete(where={})`` -- an empty filter is invalid -- so
    fetch the stored ids first and delete by id. No-op when the collection is
    already empty.

    Args:
        collection: a ChromaDB collection exposing ``get()`` and ``delete()``.
    """
    ids = collection.get()["ids"]
    if ids:
        collection.delete(ids=ids)
def extract_text_from_pdf(uploaded_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        uploaded_file: a file-like object (e.g. Streamlit's ``UploadedFile``)
            whose bytes are a PDF document.

    Returns:
        str: all page text in document order ("" for an empty document).
    """
    with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
        # Single join instead of quadratic `text +=` accumulation.
        return "".join(page.get_text() for page in doc)
def add_pdf_text_to_db(collection, pdf_text):
    """Store each non-empty line of *pdf_text* in *collection*.

    Ids encode the original line number ("pdf_text_<n>") so a retrieved entry
    can be traced back to its position in the document. All lines are added in
    one batched ``add`` call (the original issued one network/DB call per
    line), and ``metadatas`` is a list aligned with ``ids``/``documents`` as
    the ChromaDB list API expects.

    Args:
        collection: a ChromaDB collection exposing ``add()``.
        pdf_text: the raw extracted PDF text, split on newlines.
    """
    lines = pdf_text.split("\n")  # line-level granularity
    entries = [(idx, line) for idx, line in enumerate(lines) if line.strip()]
    if not entries:
        return  # nothing to store for an empty / whitespace-only document
    collection.add(
        ids=[f"pdf_text_{idx}" for idx, _ in entries],
        documents=[line for _, line in entries],
        metadatas=[{"line_number": idx, "text": line} for idx, line in entries],
    )
def query_pdf_data(collection, query, retriever_model):
    """Answer *query* using the top-3 matching lines retrieved from *collection*.

    Args:
        collection: a ChromaDB collection exposing ``query()``.
        query: the user's question.
        retriever_model: a text2text-generation callable (HF pipeline style).

    Returns:
        tuple: ``(answer, metadatas)`` -- ``answer`` is the raw pipeline
        output and ``metadatas`` the metadata lists of the retrieved hits.
    """
    results = collection.query(
        query_texts=[query],
        n_results=3,
    )
    # A single query text was issued, so its hits live at index 0.
    # (Dropped the identity comprehension the original wrapped around join.)
    context = " ".join(results["documents"][0])
    answer = retriever_model(f"Context: {context}\nQuestion: {query}")
    return answer, results["metadatas"]
# Streamlit Interface
@st.cache_resource
def _get_components():
    """Load heavy resources once per Streamlit server, not on every rerun."""
    client, collection = setup_chromadb()
    retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM
    return client, collection, retriever_model

def main():
    """Streamlit entry point: PDF upload, ingestion, and question answering."""
    st.title("PDF Chatbot with Retrieval-Augmented Generation")
    st.write("Upload a PDF, and ask questions about its content!")
    # Initialize components (cached -- the original rebuilt the ChromaDB
    # client and reloaded the flan-t5 pipeline on every script rerun).
    client, collection, retriever_model = _get_components()
    # File upload
    uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
    if uploaded_file:
        # Streamlit reruns the whole script on each interaction; only clear
        # and re-ingest when a *different* file has been uploaded, otherwise
        # every keystroke in the query box would re-process the PDF.
        if st.session_state.get("ingested_file") != uploaded_file.name:
            try:
                # Clear existing data
                clear_collection(collection)
                st.info("Existing data cleared from the database.")
                # Extract and add new data
                pdf_text = extract_text_from_pdf(uploaded_file)
                st.success("Text extracted successfully!")
                st.text_area("Extracted Text:", pdf_text, height=300)
                add_pdf_text_to_db(collection, pdf_text)
                st.session_state["ingested_file"] = uploaded_file.name
                st.success("PDF text has been added to the database. You can now query it!")
            except Exception as e:
                st.error(f"Error extracting text: {e}")
    query = st.text_input("Enter your query about the PDF:")
    if query:
        try:
            answer, metadata = query_pdf_data(collection, query, retriever_model)
            st.subheader("Answer:")
            st.write(answer[0]['generated_text'])
            st.subheader("Retrieved Context:")
            for meta in metadata[0]:
                st.write(meta)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()