File size: 5,361 Bytes
eeb3be6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7097291
eeb3be6
0dcfd6e
eeb3be6
 
 
 
0dcfd6e
eeb3be6
0dcfd6e
eeb3be6
db3e23f
eeb3be6
 
 
0dcfd6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# import chromadb
# from chromadb.utils import embedding_functions
# from sentence_transformers import SentenceTransformer
# from transformers import pipeline
# import streamlit as st
# import fitz  # PyMuPDF for PDF parsing

# # Step 1: Setup ChromaDB
# def setup_chromadb():
#     # Initialize ChromaDB in-memory instance
#     client = chromadb.Client()
#     try:
#         client.delete_collection("pdf_data")
#         print("Existing collection 'pdf_data' deleted.")
#     except:
#         print("Collection 'pdf_data' not found, creating a new one.")
#     # Create a new collection with the embedding function
#     ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     collection = client.create_collection("pdf_data", embedding_function=ef)
#     return client, collection

# # Step 2: Extract Text from PDF
# def extract_text_from_pdf(pdf_path):
#     pdf_text = ""
#     with fitz.open(pdf_path) as doc:
#         for page in doc:
#             pdf_text += page.get_text()
#     return pdf_text

# # Step 3: Add Extracted Text to Vector Database
# def add_pdf_text_to_db(collection, pdf_text):
#     sentences = pdf_text.split("\n")  # Split text into lines for granularity
#     for idx, sentence in enumerate(sentences):
#         if sentence.strip():  # Avoid empty lines
#             collection.add(
#                 ids=[f"pdf_text_{idx}"],
#                 documents=[sentence],
#                 metadatas={"line_number": idx, "text": sentence}
#             )

# # Step 4: Query Function
# def query_pdf_data(collection, query, retriever_model):
#     results = collection.query(
#         query_texts=[query],
#         n_results=3
#     )
#     context = " ".join([doc for doc in results["documents"][0]])
#     answer = retriever_model(f"Context: {context}\nQuestion: {query}")
#     return answer, results["metadatas"]

# # Streamlit Interface
# def main():
#     st.title("PDF Chatbot with Retrieval-Augmented Generation")
#     st.write("Upload a PDF, and ask questions about its content!")

#     # Initialize components
#     client, collection = setup_chromadb()
#     retriever_model = pipeline("text2text-generation", model="google/flan-t5-small")  # Free LLM

#     # File upload
#     uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
#     if uploaded_file:
#         st.write("Extracting text and populating the database...")
#         pdf_text = extract_text_from_pdf(uploaded_file)
#         add_pdf_text_to_db(collection, pdf_text)
#         st.success("PDF text has been added to the database. You can now query it!")

#         # Query Input
#         query = st.text_input("Enter your query about the PDF:")
#         if query:
#             try:
#                 answer, metadata = query_pdf_data(collection, query, retriever_model)
#                 st.subheader("Answer:")
#                 st.write(answer[0]['generated_text'])
#                 st.subheader("Retrieved Context:")
#                 for meta in metadata[0]:
#                     st.write(meta)
#             except Exception as e:
#                 st.error(f"An error occurred: {str(e)}")

# if __name__ == "__main__":
#     main()



import streamlit as st
from streamlit_chromadb_connection.chromadb_connection import ChromadbConnection

# Name of the ChromaDB collection whose contents are shown on the page.
collection_name = "documents_collection"

# Keyword arguments forwarded verbatim to the connection below: the
# "PersistentClient" client kind and "/tmp/.chroma" as its path.
configuration = dict(client="PersistentClient", path="/tmp/.chroma")

# Obtain the connection registered under the name "chromadb" via Streamlit's
# connection API, using ChromadbConnection as the connection class.
conn = st.connection("chromadb", type=ChromadbConnection, **configuration)

# Fetch the collection's data (presumably a DataFrame, going by the variable
# name -- confirm against the connection package) and render it as a table.
documents_collection_df = conn.get_collection_data(collection_name)
st.dataframe(documents_collection_df)







# import tempfile
# import PyPDF2
# import streamlit as st
# from transformers import GPT2LMHeadModel, GPT2Tokenizer

# # Load pre-trained GPT-2 model and tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
# model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")

        
# def extract_text_from_pdf(file_path):
#     text = ""
#     with open(file_path, "rb") as f:
#         reader = PyPDF2.PdfFileReader(f)
#         for page_num in range(reader.numPages):
#             text += reader.getPage(page_num).extractText()
#     return text

# def generate_response(user_input):
#     input_ids = tokenizer.encode(user_input, return_tensors="pt")
#     output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7)
#     response = tokenizer.decode(output[0], skip_special_tokens=True)
#     return response

# def main():
#     st.title("PDF Chatbot")

#     pdf_file = st.file_uploader("Upload an pdf file", type=["pdf"], accept_multiple_files=False)

#     if pdf_file is not None:
#         with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
#             tmp_file.write(pdf_file.read())
#             st.success("PDF file successfully uploaded and stored temporally.")
#         file_path = tmp_file.name
#         pdf_text = extract_text_from_pdf(file_path)
#         st.text_area("PDF Content", pdf_text)
#     else:
#         st.markdown('File not found!')        

#     user_input = st.text_input("You:", "")
#     if st.button("Send"):
#         response = generate_response(user_input)
#         st.text_area("Chatbot:", response)

# if __name__ == "__main__":
#     main()