ashok2216 committed on
Commit
eeb3be6
·
verified ·
1 Parent(s): ed310d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -81
app.py CHANGED
@@ -1,88 +1,103 @@
1
- import chromadb
2
- from chromadb.utils import embedding_functions
3
- from sentence_transformers import SentenceTransformer
4
- from transformers import pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import streamlit as st
6
- import fitz # PyMuPDF for PDF parsing
7
-
8
- # Step 1: Setup ChromaDB
9
- def setup_chromadb():
10
- # Initialize ChromaDB in-memory instance
11
- client = chromadb.Client()
12
- try:
13
- client.delete_collection("pdf_data")
14
- print("Existing collection 'pdf_data' deleted.")
15
- except:
16
- print("Collection 'pdf_data' not found, creating a new one.")
17
- # Create a new collection with the embedding function
18
- ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
19
- collection = client.create_collection("pdf_data", embedding_function=ef)
20
- return client, collection
21
-
22
- # Step 2: Extract Text from PDF
23
- def extract_text_from_pdf(pdf_path):
24
- pdf_text = ""
25
- with fitz.open(pdf_path) as doc:
26
- for page in doc:
27
- pdf_text += page.get_text()
28
- return pdf_text
29
-
30
- # Step 3: Add Extracted Text to Vector Database
31
- def add_pdf_text_to_db(collection, pdf_text):
32
- sentences = pdf_text.split("\n") # Split text into lines for granularity
33
- for idx, sentence in enumerate(sentences):
34
- if sentence.strip(): # Avoid empty lines
35
- collection.add(
36
- ids=[f"pdf_text_{idx}"],
37
- documents=[sentence],
38
- metadatas={"line_number": idx, "text": sentence}
39
- )
40
-
41
- # Step 4: Query Function
42
- def query_pdf_data(collection, query, retriever_model):
43
- results = collection.query(
44
- query_texts=[query],
45
- n_results=3
46
- )
47
- context = " ".join([doc for doc in results["documents"][0]])
48
- answer = retriever_model(f"Context: {context}\nQuestion: {query}")
49
- return answer, results["metadatas"]
50
-
51
- # Streamlit Interface
52
- def main():
53
- st.title("PDF Chatbot with Retrieval-Augmented Generation")
54
- st.write("Upload a PDF, and ask questions about its content!")
55
-
56
- # Initialize components
57
- client, collection = setup_chromadb()
58
- retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM
59
-
60
- # File upload
61
- uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
62
- if uploaded_file:
63
- st.write("Extracting text and populating the database...")
64
- pdf_text = extract_text_from_pdf(uploaded_file)
65
- add_pdf_text_to_db(collection, pdf_text)
66
- st.success("PDF text has been added to the database. You can now query it!")
67
-
68
- # Query Input
69
- query = st.text_input("Enter your query about the PDF:")
70
- if query:
71
- try:
72
- answer, metadata = query_pdf_data(collection, query, retriever_model)
73
- st.subheader("Answer:")
74
- st.write(answer[0]['generated_text'])
75
- st.subheader("Retrieved Context:")
76
- for meta in metadata[0]:
77
- st.write(meta)
78
- except Exception as e:
79
- st.error(f"An error occurred: {str(e)}")
80
-
81
- if __name__ == "__main__":
82
- main()
83
 
 
 
 
 
84
 
 
85
 
 
 
 
 
 
86
 
87
 
88
 
 
1
+ # import chromadb
2
+ # from chromadb.utils import embedding_functions
3
+ # from sentence_transformers import SentenceTransformer
4
+ # from transformers import pipeline
5
+ # import streamlit as st
6
+ # import fitz # PyMuPDF for PDF parsing
7
+
8
+ # # Step 1: Setup ChromaDB
9
+ # def setup_chromadb():
10
+ # # Initialize ChromaDB in-memory instance
11
+ # client = chromadb.Client()
12
+ # try:
13
+ # client.delete_collection("pdf_data")
14
+ # print("Existing collection 'pdf_data' deleted.")
15
+ # except:
16
+ # print("Collection 'pdf_data' not found, creating a new one.")
17
+ # # Create a new collection with the embedding function
18
+ # ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
19
+ # collection = client.create_collection("pdf_data", embedding_function=ef)
20
+ # return client, collection
21
+
22
+ # # Step 2: Extract Text from PDF
23
+ # def extract_text_from_pdf(pdf_path):
24
+ # pdf_text = ""
25
+ # with fitz.open(pdf_path) as doc:
26
+ # for page in doc:
27
+ # pdf_text += page.get_text()
28
+ # return pdf_text
29
+
30
+ # # Step 3: Add Extracted Text to Vector Database
31
+ # def add_pdf_text_to_db(collection, pdf_text):
32
+ # sentences = pdf_text.split("\n") # Split text into lines for granularity
33
+ # for idx, sentence in enumerate(sentences):
34
+ # if sentence.strip(): # Avoid empty lines
35
+ # collection.add(
36
+ # ids=[f"pdf_text_{idx}"],
37
+ # documents=[sentence],
38
+ # metadatas={"line_number": idx, "text": sentence}
39
+ # )
40
+
41
+ # # Step 4: Query Function
42
+ # def query_pdf_data(collection, query, retriever_model):
43
+ # results = collection.query(
44
+ # query_texts=[query],
45
+ # n_results=3
46
+ # )
47
+ # context = " ".join([doc for doc in results["documents"][0]])
48
+ # answer = retriever_model(f"Context: {context}\nQuestion: {query}")
49
+ # return answer, results["metadatas"]
50
+
51
+ # # Streamlit Interface
52
+ # def main():
53
+ # st.title("PDF Chatbot with Retrieval-Augmented Generation")
54
+ # st.write("Upload a PDF, and ask questions about its content!")
55
+
56
+ # # Initialize components
57
+ # client, collection = setup_chromadb()
58
+ # retriever_model = pipeline("text2text-generation", model="google/flan-t5-small") # Free LLM
59
+
60
+ # # File upload
61
+ # uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
62
+ # if uploaded_file:
63
+ # st.write("Extracting text and populating the database...")
64
+ # pdf_text = extract_text_from_pdf(uploaded_file)
65
+ # add_pdf_text_to_db(collection, pdf_text)
66
+ # st.success("PDF text has been added to the database. You can now query it!")
67
+
68
+ # # Query Input
69
+ # query = st.text_input("Enter your query about the PDF:")
70
+ # if query:
71
+ # try:
72
+ # answer, metadata = query_pdf_data(collection, query, retriever_model)
73
+ # st.subheader("Answer:")
74
+ # st.write(answer[0]['generated_text'])
75
+ # st.subheader("Retrieved Context:")
76
+ # for meta in metadata[0]:
77
+ # st.write(meta)
78
+ # except Exception as e:
79
+ # st.error(f"An error occurred: {str(e)}")
80
+
81
+ # if __name__ == "__main__":
82
+ # main()
83
+
84
+
85
+
86
  import streamlit as st
87
+ from streamlit_chromadb_connection.chromadb_connection import ChromadbConnection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
+ configuration = {
90
+ "client": "PersistentClient",
91
+ "path": "/tmp/.chroma"
92
+ }
93
 
94
+ collection_name = "documents_collection"
95
 
96
+ conn = st.connection("chromadb",
97
+ type=ChromaDBConnection,
98
+ **configuration)
99
+ documents_collection_df = conn.get_collection_data(collection_name)
100
+ st.dataframe(documents_collection_df)
101
 
102
 
103