init
Browse files- Vector_Embeddings.py +5 -0
- pages/1_π1_Just_ChromaDB.py +35 -0
- pages/2_πΎ2_ChromaDB_Collection_Query.py +62 -0
- pages/3_π«3_Summary.py +80 -0
- pages/5_2_ChromaDB_LangChain_directory.py +50 -0
- pages/8_π§Ή2_Read_PDF.py +25 -0
- requirements.txt +7 -0
Vector_Embeddings.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Landing page: renders the app logo/title and the main table of contents."""
from utils.st_def import st_logo, st_main_contents

st_logo(title='Welcome π to PDF Summarizer!', page_title="PDF Summarizer",)
st_main_contents()
|
pages/1_π1_Just_ChromaDB.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys, os

# On Streamlit's hosted runtime the system sqlite3 is too old for chromadb,
# so swap in pysqlite3-binary BEFORE anything imports sqlite3.
if "STREAMLIT_SERVER_ENABLED" in os.environ and "IS_STREAMLIT_SERVER" in os.environ:
    print("server side---------------------------")
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
else:
    print("local ----------------------------------side ")

import sqlite3
import streamlit as st
from streamlit import logger
from utils.st_def import st_logo, st_load_book
import chromadb

st_logo(title='Welcome π to Chroma DB!', page_title="Chroma DB ",)
st_load_book()
#-----------------------------------------------
st.write(logger.get_logger("SMI_APP"))
# st.write(f"sys version: {sys.version}")
# st.header(f"sqlite version: {sqlite3.sqlite_version}")
#-----------------------------------------------
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="collection1_1")
# upsert, not add: Streamlit re-executes this script top-to-bottom on every
# interaction, and add() complains about the already-existing fixed ids on
# the second run; upsert() is idempotent across reruns.
collection.upsert(
    documents=["steak", "python", "tiktok", "safety", "health", "environment"],
    metadatas=[{"source": "food"}, {"source": "progamming language"}, {"source": "social media"}, {"source": "government"}, {"source": "body"}, {"source": "living condition"}],
    ids=["id1", "id2", "id3", "id4", "id5", "id6"]
)

qa = st.text_input('π Ask the Chroma: ')
if qa:
    # n_results=1: return only the single nearest document for the query.
    results = collection.query(query_texts=[qa], n_results=1)
    st.write(results)
|
34 |
+
|
35 |
+
|
pages/2_πΎ2_ChromaDB_Collection_Query.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import sys, os
# if "STREAMLIT_SERVER_ENABLED" in os.environ and "IS_STREAMLIT_SERVER" in os.environ: __import__('pysqlite3'); sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import streamlit as st
from utils import st_def

import chromadb #0.4.24
from chromadb.utils import embedding_functions

st_def.st_logo(title='Welcome π to Chroma DB!', page_title="Chroma DB ",)
st_def.st_load_book()
#-----------------------------------------------
EB_MODEL = "all-MiniLM-L6-v2"   # sentence-transformers embedding model
COL_NAME = "collection2"

with st.spinner('Loading files...'):
    client = chromadb.Client()
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EB_MODEL)
    # cosine distance space for the HNSW index
    collection = client.get_or_create_collection(name=COL_NAME, embedding_function=embedding_func, metadata={"hnsw:space": "cosine"},)
    st.markdown("### Documents in Chroma DB")
    documents = [
        "The latest iPhone model comes with impressive features and a powerful camera.",
        "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.",
        "Einstein's theory of relativity revolutionized our understanding of space and time.",
        "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.",
        "The American Revolution had a profound impact on the birth of the United States as a nation.",
        "Regular exercise and a balanced diet are essential for maintaining good physical health.",
        "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
        "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
        "Startup companies often face challenges in securing funding and scaling their operations.",
        "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
        "Toronto is a nice place.",
    ]
    # one genre label per document, used as metadata
    genres = ["technology","travel","science","food","history","fitness", "art","climate change","business","music","country",]
    # upsert, not add: Streamlit reruns this script on every interaction and
    # add() complains about the already-existing ids the second time around.
    collection.upsert(
        documents=documents,
        ids=[f"id{i}" for i in range(len(documents))],
        metadatas=[{"genre": g} for g in genres]
    )

    for doc, genre in zip(documents, genres):
        st.write(f"{doc} ( {genre})")
|
43 |
+
|
44 |
+
# url = st.text_input('π Ask questions about above: ')
|
45 |
+
|
46 |
+
if "msg1" not in st.session_state:
|
47 |
+
st.session_state.msg1 = [] #111
|
48 |
+
st.session_state.msg1.append({"role": "system", 'content': "hi"})
|
49 |
+
st.session_state.msg1.append({"role": "assistant", "content": "How May I Help You Todayπ¬?"})
|
50 |
+
|
51 |
+
for message in st.session_state.msg1[1:]:
|
52 |
+
with st.chat_message(message["role"]): st.markdown(message["content"]) #222
|
53 |
+
|
54 |
+
if prompt := st.chat_input("π¬Ask me anything about the documents above!π¦"):
|
55 |
+
with st.chat_message("user"): st.markdown(prompt)
|
56 |
+
st.session_state.msg1.append({"role": "user", "content": prompt})
|
57 |
+
|
58 |
+
response = collection.query(query_texts=[f"{prompt}"], n_results=2,)
|
59 |
+
with st.chat_message("assistant"):
|
60 |
+
st.markdown(response['documents'][0][0])
|
61 |
+
st.markdown(response['metadatas'][0][0]['genre'])
|
62 |
+
st.session_state.msg1.append({"role": "assistant", "content": response['documents'][0][0]})
|
pages/3_π«3_Summary.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st, time
from utils import st_def, ut_openai

# Page chrome: logo/title, the static summary blurb, and the sidebar widget
# that collects the user's OpenAI API key (consumed by init()/main() below).
st_def.st_logo(title = "Welcome π to Summary!", page_title="Summary",)
st_def.st_summary()
openai_api_key= st_def.st_sidebar()
#------------------------------------------------------------------------
|
8 |
+
def init():
    """Check the preconditions for summarization.

    Requires PDF text already extracted into ``st.session_state['page_text']``
    (by the Read PDF page) and a non-empty OpenAI API key from the sidebar.
    Shows an error and returns False when either is missing; True otherwise.
    """
    if 'page_text' not in st.session_state:
        st.error('Read PDF before continue ... ')
        return False
    if not openai_api_key:
        st.error("Please add your OpenAI API key to continue.")
        return False
    return True
|
17 |
+
|
18 |
+
|
19 |
+
def combine_chunks(summaries):
    """Pack consecutive strings from *summaries* into chunks of at most
    ~4000 characters (a chunk may exceed 4000 only when a single input
    string is itself longer than 4000).

    Fixes two bugs in the previous version:
    - the trailing partial chunk was silently dropped whenever at least one
      full chunk had already been emitted, losing input text;
    - an empty chunk was appended when a single piece exceeded 4000 chars.

    Returns a non-empty list; for empty input it returns [""] (matching the
    original behavior that callers rely on).
    """
    chunks = []
    current = ""
    for piece in summaries:
        # Start a new chunk only if the current one is non-empty and would
        # overflow; an oversized single piece becomes its own chunk.
        if current and len(piece) + len(current) > 4000:
            chunks.append(current)
            current = ""
        current += piece
    # Flush the remainder (previously lost unless chunks was empty).
    if current or not chunks:
        chunks.append(current)
    return chunks
|
31 |
+
|
32 |
+
|
33 |
+
def main():
    """Iteratively summarize the PDF pages stored in session state.

    Packs the pages into ~4000-char chunks, asks the OpenAI chat API to
    extract the relevant information from each chunk, then re-combines the
    partial summaries and repeats until a single chunk remains, which is
    sent once more for the final summary.
    """
    if not init(): return

    page_text_array = st.session_state['page_text'] # array, store pages. len(text) is pages.
    print("Summarizing text..."+str(len(page_text_array)))

    combined_summaries = combine_chunks(page_text_array)
    print("Found " + str(len(combined_summaries)) + " chunks to summarize.")

    iterations = 1
    while True:
        # Stop condensing once everything fits in a single chunk.
        if len(combined_summaries) <= 1: break

        summaries_of_summaries = []
        # print summaries
        for i, summary in enumerate(combined_summaries):
            # NOTE(review): the prompt body below is part of an f-string, so its
            # leading whitespace is sent to the model verbatim.
            prompt =f"""
            Your task is to extract relevant information from a text on the page of a book. This information will be used to create a book summary.
            Extract relevant information from the following text, which is delimited with triple backticks.\
            Be sure to preserve the important details.
            Text: ```{combined_summaries[i]}```
            """
            # st.write(f"Summarizing {i + 1} of {len(combined_summaries)}, iteration {iterations}...")
            st.markdown(f'<span style="color:blue">Summarizing {i + 1} of {len(combined_summaries)}, iteration {iterations}...</span>', unsafe_allow_html=True)
            sum_page = ut_openai.aichat(openai_api_key=openai_api_key, messages = [{"role": "user", "content": prompt},])
            summaries_of_summaries.append(sum_page)
            st.text(sum_page)
            time.sleep(2) #You can query the model only 3 times in a minute for free, so we need to put some delay

        st.write('summaries_of_summaries')
        st.write(summaries_of_summaries)
        # Re-pack the partial summaries for the next condensation round.
        combined_summaries = combine_chunks(summaries_of_summaries)
        st.text('combined_summaries')
        st.text(combined_summaries)
        iterations += 1

    # summarize last chunk

    # NOTE(review): the final chunk is sent as-is, with no summarization
    # instruction wrapped around it — confirm this is intended.
    with st.spinner("Summarizing last chunk..."):
        final_summary = ut_openai.aichat(openai_api_key=openai_api_key, messages = [{"role": "user", "content": combined_summaries[0]},])
    st.header("Final Summary")
    st.write(final_summary)
    st.success("π¨Cheers!")


if __name__ == "__main__":
    main()
|
80 |
+
|
pages/5_2_ChromaDB_LangChain_directory.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.vectorstores import Chroma
# (fixed: SentenceTransformerEmbeddings was imported twice — exact duplicate removed)
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

import streamlit as st, os
from utils import st_def, utilities
# Renders the sidebar and returns the user's OpenAI API key.
openai_api_key = st_def.st_sidebar()
|
10 |
+
|
11 |
+
def load_docs(directory):
    """Load every document found under *directory* using LangChain's DirectoryLoader."""
    return DirectoryLoader(directory).load()
|
15 |
+
|
16 |
+
|
17 |
+
# Load the sample text corpus and show the source file names.
with st.spinner('Loading files...'):
    documents = load_docs('data/pets_txt/')
    st.write('\n\n'.join(os.path.basename(d.metadata['source']) for d in documents))
|
21 |
+
|
22 |
+
def split_docs(documents,chunk_size=1000,chunk_overlap=20):
    """Split *documents* into overlapping chunks with RecursiveCharacterTextSplitter."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
|
26 |
+
|
27 |
+
docs = split_docs(documents)

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# In-memory Chroma index built over the chunked documents.
db = Chroma.from_documents(documents=docs,embedding=embedding_function,)

# Seed the per-session chat history once; the leading system message is
# hidden (history is rendered from index 1 onward).
if "messages2" not in st.session_state:
    st.session_state.messages2 = [
        {"role": "system", "content": "hi"},
        {"role": "assistant", "content": "How May I Help You Todayπ¬?"},
    ]

for entry in st.session_state.messages2[1:]:
    with st.chat_message(entry["role"]):
        st.markdown(entry["content"])

if prompt := st.chat_input("π¬Ask me anything about the documents above!π¦"):
    with st.chat_message("user"):
        st.markdown(prompt)
    st.session_state.messages2.append({"role": "user", "content": prompt})

    # Answer with the content of the closest matching chunk.
    matching_docs = db.similarity_search(prompt)
    answer = matching_docs[0].page_content
    with st.chat_message("assistant"):
        st.markdown(answer)
    st.session_state.messages2.append({"role": "assistant", "content": answer})

# query = "What are the emotional benefits of owning a pet?"
|
pages/8_π§Ή2_Read_PDF.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from utils.st_def import st_logo, st_read_pdf

st_logo(title = "Welcome π to Text Cleaning!", page_title="Text Cleaning",)
st_read_pdf()
#------------------------------------------------------------------------
import openai, PyPDF2, os, time, pandas as pd

if 'pdfreader' not in st.session_state:
    st.error('Load PDF before continue ... ')
else:
    page_text = []  # one cleaned text string per PDF page
    pr = st.session_state['pdfreader']
    with st.spinner('Loading files...'):
        for page in pr.pages:
            pageObj = page.extract_text()  # extract one page's text
            # BUG FIX: the old replace('\t\r', '') only removed the literal
            # two-character tab+CR sequence; strip tabs and carriage returns
            # individually, as the "# tab, enter" comment intended.
            pageObj = pageObj.replace('\t', '').replace('\r', '')
            pageObj = pageObj.replace('\xa0', '')  # non-breaking spaces
            page_text.append(pageObj)  # the whole pdf --> txt

        # Make the extracted pages available to the Summary page.
        st.session_state['page_text'] = page_text
        st.write(page_text)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
streamlit_extras
openai
tenacity
PyPDF2
chromadb
pysqlite3-binary
# Imported by the pages/ scripts but previously missing:
pandas
sentence-transformers
langchain
langchain-community
|