aiXpert committed on
Commit
5a95a6c
·
1 Parent(s): 302db9d
Vector_Embeddings.py ADDED
@@ -0,0 +1,5 @@
+ # Entry page for the PDF Summarizer app
+ from utils.st_def import st_main_contents, st_logo
+
+ st_logo(title='Welcome 👋 to PDF Summarizer!', page_title="PDF Summarizer")
+ st_main_contents()
pages/1_📊1_Just_ChromaDB.py ADDED
@@ -0,0 +1,35 @@
+ import sys, os
+ if "STREAMLIT_SERVER_ENABLED" in os.environ and "IS_STREAMLIT_SERVER" in os.environ:
+     print("server side ---------------------------")
+     # chromadb needs a newer sqlite3 than the server provides; swap in pysqlite3-binary
+     __import__('pysqlite3')
+     sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+ else:
+     print("local side ----------------------------")
+
+ import sqlite3
+ import streamlit as st
+ from streamlit import logger
+ from utils.st_def import st_logo, st_load_book
+ import chromadb
+
+ st_logo(title='Welcome 👋 to Chroma DB!', page_title="Chroma DB")
+ st_load_book()
+ #-----------------------------------------------
+ st.write(logger.get_logger("SMI_APP"))
+ # st.write(f"sys version: {sys.version}")
+ # st.header(f"sqlite version: {sqlite3.sqlite_version}")
+ #-----------------------------------------------
+ chroma_client = chromadb.Client()
+ collection = chroma_client.get_or_create_collection(name="collection1_1")
+ collection.add(
+     documents=["steak", "python", "tiktok", "safety", "health", "environment"],
+     metadatas=[{"source": "food"}, {"source": "programming language"}, {"source": "social media"}, {"source": "government"}, {"source": "body"}, {"source": "living condition"}],
+     ids=["id1", "id2", "id3", "id4", "id5", "id6"]
+ )
+
+ qa = st.text_input('🌐 Ask the Chroma: ')
+ if qa:
+     results = collection.query(query_texts=[qa], n_results=1)
+     st.write(results)
+
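The `st.write(results)` call above dumps the raw query result. For reference, a minimal sketch (not part of this commit) of how that dict is shaped: chromadb returns parallel lists, one entry per query text.

```python
# Hypothetical query against the collection built above.
results = collection.query(query_texts=["what language is used for coding?"], n_results=1)

top_doc = results["documents"][0][0]       # e.g. "python"
top_meta = results["metadatas"][0][0]      # e.g. {"source": "programming language"}
top_dist = results["distances"][0][0]      # smaller distance = closer match
print(top_doc, top_meta, top_dist)
```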
pages/2_💾2_ChromaDB_Collection_Query.py ADDED
@@ -0,0 +1,62 @@
+ # import sys, os
+ # if "STREAMLIT_SERVER_ENABLED" in os.environ and "IS_STREAMLIT_SERVER" in os.environ: __import__('pysqlite3'); sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+
+ import streamlit as st
+ from utils import st_def
+
+ import chromadb  # 0.4.24
+ from chromadb.utils import embedding_functions
+
+ st_def.st_logo(title='Welcome 👋 to Chroma DB!', page_title="Chroma DB")
+ st_def.st_load_book()
+ #-----------------------------------------------
+ EB_MODEL = "all-MiniLM-L6-v2"
+ COL_NAME = "collection2"
+
+ with st.spinner('Loading files...'):
+     client = chromadb.Client()
+     embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EB_MODEL)
+     collection = client.get_or_create_collection(name=COL_NAME, embedding_function=embedding_func, metadata={"hnsw:space": "cosine"})
+     st.markdown("### Documents in Chroma DB")
+     documents = [
+         "The latest iPhone model comes with impressive features and a powerful camera.",
+         "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.",
+         "Einstein's theory of relativity revolutionized our understanding of space and time.",
+         "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.",
+         "The American Revolution had a profound impact on the birth of the United States as a nation.",
+         "Regular exercise and a balanced diet are essential for maintaining good physical health.",
+         "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
+         "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
+         "Startup companies often face challenges in securing funding and scaling their operations.",
+         "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
+         "Toronto is a nice place.",
+     ]
+     genres = ["technology", "travel", "science", "food", "history", "fitness", "art", "climate change", "business", "music", "country"]
+     collection.add(
+         documents=documents,
+         ids=[f"id{i}" for i in range(len(documents))],
+         metadatas=[{"genre": g} for g in genres]
+     )
+
+ for doc, genre in zip(documents, genres):
+     st.write(f"{doc} ({genre})")
+
+ # url = st.text_input('🌐 Ask questions about above: ')
+
+ if "msg1" not in st.session_state:
+     st.session_state.msg1 = []  # chat history for this page
+     st.session_state.msg1.append({"role": "system", "content": "hi"})
+     st.session_state.msg1.append({"role": "assistant", "content": "How may I help you today💬?"})
+
+ for message in st.session_state.msg1[1:]:  # replay everything except the system message
+     with st.chat_message(message["role"]): st.markdown(message["content"])
+
+ if prompt := st.chat_input("💬 Ask me anything about the documents above! 🍦"):
+     with st.chat_message("user"): st.markdown(prompt)
+     st.session_state.msg1.append({"role": "user", "content": prompt})
+
+     response = collection.query(query_texts=[prompt], n_results=2)
+     with st.chat_message("assistant"):
+         st.markdown(response['documents'][0][0])
+         st.markdown(response['metadatas'][0][0]['genre'])
+     st.session_state.msg1.append({"role": "assistant", "content": response['documents'][0][0]})
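Because `collection2` above is created with `metadata={"hnsw:space": "cosine"}`, query results come back with cosine distances, and the genre metadata can double as a filter. A minimal sketch (not part of this commit):

```python
# Hypothetical filtered query against collection2 defined above.
hits = collection.query(
    query_texts=["best travel destinations"],
    n_results=1,
    where={"genre": "travel"},  # restrict the search to documents tagged travel
)
for doc, dist in zip(hits["documents"][0], hits["distances"][0]):
    print(f"{dist:.3f}  {doc}")  # cosine distance: 0.0 means identical direction
```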
pages/3_🫕3_Summary.py ADDED
@@ -0,0 +1,80 @@
+ import streamlit as st, time
+ from utils import st_def, ut_openai
+
+ st_def.st_logo(title="Welcome 👋 to Summary!", page_title="Summary")
+ st_def.st_summary()
+ openai_api_key = st_def.st_sidebar()
+ #------------------------------------------------------------------------
+ def init():
+     if 'page_text' not in st.session_state:
+         st.error('Read a PDF before continuing ...')
+         return False
+     elif not openai_api_key:
+         st.error("Please add your OpenAI API key to continue.")
+         return False
+     else:
+         return True
+
+
+ def combine_chunks(summaries):
+     chunks = []  # pack the entries of "summaries" into strings of at most ~4000 characters
+     summary = ""
+
+     for text in summaries:
+         if len(text) + len(summary) > 4000:
+             chunks.append(summary)
+             summary = ""
+         summary += text
+
+     if summary or not chunks: chunks.append(summary)  # keep the final partial chunk
+     return chunks
+
+
+ def main():
+     if not init(): return
+
+     page_text_array = st.session_state['page_text']  # one entry per PDF page
+     print(f"Summarizing text ... {len(page_text_array)} pages")
+
+     combined_summaries = combine_chunks(page_text_array)
+     print(f"Found {len(combined_summaries)} chunks to summarize.")
+
+     iterations = 1
+     while True:
+         if len(combined_summaries) <= 1: break
+
+         summaries_of_summaries = []
+         # summarize each chunk, then re-chunk and repeat until a single chunk remains
+         for i, summary in enumerate(combined_summaries):
+             prompt = f"""
+             Your task is to extract relevant information from the text on a page of a book. This information will be used to create a book summary.
+             Extract relevant information from the following text, which is delimited with triple backticks.
+             Be sure to preserve the important details.
+             Text: ```{combined_summaries[i]}```
+             """
+             # st.write(f"Summarizing {i + 1} of {len(combined_summaries)}, iteration {iterations}...")
+             st.markdown(f'<span style="color:blue">Summarizing {i + 1} of {len(combined_summaries)}, iteration {iterations}...</span>', unsafe_allow_html=True)
+             sum_page = ut_openai.aichat(openai_api_key=openai_api_key, messages=[{"role": "user", "content": prompt}])
+             summaries_of_summaries.append(sum_page)
+             st.text(sum_page)
+             time.sleep(2)  # the free tier allows only ~3 requests per minute, so pause between calls
+
+         st.write('summaries_of_summaries')
+         st.write(summaries_of_summaries)
+         combined_summaries = combine_chunks(summaries_of_summaries)
+         st.text('combined_summaries')
+         st.text(combined_summaries)
+         iterations += 1
+
+     # summarize the last remaining chunk
+
+     with st.spinner("Summarizing last chunk..."):
+         final_summary = ut_openai.aichat(openai_api_key=openai_api_key, messages=[{"role": "user", "content": combined_summaries[0]}])
+         st.header("Final Summary")
+         st.write(final_summary)
+         st.success("🚨 Cheers!")
+
+
+ if __name__ == "__main__":
+     main()
+
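A quick check (not part of this commit) of how `combine_chunks` behaves once the final partial chunk is kept; the page sizes are made up:

```python
# Three fake "pages": the first two fit in one ~4000-character chunk,
# the third forces a split, and the remainder is no longer dropped.
pages = ["a" * 1500, "b" * 1500, "c" * 3000]
print([len(c) for c in combine_chunks(pages)])  # -> [3000, 3000]
```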
pages/5_2_ChromaDB_LangChain_directory.py ADDED
@@ -0,0 +1,50 @@
+ from langchain_community.vectorstores import Chroma
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
+
+
+ import streamlit as st, os
+ from utils import st_def, utilities
+ openai_api_key = st_def.st_sidebar()
+
+ def load_docs(directory):
+     loader = DirectoryLoader(directory)
+     documents = loader.load()
+     return documents
+
+
+ with st.spinner('Loading files...'):
+     documents = load_docs('data/pets_txt/')
+     file_names = [os.path.basename(doc.metadata['source']) for doc in documents]
+     st.write('\n\n'.join(file_names))
+
+ def split_docs(documents, chunk_size=1000, chunk_overlap=20):
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     docs = text_splitter.split_documents(documents)
+     return docs
+
+ docs = split_docs(documents)
+
+ embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ db = Chroma.from_documents(documents=docs, embedding=embedding_function)
+
+ if "messages2" not in st.session_state:
+     st.session_state.messages2 = []  # chat history for this page
+     st.session_state.messages2.append({"role": "system", "content": "hi"})
+     st.session_state.messages2.append({"role": "assistant", "content": "How may I help you today💬?"})
+
+ for message in st.session_state.messages2[1:]:  # replay everything except the system message
+     with st.chat_message(message["role"]): st.markdown(message["content"])
+
+ if prompt := st.chat_input("💬 Ask me anything about the documents above! 🍦"):
+     with st.chat_message("user"): st.markdown(prompt)
+     st.session_state.messages2.append({"role": "user", "content": prompt})
+
+     matching_docs = db.similarity_search(prompt)
+     with st.chat_message("assistant"):
+         st.markdown(matching_docs[0].page_content)
+     st.session_state.messages2.append({"role": "assistant", "content": matching_docs[0].page_content})
+
+ # query = "What are the emotional benefits of owning a pet?"
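The chat above shows only the best match's text. A minimal sketch (not part of this commit) of pulling the relevance scores too, which helps when deciding whether the top match is worth showing:

```python
# similarity_search_with_score returns (Document, distance) pairs; lower is closer.
hits = db.similarity_search_with_score("What are the emotional benefits of owning a pet?", k=2)
for doc, score in hits:
    print(f"{score:.3f}  {os.path.basename(doc.metadata['source'])}")
```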
pages/8_🧹2_Read_PDF.py ADDED
@@ -0,0 +1,25 @@
+ import streamlit as st
+ from utils.st_def import st_logo, st_read_pdf
+
+ st_logo(title="Welcome 👋 to Text Cleaning!", page_title="Text Cleaning")
+ st_read_pdf()
+ #------------------------------------------------------------------------
+ import openai, PyPDF2, os, time, pandas as pd
+
+ if 'pdfreader' not in st.session_state:
+     st.error('Load a PDF before continuing ...')
+ else:
+     page_text = []  # one entry per page
+     pr = st.session_state['pdfreader']
+     with st.spinner('Loading files...'):
+         for i in range(len(pr.pages)):
+             # extract one page's text
+             pageObj = pr.pages[i].extract_text()
+             pageObj = pageObj.replace('\t\r', '')   # stray tab + carriage-return pairs
+             pageObj = pageObj.replace('\xa0', '')   # non-breaking spaces
+             # append the cleaned page so the whole PDF ends up as a list of strings
+             page_text.append(pageObj)
+
+     st.session_state['page_text'] = page_text
+     st.write(page_text)
+
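This page assumes an earlier step (presumably `st_read_pdf` or a loader page in `utils`) stored a PyPDF2 reader in session state. A hypothetical sketch of that upstream step, not taken from this commit:

```python
import PyPDF2
import streamlit as st

# Hypothetical uploader: stores the reader under the 'pdfreader' key this page expects.
uploaded = st.file_uploader("Upload a PDF", type="pdf")
if uploaded:
    st.session_state['pdfreader'] = PyPDF2.PdfReader(uploaded)
```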
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ streamlit_extras
+ openai
+ tenacity
+ PyPDF2
+ chromadb
+ pysqlite3-binary
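Note: the pages in this commit also import `langchain`, `langchain_community`, `pandas`, and pull in sentence-transformers through `SentenceTransformerEmbeddingFunction`, none of which are listed here. The likely-missing lines, assuming the standard PyPI package names:

```
langchain
langchain-community
sentence-transformers
pandas
```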