init
Browse files- Vector_Embeddings.py +5 -0
- pages/1_π1_Just_ChromaDB.py +35 -0
- pages/2_πΎ2_ChromaDB_Collection_Query.py +62 -0
- pages/3_π«3_Summary.py +80 -0
- pages/5_2_ChromaDB_LangChain_directory.py +50 -0
- pages/8_π§Ή2_Read_PDF.py +25 -0
- requirements.txt +7 -0
Vector_Embeddings.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Landing page: renders the app logo/title and the main table of contents."""
from utils.st_def import st_logo, st_main_contents

st_logo(title='Welcome π to PDF Summarizer!', page_title="PDF Summarizer",)
st_main_contents()
|
pages/1_π1_Just_ChromaDB.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys, os

# On Streamlit's hosted runtime the system sqlite3 is too old for chromadb,
# so swap in pysqlite3-binary BEFORE anything imports sqlite3.
if "STREAMLIT_SERVER_ENABLED" in os.environ and "IS_STREAMLIT_SERVER" in os.environ:
    print("server side---------------------------")
    __import__('pysqlite3')
    sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
else:
    print("local ----------------------------------side ")

import sqlite3
import streamlit as st
from streamlit import logger
from utils.st_def import st_logo, st_load_book
import chromadb

st_logo(title='Welcome π to Chroma DB!', page_title="Chroma DB ",)
st_load_book()
#-----------------------------------------------
st.write(logger.get_logger("SMI_APP"))
# st.write(f"sys version: {sys.version}")
# st.header(f"sqlite version: {sqlite3.sqlite_version}")
#-----------------------------------------------
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="collection1_1")
# upsert, not add: Streamlit re-executes this script top-to-bottom on every
# interaction, and add() complains about the already-existing fixed ids on
# the second run; upsert() is idempotent across reruns.
collection.upsert(
    documents=["steak", "python", "tiktok", "safety", "health", "environment"],
    metadatas=[{"source": "food"}, {"source": "progamming language"}, {"source": "social media"}, {"source": "government"}, {"source": "body"}, {"source": "living condition"}],
    ids=["id1", "id2", "id3", "id4", "id5", "id6"]
)

qa = st.text_input('π Ask the Chroma: ')
if qa:
    # n_results=1: return only the single nearest document for the query.
    results = collection.query(query_texts=[qa], n_results=1)
    st.write(results)
|
34 |
+
|
35 |
+
|
pages/2_πΎ2_ChromaDB_Collection_Query.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import sys, os
# if "STREAMLIT_SERVER_ENABLED" in os.environ and "IS_STREAMLIT_SERVER" in os.environ: __import__('pysqlite3'); sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

import streamlit as st
from utils import st_def

import chromadb #0.4.24
from chromadb.utils import embedding_functions

st_def.st_logo(title='Welcome π to Chroma DB!', page_title="Chroma DB ",)
st_def.st_load_book()
#-----------------------------------------------
EB_MODEL = "all-MiniLM-L6-v2"   # sentence-transformers embedding model
COL_NAME = "collection2"

with st.spinner('Loading files...'):
    client = chromadb.Client()
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=EB_MODEL)
    # cosine distance space for the HNSW index
    collection = client.get_or_create_collection(name=COL_NAME, embedding_function=embedding_func, metadata={"hnsw:space": "cosine"},)
    st.markdown("### Documents in Chroma DB")
    documents = [
        "The latest iPhone model comes with impressive features and a powerful camera.",
        "Exploring the beautiful beaches and vibrant culture of Bali is a dream for many travelers.",
        "Einstein's theory of relativity revolutionized our understanding of space and time.",
        "Traditional Italian pizza is famous for its thin crust, fresh ingredients, and wood-fired ovens.",
        "The American Revolution had a profound impact on the birth of the United States as a nation.",
        "Regular exercise and a balanced diet are essential for maintaining good physical health.",
        "Leonardo da Vinci's Mona Lisa is considered one of the most iconic paintings in art history.",
        "Climate change poses a significant threat to the planet's ecosystems and biodiversity.",
        "Startup companies often face challenges in securing funding and scaling their operations.",
        "Beethoven's Symphony No. 9 is celebrated for its powerful choral finale, 'Ode to Joy.'",
        "Toronto is a nice place.",
    ]
    # one genre label per document, used as metadata
    genres = ["technology","travel","science","food","history","fitness", "art","climate change","business","music","country",]
    # upsert, not add: Streamlit reruns this script on every interaction and
    # add() complains about the already-existing ids the second time around.
    collection.upsert(
        documents=documents,
        ids=[f"id{i}" for i in range(len(documents))],
        metadatas=[{"genre": g} for g in genres]
    )

    for doc, genre in zip(documents, genres):
        st.write(f"{doc} ( {genre})")
|
43 |
+
|
44 |
+
# url = st.text_input('π Ask questions about above: ')
|
45 |
+
|
46 |
+
if "msg1" not in st.session_state:
|
47 |
+
st.session_state.msg1 = [] #111
|
48 |
+
st.session_state.msg1.append({"role": "system", 'content': "hi"})
|
49 |
+
st.session_state.msg1.append({"role": "assistant", "content": "How May I Help You Todayπ¬?"})
|
50 |
+
|
51 |
+
for message in st.session_state.msg1[1:]:
|
52 |
+
with st.chat_message(message["role"]): st.markdown(message["content"]) #222
|
53 |
+
|
54 |
+
if prompt := st.chat_input("π¬Ask me anything about the documents above!π¦"):
|
55 |
+
with st.chat_message("user"): st.markdown(prompt)
|
56 |
+
st.session_state.msg1.append({"role": "user", "content": prompt})
|
57 |
+
|
58 |
+
response = collection.query(query_texts=[f"{prompt}"], n_results=2,)
|
59 |
+
with st.chat_message("assistant"):
|
60 |
+
st.markdown(response['documents'][0][0])
|
61 |
+
st.markdown(response['metadatas'][0][0]['genre'])
|
62 |
+
st.session_state.msg1.append({"role": "assistant", "content": response['documents'][0][0]})
|
pages/3_π«3_Summary.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st, time
from utils import st_def, ut_openai

# Page chrome: logo/title, the static summary blurb, and the sidebar widget
# that collects the user's OpenAI API key (consumed by init()/main() below).
st_def.st_logo(title = "Welcome π to Summary!", page_title="Summary",)
st_def.st_summary()
openai_api_key= st_def.st_sidebar()
#------------------------------------------------------------------------
|
8 |
+
def init():
    """Check the preconditions for summarization.

    Requires PDF text already extracted into ``st.session_state['page_text']``
    (by the Read PDF page) and a non-empty OpenAI API key from the sidebar.
    Shows an error and returns False when either is missing; True otherwise.
    """
    if 'page_text' not in st.session_state:
        st.error('Read PDF before continue ... ')
        return False
    if not openai_api_key:
        st.error("Please add your OpenAI API key to continue.")
        return False
    return True
|
17 |
+
|
18 |
+
|
19 |
+
def combine_chunks(summaries):
    """Pack consecutive strings from *summaries* into chunks of at most
    ~4000 characters (a chunk may exceed 4000 only when a single input
    string is itself longer than 4000).

    Fixes two bugs in the previous version:
    - the trailing partial chunk was silently dropped whenever at least one
      full chunk had already been emitted, losing input text;
    - an empty chunk was appended when a single piece exceeded 4000 chars.

    Returns a non-empty list; for empty input it returns [""] (matching the
    original behavior that callers rely on).
    """
    chunks = []
    current = ""
    for piece in summaries:
        # Start a new chunk only if the current one is non-empty and would
        # overflow; an oversized single piece becomes its own chunk.
        if current and len(piece) + len(current) > 4000:
            chunks.append(current)
            current = ""
        current += piece
    # Flush the remainder (previously lost unless chunks was empty).
    if current or not chunks:
        chunks.append(current)
    return chunks
|
31 |
+
|
32 |
+
|
33 |
+
def main():
    """Iteratively summarize the PDF pages stored in session state.

    Packs the pages into ~4000-char chunks, asks the OpenAI chat API to
    extract the relevant information from each chunk, then re-combines the
    partial summaries and repeats until a single chunk remains, which is
    sent once more for the final summary.
    """
    if not init(): return

    page_text_array = st.session_state['page_text'] # array, store pages. len(text) is pages.
    print("Summarizing text..."+str(len(page_text_array)))

    combined_summaries = combine_chunks(page_text_array)
    print("Found " + str(len(combined_summaries)) + " chunks to summarize.")

    iterations = 1
    while True:
        # Stop condensing once everything fits in a single chunk.
        if len(combined_summaries) <= 1: break

        summaries_of_summaries = []
        # print summaries
        for i, summary in enumerate(combined_summaries):
            # NOTE(review): the prompt body below is part of an f-string, so its
            # leading whitespace is sent to the model verbatim.
            prompt =f"""
            Your task is to extract relevant information from a text on the page of a book. This information will be used to create a book summary.
            Extract relevant information from the following text, which is delimited with triple backticks.\
            Be sure to preserve the important details.
            Text: ```{combined_summaries[i]}```
            """
            # st.write(f"Summarizing {i + 1} of {len(combined_summaries)}, iteration {iterations}...")
            st.markdown(f'<span style="color:blue">Summarizing {i + 1} of {len(combined_summaries)}, iteration {iterations}...</span>', unsafe_allow_html=True)
            sum_page = ut_openai.aichat(openai_api_key=openai_api_key, messages = [{"role": "user", "content": prompt},])
            summaries_of_summaries.append(sum_page)
            st.text(sum_page)
            time.sleep(2) #You can query the model only 3 times in a minute for free, so we need to put some delay

        st.write('summaries_of_summaries')
        st.write(summaries_of_summaries)
        # Re-pack the partial summaries for the next condensation round.
        combined_summaries = combine_chunks(summaries_of_summaries)
        st.text('combined_summaries')
        st.text(combined_summaries)
        iterations += 1

    # summarize last chunk

    # NOTE(review): the final chunk is sent as-is, with no summarization
    # instruction wrapped around it — confirm this is intended.
    with st.spinner("Summarizing last chunk..."):
        final_summary = ut_openai.aichat(openai_api_key=openai_api_key, messages = [{"role": "user", "content": combined_summaries[0]},])
    st.header("Final Summary")
    st.write(final_summary)
    st.success("π¨Cheers!")


if __name__ == "__main__":
    main()
|
80 |
+
|
pages/5_2_ChromaDB_LangChain_directory.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.vectorstores import Chroma
# (fixed: SentenceTransformerEmbeddings was imported twice — exact duplicate removed)
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

import streamlit as st, os
from utils import st_def, utilities
# Renders the sidebar and returns the user's OpenAI API key.
openai_api_key = st_def.st_sidebar()
|
10 |
+
|
11 |
+
def load_docs(directory):
    """Load every document found under *directory* using LangChain's DirectoryLoader."""
    return DirectoryLoader(directory).load()
|
15 |
+
|
16 |
+
|
17 |
+
# Load the sample text corpus and show the source file names.
with st.spinner('Loading files...'):
    documents = load_docs('data/pets_txt/')
    st.write('\n\n'.join(os.path.basename(d.metadata['source']) for d in documents))
|
21 |
+
|
22 |
+
def split_docs(documents,chunk_size=1000,chunk_overlap=20):
    """Split *documents* into overlapping chunks with RecursiveCharacterTextSplitter."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
|
26 |
+
|
27 |
+
docs = split_docs(documents)

embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# In-memory Chroma index built over the chunked documents.
db = Chroma.from_documents(documents=docs,embedding=embedding_function,)

# Seed the per-session chat history once; the leading system message is
# hidden (history is rendered from index 1 onward).
if "messages2" not in st.session_state:
    st.session_state.messages2 = [
        {"role": "system", "content": "hi"},
        {"role": "assistant", "content": "How May I Help You Todayπ¬?"},
    ]

for entry in st.session_state.messages2[1:]:
    with st.chat_message(entry["role"]):
        st.markdown(entry["content"])

if prompt := st.chat_input("π¬Ask me anything about the documents above!π¦"):
    with st.chat_message("user"):
        st.markdown(prompt)
    st.session_state.messages2.append({"role": "user", "content": prompt})

    # Answer with the content of the closest matching chunk.
    matching_docs = db.similarity_search(prompt)
    answer = matching_docs[0].page_content
    with st.chat_message("assistant"):
        st.markdown(answer)
    st.session_state.messages2.append({"role": "assistant", "content": answer})

# query = "What are the emotional benefits of owning a pet?"
|
pages/8_π§Ή2_Read_PDF.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from utils.st_def import st_logo, st_read_pdf

st_logo(title = "Welcome π to Text Cleaning!", page_title="Text Cleaning",)
st_read_pdf()
#------------------------------------------------------------------------
import openai, PyPDF2, os, time, pandas as pd

if 'pdfreader' not in st.session_state:
    st.error('Load PDF before continue ... ')
else:
    page_text = []  # one cleaned text string per PDF page
    pr = st.session_state['pdfreader']
    with st.spinner('Loading files...'):
        for page in pr.pages:
            pageObj = page.extract_text()  # extract one page's text
            # BUG FIX: the old replace('\t\r', '') only removed the literal
            # two-character tab+CR sequence; strip tabs and carriage returns
            # individually, as the "# tab, enter" comment intended.
            pageObj = pageObj.replace('\t', '').replace('\r', '')
            pageObj = pageObj.replace('\xa0', '')  # non-breaking spaces
            page_text.append(pageObj)  # the whole pdf --> txt

        # Make the extracted pages available to the Summary page.
        st.session_state['page_text'] = page_text
        st.write(page_text)
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
streamlit_extras
openai
tenacity
PyPDF2
chromadb
pysqlite3-binary
# Imported by the pages/ scripts but previously missing:
pandas
sentence-transformers
langchain
langchain-community
|