Spaces:

Yozora721
/

pnp-chatbot-v1

Running

App Files Files Community

FauziIsyrinApridal committed on May 28

Commit

1c19c94

1 Parent(s): 40eca06

update penyimpanan vectore_store ke supabase

Browse files

Files changed (2) hide show

app.py +68 -30
app/document_processor.py +74 -24

app.py CHANGED Viewed

@@ -1,12 +1,16 @@
 import streamlit as st
 import os
-from dotenv import load_dotenv
 from langsmith import traceable
 from app.chat import initialize_session_state, display_chat_history
 from app.data_loader import get_data, load_docs
-from app.document_processor import process_documents, save_vector_store, load_vector_store
 from app.prompts import sahabat_prompt
 from langchain_community.llms import Replicate
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
@@ -14,7 +18,9 @@ from langchain_community.document_transformers import LongContextReorder
 load_dotenv()
-VECTOR_STORE_PATH = "vector_store_data"
 DATA_DIR = "data"
 @traceable(name="Create RAG Conversational Chain")
@@ -23,21 +29,21 @@ def create_conversational_chain(vector_store):
         model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
         model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 6000}
     )
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True,
         output_key='answer'
     )
     chain = ConversationalRetrievalChain.from_llm(
         llm,
-        retriever=vector_store.as_retriever(search_kwargs={"k": 6}),
         combine_docs_chain_kwargs={"prompt": sahabat_prompt},
         return_source_documents=True,
         memory=memory
     )
     return chain
 def reorder_embedding(docs):
@@ -53,44 +59,76 @@ def get_latest_data_timestamp(folder):
             latest_time = max(latest_time, file_time)
     return latest_time
 def vector_store_is_outdated():
-    if not os.path.exists(VECTOR_STORE_PATH):
         return True
-    vector_store_time = os.path.getmtime(VECTOR_STORE_PATH)
     data_time = get_latest_data_timestamp(DATA_DIR)
-    return data_time > vector_store_time
 @traceable(name="Main Chatbot RAG App")
 def main():
     initialize_session_state()
     get_data()
-    vector_store = None  # Inisialisasi dulu
     if len(st.session_state['history']) == 0:
         if vector_store_is_outdated():
-            docs = load_docs()
-            if len(docs) > 0:
-                reordered_docs = reorder_embedding(docs)
-                vector_store = process_documents(reordered_docs)
-                save_vector_store(vector_store)
-            else:
-                st.warning("Tidak ada dokumen ditemukan di folder 'data/'. Chatbot tetap bisa digunakan, tapi tanpa konteks dokumen.")
-                vector_store = None
         else:
-            # Jika vector_store tidak outdated dan history kosong,
-            # harus tetap load vector_store dari penyimpanan
-            vector_store = load_vector_store()
     else:
-        vector_store = load_vector_store()
     st.session_state['vector_store'] = vector_store
     if st.session_state['vector_store'] is not None:
         chain = create_conversational_chain(st.session_state['vector_store'])
         display_chat_history(chain)
 if __name__ == "__main__":
-    main()

 import streamlit as st
 import os
+import tempfile
+import zipfile
+from dotenv import load_dotenv
 from langsmith import traceable
 from app.chat import initialize_session_state, display_chat_history
 from app.data_loader import get_data, load_docs
+from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase
 from app.prompts import sahabat_prompt
+from app.db import supabase
 from langchain_community.llms import Replicate
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 load_dotenv()
+# Supabase configuration
+BUCKET_NAME = "pnp-bot-storage-archive"
+VECTOR_STORE_FILE = "vector_store.zip"
 DATA_DIR = "data"
 @traceable(name="Create RAG Conversational Chain")
         model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
         model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 6000}
     )
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True,
         output_key='answer'
     )
     chain = ConversationalRetrievalChain.from_llm(
         llm,
+        retriever=vector_store.as_retriever(search_kwargs={"k": 10}),
         combine_docs_chain_kwargs={"prompt": sahabat_prompt},
         return_source_documents=True,
         memory=memory
     )
     return chain
 def reorder_embedding(docs):
             latest_time = max(latest_time, file_time)
     return latest_time
+def get_supabase_vector_store_timestamp():
+    """Get the timestamp of vector store in Supabase storage"""
+    try:
+        response = supabase.storage.from_(BUCKET_NAME).list()
+        for file in response:
+            if file['name'] == VECTOR_STORE_FILE:
+                return file['updated_at']
+        return None
+    except Exception as e:
+        print(f"Error getting Supabase timestamp: {e}")
+        return None
 def vector_store_is_outdated():
+    """Check if vector store needs to be updated based on data folder changes"""
+    supabase_timestamp = get_supabase_vector_store_timestamp()
+    if supabase_timestamp is None:
         return True
+    # Convert supabase timestamp to epoch time for comparison
+    from datetime import datetime
+    supabase_time = datetime.fromisoformat(supabase_timestamp.replace('Z', '+00:00')).timestamp()
     data_time = get_latest_data_timestamp(DATA_DIR)
+    return data_time > supabase_time
 @traceable(name="Main Chatbot RAG App")
 def main():
+    """Run the chatbot page: obtain a vector store, then show the chat UI.
+
+    With an empty chat history the vector store is either rebuilt from the
+    loaded documents and uploaded to Supabase (when it is reported as
+    outdated) or downloaded from Supabase. With a non-empty history the store
+    kept in ``st.session_state`` is reused, falling back to a download. The
+    chat is only displayed when a vector store is available.
+    """
     initialize_session_state()
     get_data()
+    vector_store = None  # Stays None when no branch below produces a store
     if len(st.session_state['history']) == 0:
         if vector_store_is_outdated():
+            with st.spinner("Loading and processing documents..."):
+                docs = load_docs()
+                if len(docs) > 0:
+                    reordered_docs = reorder_embedding(docs)
+                    vector_store = process_documents(reordered_docs)
+                    # Persist the rebuilt store to Supabase. A failed upload is
+                    # only reported; the freshly built store is still used for
+                    # this run.
+                    with st.spinner("Uploading vector store to Supabase..."):
+                        success = save_vector_store_to_supabase(vector_store, supabase, BUCKET_NAME, VECTOR_STORE_FILE)
+                        if success:
+                            st.success("Vector store uploaded to Supabase successfully!")
+                        else:
+                            st.error("Failed to upload vector store to Supabase")
+                else:
+                    st.warning("No documents found in 'data/' folder. Chatbot can still be used, but without document context.")
+                    vector_store = None
         else:
+            # The stored archive is current: download it instead of rebuilding
+            with st.spinner("Loading vector store from Supabase..."):
+                vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_FILE)
+                if vector_store:
+                    st.success("Vector store loaded from Supabase successfully!")
+                else:
+                    st.error("Failed to load vector store from Supabase")
     else:
+        # A conversation is already in progress: reuse the store kept in session state
+        vector_store = st.session_state.get('vector_store')
+        if vector_store is None:
+            # Fallback: nothing cached in the session, so download from Supabase
+            vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_FILE)
     st.session_state['vector_store'] = vector_store
     if st.session_state['vector_store'] is not None:
         chain = create_conversational_chain(st.session_state['vector_store'])
         display_chat_history(chain)
 if __name__ == "__main__":
+    main()

app/document_processor.py CHANGED Viewed

@@ -2,37 +2,88 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 import os
-VECTOR_STORE_PATH = "vector_store_data"
-def save_vector_store(vector_store):
-    """Simpan vector store ke file."""
-    vector_store.save_local(VECTOR_STORE_PATH)
-    print(f"Vector store saved to {VECTOR_STORE_PATH}")
-def load_vector_store():
-    """Muat vector store dari file, atau return None kalau file tidak ada."""
-    if os.path.exists(VECTOR_STORE_PATH):
-        embeddings = HuggingFaceEmbeddings(
-            model_name="LazarusNLP/all-indo-e5-small-v4",
-            model_kwargs={"device": "cpu"},
-            encode_kwargs={"normalize_embeddings": True}
-        )
-        vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
-        print(f"Vector store loaded from {VECTOR_STORE_PATH}")
-        return vector_store
-    else:
-        print("Vector store file not found.")
         return None
 def process_documents(docs):
     embeddings = HuggingFaceEmbeddings(
         model_name="LazarusNLP/all-indo-e5-small-v4",
-        model_kwargs={"device": "cpu"},
         encode_kwargs={"normalize_embeddings": True}
     )
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500,
         chunk_overlap=300
@@ -40,5 +91,4 @@ def process_documents(docs):
     text_chunks = text_splitter.split_documents(docs)
     vector_store = FAISS.from_documents(text_chunks, embeddings)
-    return vector_store

 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 import os
+import tempfile
+import zipfile
+import streamlit as st
+def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_name):
+    """Save vector store to Supabase storage as a zip file."""
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save vector store locally first
+            local_path = os.path.join(temp_dir, "vector_store")
+            vector_store.save_local(local_path)
+            # Create zip file
+            zip_path = os.path.join(temp_dir, "vector_store.zip")
+            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                for root, dirs, files in os.walk(local_path):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        arc_name = os.path.relpath(file_path, local_path)
+                        zipf.write(file_path, arc_name)
+            # Upload to Supabase
+            with open(zip_path, 'rb') as f:
+                response = supabase.storage.from_(bucket_name).upload(file_name, f, {"upsert": "true"})
+            print(f"Vector store uploaded to Supabase: {bucket_name}/{file_name}")
+            return True
+    except Exception as e:
+        print(f"Error uploading vector store to Supabase: {e}")
+        st.error(f"Error uploading to Supabase: {e}")
+        return False
+def load_vector_store_from_supabase(supabase, bucket_name, file_name):
+    """Load vector store from Supabase storage.
+
+    Downloads ``file_name`` from ``bucket_name``, unpacks the zip archive into
+    a temporary directory and loads the result with ``FAISS.load_local``.
+
+    Args:
+        supabase: Supabase client used for the download.
+        bucket_name: Name of the storage bucket holding the archive.
+        file_name: Object name of the zipped vector store.
+
+    Returns:
+        The loaded vector store, or ``None`` when the download is empty or
+        any step raises (the error is printed and shown with ``st.error``).
+    """
+    try:
+        # Download from Supabase
+        response = supabase.storage.from_(bucket_name).download(file_name)
+        if not response:
+            print("Vector store file not found in Supabase.")
+            return None
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save downloaded zip file
+            zip_path = os.path.join(temp_dir, "vector_store.zip")
+            with open(zip_path, 'wb') as f:
+                f.write(response)
+            # Extract zip file
+            extract_path = os.path.join(temp_dir, "vector_store")
+            with zipfile.ZipFile(zip_path, 'r') as zipf:
+                zipf.extractall(extract_path)
+            # Load vector store while the temporary directory still exists;
+            # it is deleted as soon as the enclosing "with" block exits.
+            # The embedding settings use the same model name as
+            # process_documents, which builds the store.
+            embeddings = HuggingFaceEmbeddings(
+                model_name="LazarusNLP/all-indo-e5-small-v4",
+                model_kwargs={"device": "cpu"},
+                encode_kwargs={"normalize_embeddings": True}
+            )
+            # NOTE(review): allow_dangerous_deserialization=True opts in to
+            # deserialization that is unsafe for untrusted input. This is only
+            # acceptable if nothing but this application can write to the
+            # bucket -- confirm the bucket's access policy.
+            vector_store = FAISS.load_local(
+                extract_path,
+                embeddings,
+                allow_dangerous_deserialization=True
+            )
+            print(f"Vector store loaded from Supabase: {bucket_name}/{file_name}")
+            return vector_store
+    except Exception as e:
+        print(f"Error loading vector store from Supabase: {e}")
+        st.error(f"Error loading from Supabase: {e}")
         return None
 def process_documents(docs):
     embeddings = HuggingFaceEmbeddings(
         model_name="LazarusNLP/all-indo-e5-small-v4",
+        model_kwargs={"device": "cpu"},
         encode_kwargs={"normalize_embeddings": True}
     )
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500,
         chunk_overlap=300
     text_chunks = text_splitter.split_documents(docs)
     vector_store = FAISS.from_documents(text_chunks, embeddings)
+    return vector_store