FauziIsyrinApridal committed on
Commit 1c19c94 · 1 Parent(s): 40eca06

Update vector_store storage to Supabase

Files changed (2)
  1. app.py +68 -30
  2. app/document_processor.py +74 -24
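Note: the new app.py imports `from app.db import supabase`, a client module that is not part of this commit. A minimal sketch of what such a module might look like is shown below; the environment variable names are assumptions, not taken from the repository.

# app/db.py (not in this commit) -- assumed Supabase client setup
import os
from supabase import create_client, Client

supabase: Client = create_client(
    os.environ["SUPABASE_URL"],   # assumed variable name
    os.environ["SUPABASE_KEY"]    # assumed variable name (service or anon key)
)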
app.py CHANGED
@@ -1,12 +1,16 @@
 import streamlit as st
 import os
-from dotenv import load_dotenv
 from langsmith import traceable

 from app.chat import initialize_session_state, display_chat_history
 from app.data_loader import get_data, load_docs
-from app.document_processor import process_documents, save_vector_store, load_vector_store
 from app.prompts import sahabat_prompt
 from langchain_community.llms import Replicate
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
@@ -14,7 +18,9 @@ from langchain_community.document_transformers import LongContextReorder

 load_dotenv()

-VECTOR_STORE_PATH = "vector_store_data"
 DATA_DIR = "data"

 @traceable(name="Create RAG Conversational Chain")
@@ -23,21 +29,21 @@ def create_conversational_chain(vector_store):
         model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
         model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 6000}
     )
-
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True,
         output_key='answer'
     )
-
     chain = ConversationalRetrievalChain.from_llm(
         llm,
-        retriever=vector_store.as_retriever(search_kwargs={"k": 6}),
         combine_docs_chain_kwargs={"prompt": sahabat_prompt},
         return_source_documents=True,
         memory=memory
     )
-
     return chain

 def reorder_embedding(docs):
@@ -53,44 +59,76 @@ def get_latest_data_timestamp(folder):
             latest_time = max(latest_time, file_time)
     return latest_time

 def vector_store_is_outdated():
-    if not os.path.exists(VECTOR_STORE_PATH):
         return True
-    vector_store_time = os.path.getmtime(VECTOR_STORE_PATH)
     data_time = get_latest_data_timestamp(DATA_DIR)
-    return data_time > vector_store_time

 @traceable(name="Main Chatbot RAG App")
 def main():
     initialize_session_state()
     get_data()
-
-    vector_store = None  # Initialize first
-
     if len(st.session_state['history']) == 0:
         if vector_store_is_outdated():
-            docs = load_docs()
-            if len(docs) > 0:
-                reordered_docs = reorder_embedding(docs)
-                vector_store = process_documents(reordered_docs)
-                save_vector_store(vector_store)
-            else:
-                st.warning("No documents found in 'data/' folder. Chatbot can still be used, but without document context.")
-                vector_store = None
         else:
-            # If the vector_store is not outdated and the history is empty,
-            # it still has to be loaded from storage
-            vector_store = load_vector_store()
     else:
-        vector_store = load_vector_store()
-
     st.session_state['vector_store'] = vector_store
-
     if st.session_state['vector_store'] is not None:
         chain = create_conversational_chain(st.session_state['vector_store'])
         display_chat_history(chain)

-
 if __name__ == "__main__":
-    main()
-
 
+
 import streamlit as st
 import os
+import tempfile
+import zipfile
+from dotenv import load_dotenv
 from langsmith import traceable

 from app.chat import initialize_session_state, display_chat_history
 from app.data_loader import get_data, load_docs
+from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase
 from app.prompts import sahabat_prompt
+from app.db import supabase
 from langchain_community.llms import Replicate
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain

 load_dotenv()

+# Supabase configuration
+BUCKET_NAME = "pnp-bot-storage-archive"
+VECTOR_STORE_FILE = "vector_store.zip"
 DATA_DIR = "data"

 @traceable(name="Create RAG Conversational Chain")
         model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
         model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 6000}
     )
+
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True,
         output_key='answer'
     )
+
     chain = ConversationalRetrievalChain.from_llm(
         llm,
+        retriever=vector_store.as_retriever(search_kwargs={"k": 10}),
         combine_docs_chain_kwargs={"prompt": sahabat_prompt},
         return_source_documents=True,
         memory=memory
     )
+
     return chain

 def reorder_embedding(docs):
             latest_time = max(latest_time, file_time)
     return latest_time

+def get_supabase_vector_store_timestamp():
+    """Get the timestamp of vector store in Supabase storage"""
+    try:
+        response = supabase.storage.from_(BUCKET_NAME).list()
+        for file in response:
+            if file['name'] == VECTOR_STORE_FILE:
+                return file['updated_at']
+        return None
+    except Exception as e:
+        print(f"Error getting Supabase timestamp: {e}")
+        return None
+
 def vector_store_is_outdated():
+    """Check if vector store needs to be updated based on data folder changes"""
+    supabase_timestamp = get_supabase_vector_store_timestamp()
+    if supabase_timestamp is None:
         return True
+
+    # Convert supabase timestamp to epoch time for comparison
+    from datetime import datetime
+    supabase_time = datetime.fromisoformat(supabase_timestamp.replace('Z', '+00:00')).timestamp()
     data_time = get_latest_data_timestamp(DATA_DIR)
+
+    return data_time > supabase_time

 @traceable(name="Main Chatbot RAG App")
 def main():
     initialize_session_state()
     get_data()
+
+    vector_store = None  # Initialize first
+
     if len(st.session_state['history']) == 0:
         if vector_store_is_outdated():
+            with st.spinner("Loading and processing documents..."):
+                docs = load_docs()
+                if len(docs) > 0:
+                    reordered_docs = reorder_embedding(docs)
+                    vector_store = process_documents(reordered_docs)
+
+                    # Save to Supabase instead of local storage
+                    with st.spinner("Uploading vector store to Supabase..."):
+                        success = save_vector_store_to_supabase(vector_store, supabase, BUCKET_NAME, VECTOR_STORE_FILE)
+                        if success:
+                            st.success("Vector store uploaded to Supabase successfully!")
+                        else:
+                            st.error("Failed to upload vector store to Supabase")
+                else:
+                    st.warning("No documents found in 'data/' folder. Chatbot can still be used, but without document context.")
+                    vector_store = None
         else:
+            # Load vector store from Supabase
+            with st.spinner("Loading vector store from Supabase..."):
+                vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_FILE)
+                if vector_store:
+                    st.success("Vector store loaded from Supabase successfully!")
+                else:
+                    st.error("Failed to load vector store from Supabase")
     else:
+        # Use cached vector store for existing sessions
+        vector_store = st.session_state.get('vector_store')
+        if vector_store is None:
+            # Fallback: load from Supabase if not in session
+            vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_FILE)
+
     st.session_state['vector_store'] = vector_store
+
     if st.session_state['vector_store'] is not None:
         chain = create_conversational_chain(st.session_state['vector_store'])
         display_chat_history(chain)

 if __name__ == "__main__":
+    main()
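Note: the freshness check above compares the `updated_at` value that Supabase storage reports for the uploaded archive (an ISO 8601 string) against local file mtimes in epoch seconds. A minimal sketch of that conversion, using a made-up timestamp value:

from datetime import datetime

updated_at = "2025-01-15T08:30:00.000Z"  # hypothetical value, in the format returned by storage list()
supabase_time = datetime.fromisoformat(updated_at.replace("Z", "+00:00")).timestamp()
# The store is rebuilt when get_latest_data_timestamp(DATA_DIR) > supabase_time.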
 
app/document_processor.py CHANGED
@@ -2,37 +2,88 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 import os

-VECTOR_STORE_PATH = "vector_store_data"

-def save_vector_store(vector_store):
-    """Save the vector store to a file."""
-    vector_store.save_local(VECTOR_STORE_PATH)
-    print(f"Vector store saved to {VECTOR_STORE_PATH}")
-
-def load_vector_store():
-    """Load the vector store from file, or return None if the file does not exist."""
-    if os.path.exists(VECTOR_STORE_PATH):
-        embeddings = HuggingFaceEmbeddings(
-            model_name="LazarusNLP/all-indo-e5-small-v4",
-            model_kwargs={"device": "cpu"},
-            encode_kwargs={"normalize_embeddings": True}
-        )
-        vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
-        print(f"Vector store loaded from {VECTOR_STORE_PATH}")
-        return vector_store
-    else:
-        print("Vector store file not found.")
         return None

-
 def process_documents(docs):
     embeddings = HuggingFaceEmbeddings(
         model_name="LazarusNLP/all-indo-e5-small-v4",
-        model_kwargs={"device": "cpu"},
         encode_kwargs={"normalize_embeddings": True}
     )
-
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500,
         chunk_overlap=300
@@ -40,5 +91,4 @@ def process_documents(docs):
     text_chunks = text_splitter.split_documents(docs)
     vector_store = FAISS.from_documents(text_chunks, embeddings)

-    return vector_store
-
 
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 import os
+import tempfile
+import zipfile
+import streamlit as st

+def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_name):
+    """Save vector store to Supabase storage as a zip file."""
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save vector store locally first
+            local_path = os.path.join(temp_dir, "vector_store")
+            vector_store.save_local(local_path)
+
+            # Create zip file
+            zip_path = os.path.join(temp_dir, "vector_store.zip")
+            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+                for root, dirs, files in os.walk(local_path):
+                    for file in files:
+                        file_path = os.path.join(root, file)
+                        arc_name = os.path.relpath(file_path, local_path)
+                        zipf.write(file_path, arc_name)
+
+            # Upload to Supabase
+            with open(zip_path, 'rb') as f:
+                response = supabase.storage.from_(bucket_name).upload(file_name, f, {"upsert": "true"})
+
+        print(f"Vector store uploaded to Supabase: {bucket_name}/{file_name}")
+        return True
+
+    except Exception as e:
+        print(f"Error uploading vector store to Supabase: {e}")
+        st.error(f"Error uploading to Supabase: {e}")
+        return False

+def load_vector_store_from_supabase(supabase, bucket_name, file_name):
+    """Load vector store from Supabase storage."""
+    try:
+        # Download from Supabase
+        response = supabase.storage.from_(bucket_name).download(file_name)
+
+        if not response:
+            print("Vector store file not found in Supabase.")
+            return None
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save downloaded zip file
+            zip_path = os.path.join(temp_dir, "vector_store.zip")
+            with open(zip_path, 'wb') as f:
+                f.write(response)
+
+            # Extract zip file
+            extract_path = os.path.join(temp_dir, "vector_store")
+            with zipfile.ZipFile(zip_path, 'r') as zipf:
+                zipf.extractall(extract_path)
+
+            # Load vector store
+            embeddings = HuggingFaceEmbeddings(
+                model_name="LazarusNLP/all-indo-e5-small-v4",
+                model_kwargs={"device": "cpu"},
+                encode_kwargs={"normalize_embeddings": True}
+            )
+
+            vector_store = FAISS.load_local(
+                extract_path,
+                embeddings,
+                allow_dangerous_deserialization=True
+            )
+
+            print(f"Vector store loaded from Supabase: {bucket_name}/{file_name}")
+            return vector_store
+
+    except Exception as e:
+        print(f"Error loading vector store from Supabase: {e}")
+        st.error(f"Error loading from Supabase: {e}")
         return None

 def process_documents(docs):
     embeddings = HuggingFaceEmbeddings(
         model_name="LazarusNLP/all-indo-e5-small-v4",
+        model_kwargs={"device": "cpu"},
         encode_kwargs={"normalize_embeddings": True}
     )
+
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1500,
         chunk_overlap=300

     text_chunks = text_splitter.split_documents(docs)
     vector_store = FAISS.from_documents(text_chunks, embeddings)

+    return vector_store
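A rough round-trip sketch of the new helpers, assuming the Supabase client from app.db is configured; the function names and bucket/file names come from this commit, while the query string is a placeholder.

from app.db import supabase
from app.data_loader import load_docs
from app.document_processor import (
    process_documents,
    save_vector_store_to_supabase,
    load_vector_store_from_supabase,
)

docs = load_docs()
store = process_documents(docs)
if save_vector_store_to_supabase(store, supabase, "pnp-bot-storage-archive", "vector_store.zip"):
    restored = load_vector_store_from_supabase(supabase, "pnp-bot-storage-archive", "vector_store.zip")
    print(restored.similarity_search("example query", k=3))  # placeholder query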