FauziIsyrinApridal committed
Commit 542a0cb · 1 Parent(s): 4d1c64a
Files changed (2):
  1. app.py  +49 -2
  2. app/document_processor.py  +2 -45
app.py CHANGED
@@ -2,9 +2,12 @@ import streamlit as st
 import os
 from dotenv import load_dotenv
 from langsmith import traceable
+from datetime import datetime
+from typing import List, Dict, Optional
+
 from app.chat import initialize_session_state, display_chat_history
-from app.data_loader import get_data, load_docs
-from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase, get_latest_data_timestamp_from_files, get_supabase_vector_store_timestamp, vector_store_is_outdated
+from app.data_loader import get_data, list_all_files, load_docs
+from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase
 from app.prompts import sahabat_prompt
 from app.db import supabase
 from langchain_community.llms import Replicate
@@ -14,9 +17,53 @@ from langchain_community.document_transformers import LongContextReorder
 
 load_dotenv()
 
+
 BUCKET_NAME = "pnp-bot-storage-archive"
 VECTOR_STORE_PREFIX = "vector_store"
 
+def get_latest_data_timestamp_from_files(bucket_name: str) -> float:
+    """Get the latest timestamp from files in a Supabase storage bucket."""
+    files = list_all_files(bucket_name)
+    latest_time = 0.0
+    for file in files:
+        iso_time = file.get("updated_at") or file.get("created_at")
+        if iso_time:
+            try:
+                timestamp = datetime.fromisoformat(iso_time.replace("Z", "+00:00")).timestamp()
+                latest_time = max(latest_time, timestamp)
+            except Exception as e:
+                print(f"Failed to parse timestamp from {file.get('name')}: {e}")
+    return latest_time
+
+
+def get_supabase_vector_store_timestamp() -> Optional[str]:
+    """Get the latest timestamp of the vector store files in Supabase storage."""
+    try:
+        response = supabase.storage.from_(BUCKET_NAME).list()
+        timestamps = []
+        for file in response:
+            if file["name"].startswith(VECTOR_STORE_PREFIX) and (
+                file["name"].endswith(".faiss") or file["name"].endswith(".pkl")
+            ):
+                timestamps.append(file["updated_at"])
+        if len(timestamps) >= 2:
+            return max(timestamps)
+        return None
+    except Exception as e:
+        print(f"Error getting Supabase timestamp: {e}")
+        return None
+
+
+def vector_store_is_outdated() -> bool:
+    """Check whether the vector store needs to be rebuilt, based on files in Supabase storage."""
+    supabase_timestamp = get_supabase_vector_store_timestamp()
+    if supabase_timestamp is None:
+        return True
+    supabase_time = datetime.fromisoformat(supabase_timestamp.replace("Z", "+00:00")).timestamp()
+    data_time = get_latest_data_timestamp_from_files("pnp-bot-storage")
+
+    return data_time > supabase_time
+
 
 def reorder_embedding(docs):
     """Reorder documents for long context retrieval."""
app/document_processor.py CHANGED
@@ -4,8 +4,7 @@ from langchain_community.vectorstores import FAISS
 import os
 import tempfile
 import streamlit as st
-from datetime import datetime
-from data_loader import list_all_files
+from langchain.schema import Document
 
 
 def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_prefix="vector_store"):
@@ -113,46 +112,4 @@ def process_documents(docs):
     text_chunks = text_splitter.split_documents(docs)
     vector_store = FAISS.from_documents(text_chunks, embeddings)
 
-    return vector_store
-
-def get_latest_data_timestamp_from_files(bucket_name: str) -> float:
-    """Get the latest timestamp from files in a Supabase storage bucket."""
-    files = list_all_files(bucket_name)
-    latest_time = 0.0
-    for file in files:
-        iso_time = file.get("updated_at") or file.get("created_at")
-        if iso_time:
-            try:
-                timestamp = datetime.fromisoformat(iso_time.replace("Z", "+00:00")).timestamp()
-                latest_time = max(latest_time, timestamp)
-            except Exception as e:
-                print(f"Failed to parse timestamp from {file.get('name')}: {e}")
-    return latest_time
-
-
-def get_supabase_vector_store_timestamp() -> Optional[str]:
-    """Get the latest timestamp of the vector store files in Supabase storage."""
-    try:
-        response = supabase.storage.from_(BUCKET_NAME).list()
-        timestamps = []
-        for file in response:
-            if file["name"].startswith(VECTOR_STORE_PREFIX) and (
-                file["name"].endswith(".faiss") or file["name"].endswith(".pkl")
-            ):
-                timestamps.append(file["updated_at"])
-        if len(timestamps) >= 2:
-            return max(timestamps)
-        return None
-    except Exception as e:
-        print(f"Error getting Supabase timestamp: {e}")
-        return None
-
-def vector_store_is_outdated() -> bool:
-    """Check whether the vector store needs to be rebuilt, based on files in Supabase storage."""
-    supabase_timestamp = get_supabase_vector_store_timestamp()
-    if supabase_timestamp is None:
-        return True
-    supabase_time = datetime.fromisoformat(supabase_timestamp.replace("Z", "+00:00")).timestamp()
-    data_time = get_latest_data_timestamp_from_files("pnp-bot-storage")
-
-    return data_time > supabase_time
+    return vector_store
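
Both timestamp helpers normalize the ISO 8601 strings before parsing: datetime.fromisoformat only accepts a trailing "Z" (UTC) suffix from Python 3.11 onward, so the replace("Z", "+00:00") keeps the parsing working on older interpreters. A quick standalone check, using an example value shaped like the "updated_at" fields Supabase storage typically returns (the timestamp itself is illustrative):

from datetime import datetime

# Example value in the shape of a Supabase "updated_at" field (illustrative).
iso = "2024-05-01T08:30:00.000Z"

# On Python < 3.11, fromisoformat() rejects the trailing "Z", so it is
# rewritten as an explicit UTC offset before parsing.
ts = datetime.fromisoformat(iso.replace("Z", "+00:00")).timestamp()
print(ts)  # seconds since the Unix epoch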