FauziIsyrinApridal committed on
Commit
4d1c64a
·
1 Parent(s): 847786a

clean code

Browse files
Files changed (2) hide show
  1. app.py +2 -61
  2. app/document_processor.py +45 -2
app.py CHANGED
@@ -2,12 +2,9 @@ import streamlit as st
2
  import os
3
  from dotenv import load_dotenv
4
  from langsmith import traceable
5
- from datetime import datetime
6
- from typing import List, Dict, Optional
7
-
8
  from app.chat import initialize_session_state, display_chat_history
9
- from app.data_loader import get_data, list_all_files, load_docs
10
- from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase
11
  from app.prompts import sahabat_prompt
12
  from app.db import supabase
13
  from langchain_community.llms import Replicate
@@ -17,58 +14,9 @@ from langchain_community.document_transformers import LongContextReorder
17
 
18
  load_dotenv()
19
 
20
- # ---------------------------------------------------------
21
- # ⚡️ CONFIG
22
- # ---------------------------------------------------------
23
  BUCKET_NAME = "pnp-bot-storage-archive"
24
  VECTOR_STORE_PREFIX = "vector_store"
25
 
26
- # ---------------------------------------------------------
27
- # ⚡️ UTILITY
28
- # ---------------------------------------------------------
29
def get_latest_data_timestamp_from_files(bucket_name: str) -> float:
    """Return the newest file timestamp found in a Supabase storage bucket.

    Scans every file's ``updated_at`` (falling back to ``created_at``) and
    returns the maximum as a POSIX timestamp, or 0.0 when nothing parses.
    """
    latest = 0.0
    for file in list_all_files(bucket_name):
        iso_time = file.get("updated_at") or file.get("created_at")
        if not iso_time:
            continue
        try:
            # Supabase emits ISO-8601 with a trailing 'Z'; normalize it so
            # datetime.fromisoformat accepts the string.
            latest = max(latest, datetime.fromisoformat(iso_time.replace('Z', '+00:00')).timestamp())
        except Exception as e:
            print(f"Gagal parsing waktu dari {file.get('name')}: {e}")
    return latest
42
-
43
-
44
def get_supabase_vector_store_timestamp() -> Optional[str]:
    """Return the newest timestamp of the persisted vector-store files.

    The store is saved as a ``.faiss``/``.pkl`` pair, so a timestamp is only
    returned when at least two matching files exist; otherwise ``None``.
    """
    try:
        listing = supabase.storage.from_(BUCKET_NAME).list()
        timestamps = [
            f["updated_at"]
            for f in listing
            if f["name"].startswith(VECTOR_STORE_PREFIX)
            and f["name"].endswith((".faiss", ".pkl"))
        ]
        # Fewer than two files means the store pair is incomplete.
        return max(timestamps) if len(timestamps) >= 2 else None
    except Exception as e:
        print(f"Error getting Supabase timestamp: {e}")
        return None
60
-
61
-
62
def vector_store_is_outdated() -> bool:
    """Check if vector store needs to be updated based on files in Supabase storage.

    A missing store timestamp is treated as outdated so the caller rebuilds.
    """
    store_stamp = get_supabase_vector_store_timestamp()
    if store_stamp is None:
        return True
    # Normalize the trailing 'Z' for datetime.fromisoformat.
    store_time = datetime.fromisoformat(store_stamp.replace("Z", "+00:00")).timestamp()
    return get_latest_data_timestamp_from_files("pnp-bot-storage") > store_time
71
-
72
 
73
  def reorder_embedding(docs):
74
  """Reorder documents for long context retrieval."""
@@ -76,9 +24,6 @@ def reorder_embedding(docs):
76
  return reordering.transform_documents(docs)
77
 
78
 
79
- # ---------------------------------------------------------
80
- # ⚡️ RAG CHAIN
81
- # ---------------------------------------------------------
82
  @traceable(name="Create RAG Conversational Chain")
83
  def create_conversational_chain(vector_store):
84
  """Create a Conversational Retrieval Chain for RAG."""
@@ -101,10 +46,6 @@ def get_rag_chain(vector_store):
101
  """Return a Conversational Retrieval Chain for external use."""
102
  return create_conversational_chain(vector_store)
103
 
104
-
105
- # ---------------------------------------------------------
106
- # ⚡️ MAIN FUNCTION
107
- # ---------------------------------------------------------
108
  @traceable(name="Main Chatbot RAG App")
109
  def main():
110
  initialize_session_state()
 
2
  import os
3
  from dotenv import load_dotenv
4
  from langsmith import traceable
 
 
 
5
  from app.chat import initialize_session_state, display_chat_history
6
+ from app.data_loader import get_data, load_docs
7
+ from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase, get_latest_data_timestamp_from_files,get_supabase_vector_store_timestamp, vector_store_is_outdated
8
  from app.prompts import sahabat_prompt
9
  from app.db import supabase
10
  from langchain_community.llms import Replicate
 
14
 
15
  load_dotenv()
16
 
 
 
 
17
  BUCKET_NAME = "pnp-bot-storage-archive"
18
  VECTOR_STORE_PREFIX = "vector_store"
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def reorder_embedding(docs):
22
  """Reorder documents for long context retrieval."""
 
24
  return reordering.transform_documents(docs)
25
 
26
 
 
 
 
27
  @traceable(name="Create RAG Conversational Chain")
28
  def create_conversational_chain(vector_store):
29
  """Create a Conversational Retrieval Chain for RAG."""
 
46
  """Return a Conversational Retrieval Chain for external use."""
47
  return create_conversational_chain(vector_store)
48
 
 
 
 
 
49
  @traceable(name="Main Chatbot RAG App")
50
  def main():
51
  initialize_session_state()
app/document_processor.py CHANGED
@@ -4,7 +4,8 @@ from langchain_community.vectorstores import FAISS
4
  import os
5
  import tempfile
6
  import streamlit as st
7
- from langchain.schema import Document
 
8
 
9
 
10
  def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_prefix="vector_store"):
@@ -112,4 +113,46 @@ def process_documents(docs):
112
  text_chunks = text_splitter.split_documents(docs)
113
  vector_store = FAISS.from_documents(text_chunks, embeddings)
114
 
115
- return vector_store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
import os
import tempfile
from datetime import datetime
from typing import Optional  # needed: get_supabase_vector_store_timestamp is annotated Optional[str]

import streamlit as st

# NOTE(review): absolute package path — the bare `from data_loader import ...`
# fails when this module is imported as `app.document_processor` (app.py uses
# `from app.data_loader import ...`).
from app.data_loader import list_all_files
9
 
10
 
11
  def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_prefix="vector_store"):
 
113
  text_chunks = text_splitter.split_documents(docs)
114
  vector_store = FAISS.from_documents(text_chunks, embeddings)
115
 
116
+ return vector_store
117
+
118
def get_latest_data_timestamp_from_files(bucket_name: str) -> float:
    """Return the latest modification time among files in a Supabase bucket.

    Args:
        bucket_name: Name of the Supabase storage bucket to scan.

    Returns:
        The newest ``updated_at``/``created_at`` value as a POSIX timestamp
        (seconds since epoch), or 0.0 if no file has a parseable time.
    """
    latest_time = 0.0
    for file in list_all_files(bucket_name):
        # Prefer updated_at; fall back to created_at for never-modified files.
        iso_time = file.get("updated_at") or file.get("created_at")
        if not iso_time:
            continue
        try:
            # Supabase returns ISO-8601 with a trailing 'Z'; fromisoformat
            # (pre-3.11) cannot parse 'Z', so normalize it to '+00:00'.
            timestamp = datetime.fromisoformat(iso_time.replace('Z', '+00:00')).timestamp()
        except (ValueError, TypeError, AttributeError) as e:
            # Narrowed from `except Exception`: only parse failures are
            # expected here; anything else should surface, not be swallowed.
            print(f"Gagal parsing waktu dari {file.get('name')}: {e}")
            continue
        latest_time = max(latest_time, timestamp)
    return latest_time
131
+
132
+
133
def get_supabase_vector_store_timestamp(
    bucket_name: str = "pnp-bot-storage-archive",
    file_prefix: str = "vector_store",
) -> Optional[str]:
    """Return the most recent timestamp of the vector-store files in Supabase.

    The store is persisted as a ``.faiss``/``.pkl`` file pair; a timestamp is
    only returned when at least two matching files exist, otherwise the store
    is considered missing/incomplete.

    Args:
        bucket_name: Bucket holding the persisted vector store. Defaults to
            the archive bucket previously hard-coded as ``BUCKET_NAME`` in
            app.py — that constant (like ``VECTOR_STORE_PREFIX`` and
            ``supabase``) is not defined in this module, which made the moved
            code raise NameError at call time.
        file_prefix: Common filename prefix of the vector-store files.

    Returns:
        The newest ISO-8601 ``updated_at`` string among the store's files, or
        None when the store is absent/incomplete or the storage query fails.
    """
    # Imported lazily so merely importing this module never requires a
    # configured database client.
    from app.db import supabase

    try:
        response = supabase.storage.from_(bucket_name).list()
        timestamps = [
            file["updated_at"]
            for file in response
            if file["name"].startswith(file_prefix)
            and file["name"].endswith((".faiss", ".pkl"))
        ]
        if len(timestamps) >= 2:
            return max(timestamps)
        return None
    except Exception as e:
        # Best-effort: a storage/API failure is reported as "no store found".
        print(f"Error getting Supabase timestamp: {e}")
        return None
149
+
150
def vector_store_is_outdated() -> bool:
    """Check if vector store needs to be updated based on files in Supabase storage.

    A missing vector-store timestamp is treated as outdated so that the
    caller rebuilds the store.
    """
    store_timestamp = get_supabase_vector_store_timestamp()
    if store_timestamp is None:
        return True
    # Normalize the trailing 'Z' so datetime.fromisoformat accepts it.
    store_time = datetime.fromisoformat(store_timestamp.replace("Z", "+00:00")).timestamp()
    newest_data_time = get_latest_data_timestamp_from_files("pnp-bot-storage")
    return newest_data_time > store_time