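"""Streamlit RAG chatbot over the documents in the local data/ folder.

The vector store (saved as a .faiss/.pkl pair) is persisted in Supabase Storage
and rebuilt whenever the local data is newer than the stored copy; answers are
generated by a Replicate-hosted Sahabat-AI model through a
ConversationalRetrievalChain.
"""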
import streamlit as st
import os
from dotenv import load_dotenv
from langsmith import traceable

from app.chat import initialize_session_state, display_chat_history
from app.data_loader import get_data, load_docs
from app.document_processor import process_documents, save_vector_store_to_supabase, load_vector_store_from_supabase
from app.prompts import sahabat_prompt
from app.db import supabase
from langchain_community.llms import Replicate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_transformers import LongContextReorder

load_dotenv()

# Supabase configuration
BUCKET_NAME = "pnp-bot-storage-archive"
VECTOR_STORE_PREFIX = "vector_store"  # Prefix shared by the .faiss and .pkl objects in the bucket
DATA_DIR = "data"

@traceable(name="Create RAG Conversational Chain")
def create_conversational_chain(vector_store):
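    """Build a ConversationalRetrievalChain that pairs the Replicate-hosted
    Sahabat-AI model with a top-6 retriever over the vector store and a
    buffered chat memory."""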
    llm = Replicate(
        model="fauziisyrinapridal/sahabat-ai-v1:afb9fa89fe786362f619fd4fef34bd1f7a4a4da23073d8a6fbf54dcbe458f216",
        model_kwargs={"temperature": 0.1, "top_p": 0.9, "max_new_tokens": 6000}
    )
    
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key='answer'
    )
    
    chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=vector_store.as_retriever(search_kwargs={"k": 6}),
        combine_docs_chain_kwargs={"prompt": sahabat_prompt},
        return_source_documents=True,
        memory=memory
    )
    
    return chain

def reorder_embedding(docs):
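    """Reorder documents with LongContextReorder so the most relevant chunks
    sit at the start and end of the context, mitigating the
    'lost in the middle' effect."""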
    reordering = LongContextReorder()
    return reordering.transform_documents(docs)

def get_latest_data_timestamp(folder):
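    """Return the latest modification time (epoch seconds) of any file under
    `folder`, or 0 if no files are found."""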
    latest_time = 0
    for root, _, files in os.walk(folder):
        for file in files:
            path = os.path.join(root, file)
            file_time = os.path.getmtime(path)
            latest_time = max(latest_time, file_time)
    return latest_time

def get_supabase_vector_store_timestamp():
    """Get the timestamp of vector store files in Supabase storage"""
    try:
        response = supabase.storage.from_(BUCKET_NAME).list()
        timestamps = []
        
        for file in response:
            if file['name'].startswith(VECTOR_STORE_PREFIX) and (
                file['name'].endswith('.faiss') or file['name'].endswith('.pkl')
            ):
                timestamps.append(file['updated_at'])
        
        # Return the latest timestamp if both files exist
        if len(timestamps) >= 2:
            return max(timestamps)
        return None
        
    except Exception as e:
        print(f"Error getting Supabase timestamp: {e}")
        return None

def vector_store_is_outdated():
    """Check if vector store needs to be updated based on data folder changes"""
    supabase_timestamp = get_supabase_vector_store_timestamp()
    if supabase_timestamp is None:
        return True
    
    # Convert supabase timestamp to epoch time for comparison
    from datetime import datetime
    supabase_time = datetime.fromisoformat(supabase_timestamp.replace('Z', '+00:00')).timestamp()
    data_time = get_latest_data_timestamp(DATA_DIR)
    
    return data_time > supabase_time

@traceable(name="Main Chatbot RAG App")
def main():
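    """Streamlit entry point: fetch the source data, build or load the vector
    store (rebuilding it when the data folder is newer than the copy in
    Supabase), cache it in session state, and render the chat interface."""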
    initialize_session_state()
    get_data()
    
    vector_store = None  # Will hold the vector store used by the retrieval chain
    
    if len(st.session_state['history']) == 0:
        if vector_store_is_outdated():
            with st.spinner("Loading and processing documents..."):
                docs = load_docs()
                if len(docs) > 0:
                    reordered_docs = reorder_embedding(docs)
                    vector_store = process_documents(reordered_docs)
                    
                    # Persist the freshly built vector store to Supabase Storage
                    with st.spinner("Uploading vector store to Supabase..."):
                        success = save_vector_store_to_supabase(vector_store, supabase, BUCKET_NAME, VECTOR_STORE_PREFIX)
                        if success:
                            st.success("Vector store uploaded to Supabase successfully!")
                        else:
                            st.error("Failed to upload vector store to Supabase")
                else:
                    st.warning("No documents found in 'data/' folder. Chatbot can still be used, but without document context.")
                    vector_store = None
        else:
            # Load vector store from Supabase
            with st.spinner("Loading vector store from Supabase..."):
                vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_PREFIX)
                if vector_store:
                    st.success("Vector store loaded from Supabase successfully!")
                else:
                    st.error("Failed to load vector store from Supabase")
    else:
        # Use cached vector store for existing sessions
        vector_store = st.session_state.get('vector_store')
        if vector_store is None:
            # Fallback: load from Supabase if not in session
            vector_store = load_vector_store_from_supabase(supabase, BUCKET_NAME, VECTOR_STORE_PREFIX)
    
    st.session_state['vector_store'] = vector_store
    
    if st.session_state['vector_store'] is not None:
        chain = create_conversational_chain(st.session_state['vector_store'])
        display_chat_history(chain)

if __name__ == "__main__":
    main()
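
# Typical local run (assuming this file is the Streamlit entry point, e.g. app.py):
#   streamlit run app.py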