import os
import tempfile

import streamlit as st
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
def save_vector_store_to_supabase(vector_store, supabase, bucket_name, file_prefix="vector_store"):
    """Save vector store to Supabase storage as separate files."""
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            # Save vector store locally first
            local_path = os.path.join(temp_dir, "vector_store")
            vector_store.save_local(local_path)

            # Upload index.faiss
            faiss_file = os.path.join(local_path, "index.faiss")
            if os.path.exists(faiss_file):
                with open(faiss_file, "rb") as f:
                    supabase.storage.from_(bucket_name).upload(
                        f"{file_prefix}_index.faiss",
                        f,
                        {"upsert": "true"},
                    )
                print(f"Uploaded: {file_prefix}_index.faiss")

            # Upload index.pkl
            pkl_file = os.path.join(local_path, "index.pkl")
            if os.path.exists(pkl_file):
                with open(pkl_file, "rb") as f:
                    supabase.storage.from_(bucket_name).upload(
                        f"{file_prefix}_index.pkl",
                        f,
                        {"upsert": "true"},
                    )
                print(f"Uploaded: {file_prefix}_index.pkl")

            print(f"Vector store uploaded to Supabase bucket: {bucket_name}")
            return True
    except Exception as e:
        print(f"Error uploading vector store to Supabase: {e}")
        st.error(f"Error uploading to Supabase: {e}")
        return False
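
# Usage sketch (not part of this module): create the client with supabase-py's
# create_client and upload under a bucket name. SUPABASE_URL, SUPABASE_KEY, and
# the "vector-stores" bucket are assumed placeholders for illustration.
#
#   from supabase import create_client
#   supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
#   save_vector_store_to_supabase(my_store, supabase, "vector-stores")
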
def load_vector_store_from_supabase(supabase, bucket_name, file_prefix="vector_store"):
    """Load a vector store from separate files in Supabase storage."""
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            local_path = os.path.join(temp_dir, "vector_store")
            os.makedirs(local_path, exist_ok=True)

            # Download index.faiss
            try:
                faiss_response = supabase.storage.from_(bucket_name).download(f"{file_prefix}_index.faiss")
                faiss_file = os.path.join(local_path, "index.faiss")
                with open(faiss_file, "wb") as f:
                    f.write(faiss_response)
                print(f"Downloaded: {file_prefix}_index.faiss")
            except Exception as e:
                print(f"Error downloading index.faiss: {e}")
                return None

            # Download index.pkl
            try:
                pkl_response = supabase.storage.from_(bucket_name).download(f"{file_prefix}_index.pkl")
                pkl_file = os.path.join(local_path, "index.pkl")
                with open(pkl_file, "wb") as f:
                    f.write(pkl_response)
                print(f"Downloaded: {file_prefix}_index.pkl")
            except Exception as e:
                print(f"Error downloading index.pkl: {e}")
                return None

            # Recreate the same embeddings used at index time, then load the FAISS index
            embeddings = HuggingFaceEmbeddings(
                model_name="LazarusNLP/all-indo-e5-small-v4",
                model_kwargs={"device": "cpu"},
                encode_kwargs={"normalize_embeddings": True},
            )
            vector_store = FAISS.load_local(
                local_path,
                embeddings,
                allow_dangerous_deserialization=True,
            )
            print(f"Vector store loaded from Supabase bucket: {bucket_name}")
            return vector_store
    except Exception as e:
        print(f"Error loading vector store from Supabase: {e}")
        st.error(f"Error loading from Supabase: {e}")
        return None
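
# Usage sketch (assumed names): load the persisted index, falling back to a
# rebuild when nothing is in the bucket. rebuild_docs is a hypothetical list
# of Documents; "vector-stores" is a placeholder bucket name.
#
#   store = load_vector_store_from_supabase(supabase, "vector-stores")
#   if store is None:
#       store = process_documents(rebuild_docs)
#       save_vector_store_to_supabase(store, supabase, "vector-stores")
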
def process_documents(docs):
    """Split documents into overlapping chunks and index them with FAISS."""
    embeddings = HuggingFaceEmbeddings(
        model_name="LazarusNLP/all-indo-e5-small-v4",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=300,
    )
    text_chunks = text_splitter.split_documents(docs)
    vector_store = FAISS.from_documents(text_chunks, embeddings)
    return vector_store
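
if __name__ == "__main__":
    # Minimal end-to-end smoke test, a sketch only: the sample text and the
    # "vector-stores" bucket name are assumptions for illustration, and
    # SUPABASE_URL / SUPABASE_KEY must be set in the environment.
    from supabase import create_client

    docs = [Document(page_content="Contoh dokumen untuk diindeks.", metadata={"source": "demo"})]
    store = process_documents(docs)
    print(store.similarity_search("dokumen", k=1))

    supabase = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
    if save_vector_store_to_supabase(store, supabase, "vector-stores"):
        reloaded = load_vector_store_from_supabase(supabase, "vector-stores")
        print("Reloaded OK:", reloaded is not None)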