# amirulhazym
# Feat: Implement V2 API-driven conversational AI system using Gemini 2.5 Flash Lite model
# c4d9cb7
# Full Code for: v2_multilingual_api/backend/index_knowledge_base.py
import os
import time

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
# Configuration: call load_dotenv() first, then read the Pinecone credential
# from the process environment. The script aborts right away when the key is
# unset or empty, before any documents are loaded.
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if PINECONE_API_KEY is None or PINECONE_API_KEY == "":
    raise ValueError("PINECONE_API_KEY not found. Please set it in your .env file.")
# --- Step 1: load every Markdown file found under the knowledge-base folder ---
print("Loading documents from multilingual knowledge base...")
# The folder is given relative to the repository root, so the script has to be
# launched from there.
loader = DirectoryLoader(
    'knowledge_base/v2_multilingual/',
    glob="**/*.md",
    show_progress=True,
)
documents = loader.load()
print(f"Loaded {len(documents)} documents.")

# --- Step 2: split the loaded documents into overlapping chunks ---
print("Splitting documents into chunks...")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks.")
# --- Step 3: embedding model and Pinecone index setup ---
print("Initializing MULTILINGUAL embedding model...")
# Multilingual sentence-embedding model. Its embedding dimension is used as
# the dimension of the Pinecone index when the index has to be created.
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

print("Initializing Pinecone...")
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "auracart-multilingual-kb"
if index_name not in pc.list_indexes().names():
    print(f"Creating new serverless index: {index_name}")
    pc.create_index(
        name=index_name,
        dimension=model.get_sentence_embedding_dimension(),  # 768
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    # A freshly created index is not necessarily usable the moment
    # create_index() returns. Poll its status until Pinecone reports it as
    # ready, so the first upsert is not sent to an index that is still
    # initializing.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
# NOTE(review): when the index already exists, its dimension is not checked
# against the model's embedding dimension — confirm they match if the model
# is ever changed.
index = pc.Index(index_name)
print("Pinecone index is ready.")
# --- Step 4: embed the chunks and upload them to Pinecone in batches ---
print("Embedding chunks and uploading to Pinecone...")
batch_size = 100
for i in range(0, len(docs), batch_size):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch = docs[i:i + batch_size]
    texts = [doc.page_content for doc in batch]
    # The chunk text is stored in the metadata next to its source so it can
    # be read back from a query result without a second lookup.
    metadata = [
        {"source": doc.metadata.get('source', 'unknown'), "text": doc.page_content}
        for doc in batch
    ]
    embeddings = model.encode(texts).tolist()
    # IDs are positional ("doc_<global chunk index>"): a re-run overwrites the
    # same IDs. NOTE(review): if the knowledge base shrinks, vectors with
    # higher indices from an earlier run are not removed here.
    ids = [f"doc_{i + j}" for j in range(len(batch))]
    # Hand the client a concrete list of (id, values, metadata) tuples rather
    # than a one-shot zip iterator.
    vectors = list(zip(ids, embeddings, metadata))
    index.upsert(vectors=vectors)
    print(f"Uploaded batch {i // batch_size + 1}")
print("\n--- Multilingual Knowledge Base Indexing Complete ---")