# Source: loimaroc/app/scripts/load_documents.py
# Uploaded by aymanemalih ("Upload 25 files", commit 91ca409, verified)
import ast
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# Configurations
# Directory scanned by process_files() for input ``.csv`` files.
DATA_DIR = "data"
# Directory handed to Chroma as ``persist_directory`` for the vector store.
PERSIST_DIR = "chroma_storage"
# Model name handed to HuggingFaceEmbeddings to embed the documents.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
def parse_pages(pages_str):
    """Parse the raw ``pages`` cell of a CSV row into a list of page values.

    Handled forms:
      * missing value (``NaN`` / ``None``)        -> ``[]``
      * literal list, e.g. ``"[1, 2, 3]"``        -> ``[1, 2, 3]``
      * literal tuple, e.g. ``"1, 2"``            -> ``[1, 2]``
      * any other single literal, e.g. ``"5"``    -> ``[5]``
      * anything else: split on commas and keep only the tokens made of
        decimal digits, converted to ``int`` (``"1, a, 3"`` -> ``[1, 3]``).

    The cell is parsed with ``ast.literal_eval`` instead of ``eval`` so that
    text read from a data file can never execute code.

    Args:
        pages_str: Raw cell value (string, number, or a missing-value marker).

    Returns:
        list: The parsed page values; empty when nothing could be parsed.
    """
    if pd.isna(pages_str):
        return []

    raw = str(pages_str).strip()
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: fall back to a lenient comma-separated parse.
        # ``isdecimal`` guarantees that ``int`` accepts the token.
        tokens = (piece.strip() for piece in raw.split(","))
        return [int(token) for token in tokens if token.isdecimal()]

    # "1, 2" parses as a tuple; flatten it instead of nesting it in a list.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [parsed]
def _blank_if_missing(value):
    """Return ``''`` for a missing cell (NaN/None), otherwise the value unchanged."""
    return "" if pd.isna(value) else value


def load_csv(filepath: str) -> list:
    """Load one CSV file and build a ``Document`` for every row with content.

    Column names are stripped and lower-cased before lookup. Rows whose
    ``contenu`` cell is missing or blank are skipped. Each document carries
    the metadata keys ``doc``, ``titre``, ``chapitre``, ``article``,
    ``sous_titre1``, ``sous_titre2``, ``sous_titre3`` and ``pages``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        list: One ``Document`` per non-empty row, in file order.
    """
    df = pd.read_csv(filepath)
    df.columns = [str(c).strip().lower() for c in df.columns]

    # Used as the ``doc`` metadata when the row has no usable ``doc`` cell.
    fallback_doc = os.path.splitext(os.path.basename(filepath))[0]

    documents = []
    for _, row in df.iterrows():
        text = row.get('contenu', '')
        if pd.isna(text) or not str(text).strip():
            continue

        # ``str()`` first: pandas may infer a numeric dtype for the column,
        # and numbers have no ``.strip()``.
        doc_name = str(_blank_if_missing(row.get('doc'))).strip()

        meta = {
            'doc': doc_name or fallback_doc,
            # Empty cells are read as NaN; store '' so every missing field
            # has the same value as an absent column.
            'titre': _blank_if_missing(row.get('titre', '')),
            'chapitre': _blank_if_missing(row.get('chapitre', '')),
            'article': _blank_if_missing(row.get('article', '')),
            'sous_titre1': _blank_if_missing(row.get('sous_titre1', '')),
            'sous_titre2': _blank_if_missing(row.get('sous_titre2', '')),
            'sous_titre3': _blank_if_missing(row.get('sous_titre3', '')),
            # Stringified to avoid a Chroma error on list-valued metadata.
            'pages': ", ".join(map(str, parse_pages(row.get('pages', '')))),
        }
        documents.append(Document(page_content=str(text).strip(), metadata=meta))

    print(f"📄 Loaded: {os.path.basename(filepath)} → {len(documents)} documents")
    return documents
def process_files():
    """Load every ``.csv`` in ``DATA_DIR`` and store the documents in Chroma.

    Files are processed in sorted name order so that runs are reproducible
    (``os.listdir`` returns entries in arbitrary order). The documents are
    embedded with the ``EMBEDDING_MODEL`` Hugging Face model and written to
    a Chroma store under ``PERSIST_DIR``. When no documents are found, the
    function returns without loading the model or touching the store.

    Returns:
        None
    """
    all_docs = []
    for filename in sorted(os.listdir(DATA_DIR)):
        if filename.endswith(".csv"):
            all_docs.extend(load_csv(os.path.join(DATA_DIR, filename)))
    print(f"📊 Total: {len(all_docs)} full documents")

    if not all_docs:
        # Nothing to embed: skip loading the model and creating the store.
        print(f"No documents found in {DATA_DIR}; nothing stored.")
        return

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    # NOTE(review): re-running against an existing PERSIST_DIR presumably adds
    # the same documents again - confirm whether the store should be cleared.
    db = Chroma.from_documents(all_docs, embedding=embeddings, persist_directory=PERSIST_DIR)
    # NOTE(review): persist() may be unnecessary with recent Chroma versions
    # that persist automatically - confirm against the installed version.
    db.persist()
    print(f"✅ Done! Stored in {PERSIST_DIR}")
# Run the full ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    process_files()