File size: 2,425 Bytes
91ca409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import ast
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Configurations
DATA_DIR = "data"  # folder scanned for input .csv files
PERSIST_DIR = "chroma_storage"  # on-disk directory for the persisted Chroma index
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # HuggingFace sentence-embedding model

def parse_pages(pages_str):
    """Parse a CSV 'pages' cell into a list of page numbers.

    Accepts a Python literal list/tuple (e.g. "[1, 2]" or "1, 2"), a
    single value (e.g. "5"), or a comma-separated digit string.
    Returns [] when the cell is missing (NaN) or unparseable.
    """
    if pd.isna(pages_str):
        return []
    try:
        # ast.literal_eval is a safe replacement for eval(): it only
        # accepts Python literals, so cell content can never execute code.
        pages = ast.literal_eval(str(pages_str))
    except (ValueError, SyntaxError):
        # Fallback: plain comma-separated digits, e.g. "12, 13".
        # Non-digit fragments are silently dropped.
        return [int(p.strip()) for p in str(pages_str).split(",") if p.strip().isdigit()]
    if isinstance(pages, (list, tuple)):
        # "1, 2" parses as a tuple; normalize to a flat list rather than
        # wrapping the tuple in a one-element list (old-code bug).
        return list(pages)
    return [pages]

def load_csv(filepath: str) -> list:
    """Load one CSV file into a list of langchain ``Document`` objects.

    Expects French-named columns (contenu, titre, chapitre, article,
    sous_titre1..3, pages, doc).  Rows with an empty 'contenu' cell are
    skipped.  Every metadata value is coerced to a plain string because
    Chroma rejects NaN/None metadata values.
    """
    df = pd.read_csv(filepath)
    # Normalize headers so the lookups below are case/whitespace-insensitive.
    df.columns = [c.strip().lower() for c in df.columns]

    def clean(value) -> str:
        # Empty CSV cells surface as NaN floats; Chroma metadata must be
        # str/int/float/bool, and NaN raises — map missing values to "".
        # str() also guards against non-string cells (e.g. numeric 'doc').
        return "" if pd.isna(value) else str(value).strip()

    # Default document name: the file name without its extension.
    fallback_doc = os.path.splitext(os.path.basename(filepath))[0]

    documents = []
    for _, row in df.iterrows():
        text = row.get('contenu', '')
        if pd.isna(text) or not str(text).strip():
            continue  # no body text -> nothing worth indexing

        meta = {
            # Use the file-derived name when the 'doc' cell is missing/blank.
            'doc': clean(row.get('doc')) or fallback_doc,
            'titre': clean(row.get('titre', '')),
            'chapitre': clean(row.get('chapitre', '')),
            'article': clean(row.get('article', '')),
            'sous_titre1': clean(row.get('sous_titre1', '')),
            'sous_titre2': clean(row.get('sous_titre2', '')),
            'sous_titre3': clean(row.get('sous_titre3', '')),
            # Stringify the page list to avoid Chroma's metadata-type error.
            'pages': ", ".join(map(str, parse_pages(row.get('pages', '')))),
        }

        documents.append(Document(page_content=str(text).strip(), metadata=meta))

    print(f"📄 Loaded: {os.path.basename(filepath)} → {len(documents)} documents")
    return documents

def process_files():
    """Index every CSV file under DATA_DIR into a persistent Chroma store.

    Reads all *.csv files, converts their rows to Documents via
    load_csv(), embeds them with EMBEDDING_MODEL, and persists the
    resulting vector store to PERSIST_DIR.
    """
    all_docs = []
    # sorted() makes ingestion order deterministic across platforms
    # (os.listdir order is filesystem-dependent).
    for file in sorted(os.listdir(DATA_DIR)):
        if file.endswith(".csv"):
            full_path = os.path.join(DATA_DIR, file)
            all_docs.extend(load_csv(full_path))

    print(f"📊 Total: {len(all_docs)} full documents")

    if not all_docs:
        # Chroma.from_documents raises on an empty list; fail gracefully
        # with a clear message instead of a stack trace.
        print("⚠️ No documents found — nothing to index.")
        return

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    db = Chroma.from_documents(all_docs, embedding=embeddings, persist_directory=PERSIST_DIR)
    db.persist()

    print(f"✅ Done! Stored in {PERSIST_DIR}")

# Script entry point: build the vector store when run directly.
if __name__ == "__main__":
    process_files()