File size: 2,425 Bytes
91ca409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import ast
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Configurations
DATA_DIR = "data"  # folder scanned for input .csv files
PERSIST_DIR = "chroma_storage"  # on-disk directory for the persisted Chroma index
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # HuggingFace sentence-embedding model

def parse_pages(pages_str):
    """Parse a CSV 'pages' cell into a list of page numbers.

    Accepts a Python literal list/tuple (e.g. "[1, 2]" or "1, 2"), a
    single value (e.g. "5"), or a comma-separated digit string.
    Returns [] when the cell is missing (NaN) or unparseable.
    """
    if pd.isna(pages_str):
        return []
    try:
        # ast.literal_eval is a safe replacement for eval(): it only
        # accepts Python literals, so cell content can never execute code.
        pages = ast.literal_eval(str(pages_str))
    except (ValueError, SyntaxError):
        # Fallback: plain comma-separated digits, e.g. "12, 13".
        # Non-digit fragments are silently dropped.
        return [int(p.strip()) for p in str(pages_str).split(",") if p.strip().isdigit()]
    if isinstance(pages, (list, tuple)):
        # "1, 2" parses as a tuple; normalize to a flat list rather than
        # wrapping the tuple in a one-element list (old-code bug).
        return list(pages)
    return [pages]

def load_csv(filepath: str) -> list:
    """Load one CSV file into a list of langchain ``Document`` objects.

    Expects French-named columns (contenu, titre, chapitre, article,
    sous_titre1..3, pages, doc).  Rows with an empty 'contenu' cell are
    skipped.  Every metadata value is coerced to a plain string because
    Chroma rejects NaN/None metadata values.
    """
    df = pd.read_csv(filepath)
    # Normalize headers so the lookups below are case/whitespace-insensitive.
    df.columns = [c.strip().lower() for c in df.columns]

    def clean(value) -> str:
        # Empty CSV cells surface as NaN floats; Chroma metadata must be
        # str/int/float/bool, and NaN raises — map missing values to "".
        # str() also guards against non-string cells (e.g. numeric 'doc').
        return "" if pd.isna(value) else str(value).strip()

    # Default document name: the file name without its extension.
    fallback_doc = os.path.splitext(os.path.basename(filepath))[0]

    documents = []
    for _, row in df.iterrows():
        text = row.get('contenu', '')
        if pd.isna(text) or not str(text).strip():
            continue  # no body text -> nothing worth indexing

        meta = {
            # Use the file-derived name when the 'doc' cell is missing/blank.
            'doc': clean(row.get('doc')) or fallback_doc,
            'titre': clean(row.get('titre', '')),
            'chapitre': clean(row.get('chapitre', '')),
            'article': clean(row.get('article', '')),
            'sous_titre1': clean(row.get('sous_titre1', '')),
            'sous_titre2': clean(row.get('sous_titre2', '')),
            'sous_titre3': clean(row.get('sous_titre3', '')),
            # Stringify the page list to avoid Chroma's metadata-type error.
            'pages': ", ".join(map(str, parse_pages(row.get('pages', '')))),
        }

        documents.append(Document(page_content=str(text).strip(), metadata=meta))

    print(f"📄 Loaded: {os.path.basename(filepath)} → {len(documents)} documents")
    return documents

def process_files():
    """Index every CSV file under DATA_DIR into a persistent Chroma store.

    Reads all *.csv files, converts their rows to Documents via
    load_csv(), embeds them with EMBEDDING_MODEL, and persists the
    resulting vector store to PERSIST_DIR.
    """
    all_docs = []
    # sorted() makes ingestion order deterministic across platforms
    # (os.listdir order is filesystem-dependent).
    for file in sorted(os.listdir(DATA_DIR)):
        if file.endswith(".csv"):
            full_path = os.path.join(DATA_DIR, file)
            all_docs.extend(load_csv(full_path))

    print(f"📊 Total: {len(all_docs)} full documents")

    if not all_docs:
        # Chroma.from_documents raises on an empty list; fail gracefully
        # with a clear message instead of a stack trace.
        print("⚠️ No documents found — nothing to index.")
        return

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    db = Chroma.from_documents(all_docs, embedding=embeddings, persist_directory=PERSIST_DIR)
    db.persist()

    print(f"✅ Done! Stored in {PERSIST_DIR}")

# Script entry point: build the vector store when run directly.
if __name__ == "__main__":
    process_files()