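# NOTE: the following text appears to be residue from the page this script was
# copied from, not part of the program: "Spaces: Runtime error / Runtime error"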
import ast
import os

import pandas as pd
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
# Configuration
DATA_DIR = "data"  # directory listed for *.csv input files
PERSIST_DIR = "chroma_storage"  # directory passed to Chroma as persist_directory
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # model name given to HuggingFaceEmbeddings
def parse_pages(pages_str):
    """Parse a raw ``pages`` cell into a list of page values.

    The cell may hold a Python literal (``"[1, 2]"``, ``"(1, 2)"``, ``"7"``)
    or a comma-separated string from which the purely numeric items are kept
    (``"3, four, 5"`` -> ``[3, 5]``).

    Args:
        pages_str: Raw cell value: a string, a number, ``None``/NaN, or an
            already-parsed list/tuple.

    Returns:
        A list of page values; empty when the cell is missing or cannot be
        parsed.
    """
    # An existing sequence is returned as a list; pd.isna() would yield an
    # array for it and make the truth test below ambiguous.
    if isinstance(pages_str, (list, tuple)):
        return list(pages_str)
    if pd.isna(pages_str):
        return []

    text = str(pages_str).strip()

    try:
        # literal_eval only accepts Python literals, so cell content can
        # never execute code (the previous eval() call could).
        pages = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal: fall through to the comma-separated parser below.
        pass
    else:
        # "1, 2" parses as a tuple; flatten it instead of wrapping it.
        if isinstance(pages, (list, tuple)):
            return list(pages)
        return [pages]

    try:
        return [
            int(part)
            for part in map(str.strip, text.split(","))
            if part.isdigit()
        ]
    except ValueError:
        # isdigit() accepts a few characters (e.g. superscripts) int() rejects.
        return []
def _meta_value(value):
    """Return *value* unchanged, or ``''`` when the cell is missing (None/NaN)."""
    return '' if pd.isna(value) else value


def load_csv(filepath: str) -> list:
    """Load one CSV file and build a ``Document`` for every row with content.

    Column names are stripped and lower-cased before lookup. Rows whose
    ``contenu`` cell is missing or blank are skipped. Each document carries
    the metadata keys ``doc``, ``titre``, ``chapitre``, ``article``,
    ``sous_titre1``-``sous_titre3`` and ``pages``.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A list of ``Document`` objects whose ``page_content`` is the stripped
        ``contenu`` text.
    """
    df = pd.read_csv(filepath)
    # str() guards against non-string column labels.
    df.columns = [str(col).strip().lower() for col in df.columns]

    # Used as the 'doc' metadata when a row has no usable 'doc' cell.
    fallback_doc = os.path.splitext(os.path.basename(filepath))[0]

    documents = []
    for _, row in df.iterrows():
        text = row.get('contenu', '')
        if pd.isna(text) or not str(text).strip():
            continue

        # Stringify before stripping so a numeric 'doc' cell does not raise.
        raw_doc = row.get('doc')
        doc_name = '' if pd.isna(raw_doc) else str(raw_doc).strip()

        meta = {
            'doc': doc_name or fallback_doc,
            # Empty cells come back from pandas as NaN; store '' instead.
            'titre': _meta_value(row.get('titre', '')),
            'chapitre': _meta_value(row.get('chapitre', '')),
            'article': _meta_value(row.get('article', '')),
            'sous_titre1': _meta_value(row.get('sous_titre1', '')),
            'sous_titre2': _meta_value(row.get('sous_titre2', '')),
            'sous_titre3': _meta_value(row.get('sous_titre3', '')),
            # Stringified to avoid a Chroma error on list-valued metadata.
            'pages': ", ".join(map(str, parse_pages(row.get('pages', '')))),
        }
        documents.append(Document(page_content=str(text).strip(), metadata=meta))

    # NOTE(review): 'π' and 'β' below look like mis-decoded emoji/arrow
    # characters; kept as-is, confirm the intended glyphs.
    print(f"π Loaded: {os.path.basename(filepath)} β {len(documents)} documents")
    return documents
def process_files():
    """Index every CSV file in ``DATA_DIR`` into a Chroma store at ``PERSIST_DIR``.

    Files are loaded with ``load_csv`` in sorted name order, embedded with the
    ``EMBEDDING_MODEL`` HuggingFace model and written to ``PERSIST_DIR``.
    When no documents are found, nothing is embedded or stored.

    Raises:
        FileNotFoundError: If ``DATA_DIR`` does not exist (from ``os.listdir``).
    """
    # Sorted so the indexing order does not depend on the filesystem's
    # listing order; the extension match is case-insensitive.
    csv_names = sorted(
        name for name in os.listdir(DATA_DIR) if name.lower().endswith(".csv")
    )

    all_docs = []
    for name in csv_names:
        all_docs.extend(load_csv(os.path.join(DATA_DIR, name)))
    # NOTE(review): 'π' / 'β' in the messages below look like mis-decoded
    # emoji; kept as-is, confirm the intended glyphs.
    print(f"π Total: {len(all_docs)} full documents")

    if not all_docs:
        # Skip loading the embedding model and creating a store for no data.
        print(f"No documents found in {DATA_DIR}; nothing stored")
        return

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    db = Chroma.from_documents(
        all_docs, embedding=embeddings, persist_directory=PERSIST_DIR
    )
    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # deprecate this call; verify against the installed version. Guarded so a
    # wrapper without persist() does not crash after the index is built.
    if hasattr(db, "persist"):
        db.persist()
    print(f"β Done! Stored in {PERSIST_DIR}")
if __name__ == "__main__":
    # Build the vector store only when executed as a script, not on import.
    process_files()