from haystack.document_stores import InMemoryDocumentStore
import pandas as pd
import os
import pathlib
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Any

INC_TEST_DATASET_PATH = os.path.join("data", "inc_df.csv")
EMBEDDING_DIMENSION = 512

# Apostrophes inside these names break ast.literal_eval on the stringified
# lists below, so they are temporarily rewritten without the apostrophe and
# restored afterwards.
special_character_words_mapper = {
    "Côte D'Ivoire": "Côte DIvoire",
    "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
    "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
}
special_character_words_reverse_mapper = {
    value: key for key, value in special_character_words_mapper.items()
}


def transform_to_list(row):
    """Parse a stringified list (e.g. "['a', 'b']") into a Python list."""
    special_characters = False
    if str(row) == "[]" or str(row) == "nan":
        return []
    # Temporarily remove apostrophes that would break literal_eval.
    for key, value in special_character_words_mapper.items():
        if key in row:
            row = row.replace(key, value)
            special_characters = True
    row = ast.literal_eval(row)
    if special_characters:
        # Restore the original spelling of the rewritten names.
        for key, value in special_character_words_reverse_mapper.items():
            if key in row:
                row[row.index(key)] = value
    return row


def transform_data(df: pd.DataFrame):
    """Filter the raw INC dataframe, one-hot encode the author and label
    columns, and load the rows as LangChain documents."""
    # Drop document types that should not be indexed.
    df = df[df["doc_subtype"] != "Working documents"]
    df = df[df["doc_subtype"] != "Contact Groups"]
    df = df[df["doc_subtype"] != "Unsolicitated Submissions"]
    df = df[df["doc_type"] != "official document"]
    df = df[df["doc_subtype"] != "Stakeholder Dialogue"]

    # Clean up the text and author fields.
    df["text"] = df["text"].astype(str).str.replace("_x000D_", " ")
    df["text"] = df["text"].astype(str).str.replace("\n", " ")
    df["author"] = df["author"].str.replace("\xa0", " ")
    df["author"] = df["author"].str.replace("ü", "u")
    df["author"] = df["author"].str.strip()
    df["author"] = df["author"].astype(str).str.replace("\r", " ")

    df = df[
        [
            "author",
            "doc_type",
            "round",
            "text",
            "href",
            "draft_labs",
            "draft_cats",
            "retriever_id",
        ]
    ].copy()
    df = df.rename(columns={"text": "page_content"})

    # Keep string copies so the original columns can be restored after
    # MultiLabelBinarizer consumes the parsed-list versions.
    df["draft_labs2"] = df["draft_labs"]
    df["author2"] = df["author"]
    df["draft_labs"] = df["draft_labs"].apply(transform_to_list)
    df["author"] = df["author"].apply(transform_to_list)

    # One-hot encode labels and authors into one column per class. Dense
    # output is required to build the joined DataFrames below.
    mlb = MultiLabelBinarizer()
    df = df.join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("draft_labs")),
            columns=mlb.classes_,
            index=df.index,
        )
    ).join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("author")),
            columns=mlb.classes_,
            index=df.index,
        )
    )

    # Restore the original string columns and drop the temporary copies.
    df["draft_labs"] = df["draft_labs2"]
    df["author"] = df["author2"]
    df = df.drop(columns=["draft_labs2", "author2"])

    loader = DataFrameLoader(df, page_content_column="page_content")
    return loader.load()


def process_data(docs):
    """Chunk the documents and flatten them into Haystack-style dicts."""
    chunk_size = 512
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", " ", ""],
    )
    docs_chunked = text_splitter.transform_documents(docs)

    # Each Document iterates as ("page_content", text), ("metadata", dict),
    # ("type", "Document") pairs, so building a DataFrame from the list puts
    # one such tuple in each cell; the string surgery below strips the tuple
    # wrapping from the page_content column.
    df = pd.DataFrame(docs_chunked, columns=["page_content", "metadata", "type"]).drop(
        "type", axis=1
    )
    df["page_content"] = df["page_content"].astype(str)
    df["page_content"] = df["page_content"].str.replace("'page_content'", "")
    df["page_content"] = df["page_content"].str.replace("(", "", regex=False)
    df["page_content"] = df["page_content"].str.replace(")", "", regex=False).str[1:]

    # Expand the ("metadata", {...}) tuples: column "a" holds the literal
    # string "metadata", column "b" the dict, which is then expanded into
    # one column per metadata key.
    df = pd.concat(
        [df.drop("metadata", axis=1), df["metadata"].apply(pd.Series)], axis=1
    )
    df = df.rename(columns={0: "a", 1: "b"})
    df = pd.concat([df.drop(["a", "b"], axis=1), df["b"].apply(pd.Series)], axis=1)

    # Normalize authors and labels back to plain strings without list syntax.
    for c in ["author", "draft_labs"]:
        df[c] = df[c].apply(
            lambda x: "".join(x) if isinstance(x, (list, tuple)) else str(x)
        )
        for g in ["[", "]", "'"]:
            df[c] = df[c].str.replace(g, "", regex=False)

    df["page_content"] = df["page_content"].astype(str).str.replace("\n", " ")
    df["page_content"] = df["page_content"].astype(str).str.replace("\r", " ")

    # Prefix each chunk with its author and labels so they are embedded too.
    cols = ["author", "draft_labs", "page_content"]
    df["page_content"] = df[cols].apply(lambda row: " | ".join(row.astype(str)), axis=1)
    df = df.rename(columns={"page_content": "content"})

    # Convert each row into {"content": ..., "meta": {...}} for the store.
    documents = []
    for _, row in df.iterrows():
        row_meta: dict[str, Any] = {}
        for column in df.columns:
            if column == "content":
                continue
            if column == "retriever_id":
                row_meta[column] = str(row[column])
            else:
                row_meta[column] = row[column]
        documents.append({"content": row["content"], "meta": row_meta})
    return documents


def get_document_store():
    """Read the INC dataset and build an in-memory document store from it."""
    df = pd.read_csv(INC_TEST_DATASET_PATH)
    pathlib.Path("database").mkdir(parents=True, exist_ok=True)
    document_store = InMemoryDocumentStore(
        embedding_field="embedding", embedding_dim=EMBEDDING_DIMENSION, use_bm25=False
    )
    docs = transform_data(df=df)
    document_store.write_documents(process_data(docs=docs))
    return document_store