# NegotiateAI / src / document_store / document_store.py
from haystack.document_stores import InMemoryDocumentStore
import pandas as pd
import os
import pathlib
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
)
from typing import Any
# Path to the CSV dataset the document store is built from.
INC_TEST_DATASET_PATH = os.path.join("data", "inc_df.csv")

# Dimensionality of the embedding vectors the store is configured for.
EMBEDDING_DIMENSION = 512

# Names containing apostrophes break ast.literal_eval on the stringified
# list columns in the CSV, so they are temporarily swapped for
# apostrophe-free placeholders while parsing (see transform_to_list).
special_character_words_mapper = {
    "Côte D'Ivoire": "Côte DIvoire",
    "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
    "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
}

# Inverse lookup used to restore the original spellings after parsing.
special_character_words_reverse_mapper = {
    placeholder: original
    for original, placeholder in special_character_words_mapper.items()
}
def transform_to_list(row):
    """Parse a stringified Python list from a CSV cell into a real list.

    Apostrophes inside known organisation/country names break
    ``ast.literal_eval``, so those names are swapped for apostrophe-free
    placeholders before parsing and restored afterwards.  Empty (``"[]"``)
    or NaN cells yield an empty list.
    """
    if str(row) in ("[]", "nan"):
        return []

    # Mask the apostrophe-containing names so literal_eval can parse the row.
    masked_any = False
    for original, placeholder in special_character_words_mapper.items():
        if original in row:
            row = row.replace(original, placeholder)
            masked_any = True

    parsed = ast.literal_eval(row)

    # Restore the original spellings for any entries that were masked.
    if masked_any:
        for placeholder, original in special_character_words_reverse_mapper.items():
            if placeholder in parsed:
                parsed[parsed.index(placeholder)] = original
    return parsed
def transform_data(df: pd.DataFrame):
    """Clean and reshape the raw INC dataframe into langchain Documents.

    Filters out procedural/non-substantive document types, normalises the
    text and author strings, parses the stringified ``draft_labs`` and
    ``author`` list columns, one-hot encodes both (the indicator columns
    become per-document metadata), and loads each row as a langchain
    ``Document`` whose page content is the ``text`` column.

    Args:
        df: Raw dataframe read from the INC CSV.  Must contain the columns
            ``doc_type``, ``doc_subtype``, ``text``, ``author``, ``round``,
            ``href``, ``draft_labs``, ``draft_cats`` and ``retriever_id``.

    Returns:
        List of langchain ``Document`` objects.
    """
    # Drop working/procedural documents that carry no negotiation content.
    excluded_subtypes = (
        "Working documents",
        "Contact Groups",
        "Unsolicitated Submissions",
        "Stakeholder Dialogue",
    )
    df = df[~df["doc_subtype"].isin(excluded_subtypes)]
    df = df[df["doc_type"] != "official document"]

    # Strip Excel carriage-return artefacts ("_x000D_") and newlines.
    df["text"] = df["text"].astype(str).str.replace("_x000D_", " ")
    df["text"] = df["text"].astype(str).str.replace("\n", " ")

    # Normalise author strings: non-breaking spaces, umlauts, stray
    # whitespace and carriage returns.
    df["author"] = df["author"].str.replace("\xa0", " ")
    df["author"] = df["author"].str.replace("ü", "u")
    df["author"] = df["author"].str.strip()
    df["author"] = df["author"].astype(str).str.replace("\r", " ")

    df = df[
        [
            "author",
            "doc_type",
            "round",
            "text",
            "href",
            "draft_labs",
            "draft_cats",
            "retriever_id",
        ]
    ].copy()
    df = df.rename(columns={"text": "page_content"})

    # Keep the raw string versions: the list versions are consumed (popped)
    # by the one-hot encoding below and the strings are restored afterwards.
    df["draft_labs_raw"] = df["draft_labs"]
    df["author_raw"] = df["author"]
    df["draft_labs"] = df["draft_labs"].apply(transform_to_list)
    df["author"] = df["author"].apply(transform_to_list)

    # One-hot encode labels and authors; the indicator columns end up in
    # each Document's metadata.  (A single dense MultiLabelBinarizer is
    # enough — the previous sparse instance was dead code, immediately
    # overwritten.)
    mlb = MultiLabelBinarizer()
    df = df.join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("draft_labs")),
            columns=mlb.classes_,
            index=df.index,
        )
    ).join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("author")),
            columns=mlb.classes_,
            index=df.index,
        )
    )

    # Restore the human-readable string columns (appended last, matching
    # the original column order).
    df["draft_labs"] = df.pop("draft_labs_raw")
    df["author"] = df.pop("author_raw")

    loader = DataFrameLoader(df, page_content_column="page_content")
    return loader.load()
def process_data(docs):
    """Chunk langchain Documents and convert them into Haystack write dicts.

    Splits each document into ~512-character chunks with 10% overlap,
    recovers the chunk text and metadata into dataframe columns, prefixes
    every chunk's text with its author and draft labels, and returns a list
    of ``{"content": ..., "meta": ...}`` dicts suitable for
    ``DocumentStore.write_documents``.
    """
    chunk_size = 512
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between chunks
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", " ", ""],
    )
    docs_chunked = text_splitter.transform_documents(docs)
    # NOTE(review): the chunked Documents are coerced into a dataframe and
    # the text is recovered by string-mangling the stringified Document
    # below — this assumes a specific langchain tuple/repr layout; verify
    # against the installed langchain version.
    df = pd.DataFrame(docs_chunked, columns=["page_content", "metadata", "type"]).drop(
        "type", axis=1
    )
    df["page_content"] = df["page_content"].astype(str)
    # Strip the "'page_content'" key, the surrounding parentheses, and the
    # leading character left over from stringifying the tuple.
    df["page_content"] = df["page_content"].str.replace("'page_content'", "")
    df["page_content"] = df["page_content"].str.replace("(", "")
    df["page_content"] = df["page_content"].str.replace(")", "").str[1:]
    # Expand the metadata dicts into their own columns.
    df = pd.concat(
        [df.drop("metadata", axis=1), df["metadata"].apply(pd.Series)], axis=1
    )
    df = df.rename(columns={0: "a", 1: "b"})
    df = pd.concat([df.drop(["a", "b"], axis=1), df["b"].apply(pd.Series)], axis=1)
    # Flatten list-valued author/label columns to plain strings, then strip
    # the list punctuation characters.
    cols = ["author", "draft_labs"]
    for c in cols:
        df[c] = df[c].apply(
            lambda x: "".join(x) if isinstance(x, (list, tuple)) else str(x)
        )
        chars = ["[", "]", "'"]
        for g in chars:
            df[c] = df[c].str.replace(g, "")
    df["page_content"] = df["page_content"].astype(str).str.replace("\n", " ")
    df["page_content"] = df["page_content"].astype(str).str.replace("\r", " ")
    # Prefix each chunk with its author and labels ("author | labs | text")
    # so that retrieval can match on them.
    cols = ["author", "draft_labs", "page_content"]
    df["page_content"] = df[cols].apply(lambda row: " | ".join(row.astype(str)), axis=1)
    df = df.rename(columns={"page_content": "content"})
    # Build write_documents payloads; every non-content column becomes
    # metadata, with retriever_id coerced to str.
    documents = []
    for _, row in df.iterrows():
        row_meta: dict[str, Any] = {}
        for column in df.columns:
            if column != "content":
                if column == "retriever_id":
                    row_meta[column] = str(row[column])
                else:
                    row_meta[column] = row[column]
        documents.append({"content": row["content"], "meta": row_meta})
    return documents
def get_document_store():
    """Build an in-memory document store from the INC CSV dataset.

    Reads the raw CSV, transforms it into langchain Documents, chunks and
    flattens them, and writes the result into a fresh
    ``InMemoryDocumentStore`` configured for 512-dimensional embeddings
    with BM25 disabled.
    """
    raw_df = pd.read_csv(INC_TEST_DATASET_PATH)
    # NOTE(review): this directory is created but never referenced in this
    # module — presumably needed by another component; confirm before
    # removing.
    pathlib.Path("database").mkdir(parents=True, exist_ok=True)
    store = InMemoryDocumentStore(
        embedding_field="embedding",
        embedding_dim=EMBEDDING_DIMENSION,
        use_bm25=False,
    )
    store.write_documents(process_data(docs=transform_data(df=raw_df)))
    return store