# Hugging Face Spaces page header ("Spaces: Running on CPU Upgrade") — scrape
# residue from the hosting page, kept as a comment so the module stays importable.
from haystack.document_stores import InMemoryDocumentStore
import pandas as pd
import os
import pathlib
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)
from typing import Any
# Location of the INC test dataset and the embedding size used by the store.
INC_TEST_DATASET_PATH = os.path.join("data", "inc_df.csv")
EMBEDDING_DIMENSION = 512

# Names containing apostrophes break parsing of the stringified lists in the
# CSV, so they are temporarily swapped for apostrophe-free placeholders and
# restored afterwards (see transform_to_list).
special_character_words_mapper = {
    "Côte D'Ivoire": "Côte DIvoire",
    "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
    "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
}
# Inverse lookup: placeholder -> original spelling.
special_character_words_reverse_mapper = {
    placeholder: original
    for original, placeholder in special_character_words_mapper.items()
}
def transform_to_list(row):
    """Parse a stringified list from the CSV into a real Python list.

    Empty lists and NaN values come back as ``[]``. Entries containing
    apostrophes (see ``special_character_words_mapper``) are masked with
    apostrophe-free placeholders before ``ast.literal_eval`` and unmasked
    afterwards, because the raw apostrophes corrupt the list's quoting.
    """
    # Guard clause: nothing to parse for an empty list or a stringified NaN.
    if str(row) in ("[]", "nan"):
        return []

    had_placeholders = False
    # Mask apostrophe-bearing names with safe placeholders.
    for original, placeholder in special_character_words_mapper.items():
        if original in row:
            row = row.replace(original, placeholder)
            had_placeholders = True

    row = ast.literal_eval(row)

    if had_placeholders:
        # Restore original spellings (first occurrence of each placeholder).
        for placeholder, original in special_character_words_reverse_mapper.items():
            if placeholder in row:
                position = row.index(placeholder)
                row[position] = original
    return row
def transform_data(df: pd.DataFrame):
    """Clean and reshape the raw INC dataframe into LangChain documents.

    Drops document types/subtypes that should not be indexed, normalises
    whitespace/encoding artefacts in the text and author columns, one-hot
    encodes the multi-label ``draft_labs`` and ``author`` columns (while
    keeping their original string forms), and loads the result through
    ``DataFrameLoader``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataframe read from the INC CSV export.

    Returns
    -------
    list
        LangChain ``Document`` objects, one per row, with ``page_content``
        taken from the ``text`` column and every other column as metadata.
    """
    # Exclude rows that should not be searchable. NaN subtypes are kept,
    # matching the original per-value "!=" filters.
    excluded_subtypes = (
        "Working documents",
        "Contact Groups",
        "Unsolicitated Submissions",
        "Stakeholder Dialogue",
    )
    df = df[~df["doc_subtype"].isin(excluded_subtypes)]
    df = df[df["doc_type"] != "official document"]

    # Normalise whitespace/encoding artefacts. regex=False: these are
    # literal substrings, not regex patterns.
    df["text"] = df["text"].astype(str).str.replace("_x000D_", " ", regex=False)
    df["text"] = df["text"].astype(str).str.replace("\n", " ", regex=False)
    df["author"] = df["author"].str.replace("\xa0", " ", regex=False)
    df["author"] = df["author"].str.replace("ü", "u", regex=False)
    df["author"] = df["author"].str.strip()
    df["author"] = df["author"].astype(str).str.replace("\r", " ", regex=False)

    df = df[
        [
            "author",
            "doc_type",
            "round",
            "text",
            "href",
            "draft_labs",
            "draft_cats",
            "retriever_id",
        ]
    ].copy()
    df = df.rename(columns={"text": "page_content"}).copy()

    # Keep the original string form of the multi-label columns so they can
    # be restored after the one-hot columns are joined in.
    df["draft_labs2"] = df["draft_labs"]
    df["author2"] = df["author"]
    df["draft_labs"] = df.apply(lambda x: transform_to_list(x["draft_labs"]), axis=1)
    df["author"] = df.apply(lambda x: transform_to_list(x["author"]), axis=1)

    # One-hot encode both multi-label columns. (The original created a
    # sparse_output=True binarizer and immediately overwrote it — dead
    # code, removed here; a single dense binarizer is reused for both.)
    mlb = MultiLabelBinarizer()
    df = df.join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("draft_labs")),
            columns=mlb.classes_,
            index=df.index,
        )
    ).join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("author")), columns=mlb.classes_, index=df.index
        )
    )

    # Restore the original string columns and drop the temporaries.
    df["draft_labs"] = df["draft_labs2"]
    df = df.drop(columns=["draft_labs2"], axis=1)
    df["author"] = df["author2"]
    df = df.drop(columns=["author2"], axis=1)

    loader = DataFrameLoader(df, page_content_column="page_content")
    docs = loader.load()
    return docs
def process_data(docs):
    """Chunk LangChain documents and convert them to Haystack write dicts.

    Splits each document into ~512-character chunks with 10% overlap,
    flattens the chunk metadata back into columns, prefixes each chunk's
    text with its author and draft labels, and returns ``{"content",
    "meta"}`` dicts ready for ``DocumentStore.write_documents``.

    Parameters
    ----------
    docs : list
        LangChain ``Document`` objects (output of ``transform_data``).

    Returns
    -------
    list[dict]
        One dict per chunk with keys ``content`` and ``meta``.
    """
    chunk_size = 512
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between chunks
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", " ", ""],
    )
    docs_chunked = text_splitter.transform_documents(docs)

    # Documents unpack as (page_content, metadata, type) tuples; "type" is
    # constant and dropped.
    df = pd.DataFrame(docs_chunked, columns=["page_content", "metadata", "type"]).drop(
        "type", axis=1
    )
    # Strip repr artefacts around the raw text. regex=False is required:
    # "(" and ")" are invalid regex patterns and raise re.error on pandas
    # versions whose str.replace defaults to regex=True.
    df["page_content"] = df["page_content"].astype(str)
    df["page_content"] = df["page_content"].str.replace(
        "'page_content'", "", regex=False
    )
    df["page_content"] = df["page_content"].str.replace("(", "", regex=False)
    df["page_content"] = df["page_content"].str.replace(")", "", regex=False).str[1:]

    # Expand the metadata tuple, then the metadata dict, into columns.
    df = pd.concat(
        [df.drop("metadata", axis=1), df["metadata"].apply(pd.Series)], axis=1
    )
    df = df.rename(columns={0: "a", 1: "b"})
    df = pd.concat([df.drop(["a", "b"], axis=1), df["b"].apply(pd.Series)], axis=1)

    # Flatten list-valued label/author cells to plain strings, and strip
    # list punctuation from EACH of these columns (loop nested on purpose
    # so the stripping is not applied to only the last column).
    cols = ["author", "draft_labs"]
    for col in cols:
        df[col] = df[col].apply(
            lambda x: "".join(x) if isinstance(x, (list, tuple)) else str(x)
        )
        for token in ("[", "]", "'"):
            df[col] = df[col].str.replace(token, "", regex=False)

    df["page_content"] = (
        df["page_content"].astype(str).str.replace("\n", " ", regex=False)
    )
    df["page_content"] = (
        df["page_content"].astype(str).str.replace("\r", " ", regex=False)
    )

    # Prefix each chunk with its author and labels so they get embedded
    # together with the text.
    cols = ["author", "draft_labs", "page_content"]
    df["page_content"] = df[cols].apply(lambda row: " | ".join(row.astype(str)), axis=1)
    df = df.rename(columns={"page_content": "content"})

    # Build Haystack-style write dicts; retriever_id is stringified so it
    # round-trips consistently through the document store.
    documents = []
    for _, row in df.iterrows():
        row_meta: dict[str, Any] = {}
        for column in df.columns:
            if column == "content":
                continue
            if column == "retriever_id":
                row_meta[column] = str(row[column])
            else:
                row_meta[column] = row[column]
        documents.append({"content": row["content"], "meta": row_meta})
    return documents
def get_document_store():
    """Load the INC CSV and return an in-memory store filled with its documents.

    Reads the test dataset, ensures the local ``database`` directory exists,
    then writes the transformed and chunked documents into a fresh
    ``InMemoryDocumentStore`` (BM25 disabled, 512-dim embedding field).
    """
    raw_df = pd.read_csv(INC_TEST_DATASET_PATH)
    pathlib.Path("database").mkdir(parents=True, exist_ok=True)
    store = InMemoryDocumentStore(
        embedding_field="embedding",
        embedding_dim=EMBEDDING_DIMENSION,
        use_bm25=False,
    )
    transformed_docs = transform_data(df=raw_df)
    store.write_documents(process_data(docs=transformed_docs))
    return store