Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 6,414 Bytes
5d4054c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
from haystack.document_stores import InMemoryDocumentStore
import pandas as pd
import os
import pathlib
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
)
from typing import Any
# Location of the raw INC dataset and the embedding vector size used downstream.
INC_TEST_DATASET_PATH = os.path.join("data", "inc_df.csv")
EMBEDDING_DIMENSION = 512

# Names containing apostrophes break ast.literal_eval on the single-quoted,
# stringified lists stored in the CSV, so they are temporarily swapped for
# apostrophe-free placeholders and restored after parsing.
special_character_words_mapper = {
    "Côte D'Ivoire": "Côte DIvoire",
    "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
    "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
}

# Inverse lookup: placeholder -> original spelling.
special_character_words_reverse_mapper = {
    placeholder: original
    for original, placeholder in special_character_words_mapper.items()
}
def transform_to_list(row):
    """Parse a stringified Python list from the CSV into a real list.

    Entries containing apostrophes (e.g. "Côte D'Ivoire") would break
    ``ast.literal_eval`` on the single-quoted string representation, so they
    are swapped for apostrophe-free placeholders before parsing and restored
    afterwards.

    Args:
        row: A stringified list such as ``"['a', 'b']"``, the literal ``"[]"``,
            or a NaN float (stringifies to ``"nan"``).

    Returns:
        A list of strings; empty for ``"[]"`` / NaN input.
    """
    if str(row) in ("[]", "nan"):
        return []
    # Swap apostrophe-containing names for their safe placeholders.
    for original, placeholder in special_character_words_mapper.items():
        if original in row:
            row = row.replace(original, placeholder)
    items = ast.literal_eval(row)
    # Restore the original spelling for EVERY placeholder occurrence.
    # (The previous version used list.index(), which only restored the
    # first occurrence of a duplicated placeholder.)
    return [special_character_words_reverse_mapper.get(item, item) for item in items]
def transform_data(df: pd.DataFrame):
    """Clean the raw INC dataframe and convert it to langchain Documents.

    Drops unwanted document categories, normalizes text/author strings,
    one-hot-encodes the ``draft_labs`` and ``author`` list columns as extra
    indicator columns (while keeping the original string columns), and loads
    each row as a langchain Document with ``page_content`` as the body.

    Args:
        df: Raw dataframe read from the INC CSV; must contain the columns
            selected below (author, doc_type, round, text, href, draft_labs,
            draft_cats, retriever_id, doc_subtype).

    Returns:
        A list of langchain ``Document`` objects with per-row metadata.
    """
    # Filter out document categories that should not be indexed.
    df = df[df["doc_subtype"] != "Working documents"]
    df = df[df["doc_subtype"] != "Contact Groups"]
    df = df[df["doc_subtype"] != "Unsolicitated Submissions"]
    df = df[df["doc_type"] != "official document"]
    df = df[df["doc_subtype"] != "Stakeholder Dialogue"]
    # Normalize whitespace and export artifacts (_x000D_ is Excel's CR escape).
    df["text"] = df["text"].astype(str).str.replace("_x000D_", " ")
    df["text"] = df["text"].astype(str).str.replace("\n", " ")
    df["author"] = df["author"].str.replace("\xa0", " ")
    df["author"] = df["author"].str.replace("ü", "u")
    df["author"] = df["author"].str.strip()
    df["author"] = df["author"].astype(str).str.replace("\r", " ")
    df = df[
        [
            "author",
            "doc_type",
            "round",
            "text",
            "href",
            "draft_labs",
            "draft_cats",
            "retriever_id",
        ]
    ].copy()
    df = df.rename(columns={"text": "page_content"}).copy()
    # Keep string copies: the list columns are consumed (pop) by the
    # binarization below and restored from these temporaries afterwards.
    df["draft_labs2"] = df["draft_labs"]
    df["author2"] = df["author"]
    df["draft_labs"] = df.apply(lambda x: transform_to_list(x["draft_labs"]), axis=1)
    df["author"] = df.apply(lambda x: transform_to_list(x["author"]), axis=1)
    # One-hot encode labels and authors as indicator columns.
    # (Removed a dead `MultiLabelBinarizer(sparse_output=True)` assignment
    # that was immediately overwritten by the dense one below.)
    mlb = MultiLabelBinarizer()
    df = df.join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("draft_labs")),
            columns=mlb.classes_,
            index=df.index,
        )
    ).join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("author")), columns=mlb.classes_, index=df.index
        )
    )
    # Restore the original string columns and drop the temporaries.
    df["draft_labs"] = df["draft_labs2"]
    df = df.drop(columns=["draft_labs2"])
    df["author"] = df["author2"]
    df = df.drop(columns=["author2"])
    loader = DataFrameLoader(df, page_content_column="page_content")
    return loader.load()
def process_data(docs):
    """Chunk langchain Documents and convert them to haystack-style dicts.

    Splits each document into ~512-character chunks with 10% overlap, flattens
    the chunk objects back into a dataframe, prefixes each chunk's text with
    its author and draft_labs metadata, and emits one
    ``{"content": ..., "meta": {...}}`` dict per chunk.

    Args:
        docs: List of langchain ``Document`` objects (see ``transform_data``).

    Returns:
        List of dicts with ``"content"`` (str) and ``"meta"`` (dict) keys,
        suitable for ``DocumentStore.write_documents``.
    """
    chunk_size = 512
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between chunks
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", " ", ""],
    )
    docs_chunked = text_splitter.transform_documents(docs)
    # NOTE(review): the steps below round-trip the chunk objects through a
    # dataframe and their string representation to recover text + metadata;
    # this presumably depends on the exact (page_content, metadata, type)
    # field layout of langchain Documents — confirm before upgrading langchain.
    df = pd.DataFrame(docs_chunked, columns=["page_content", "metadata", "type"]).drop(
        "type", axis=1
    )
    df["page_content"] = df["page_content"].astype(str)
    # Strip the "('page_content', ...)" wrapper left by stringification.
    df["page_content"] = df["page_content"].str.replace("'page_content'", "")
    df["page_content"] = df["page_content"].str.replace("(", "")
    df["page_content"] = df["page_content"].str.replace(")", "").str[1:]
    # Expand the metadata column into individual columns.
    df = pd.concat(
        [df.drop("metadata", axis=1), df["metadata"].apply(pd.Series)], axis=1
    )
    df = df.rename(columns={0: "a", 1: "b"})
    df = pd.concat([df.drop(["a", "b"], axis=1), df["b"].apply(pd.Series)], axis=1)
    # Flatten list-valued columns to plain strings and drop list punctuation.
    cols = ["author", "draft_labs"]
    for c in cols:
        df[c] = df[c].apply(
            lambda x: "".join(x) if isinstance(x, (list, tuple)) else str(x)
        )
        chars = ["[", "]", "'"]
        for g in chars:
            df[c] = df[c].str.replace(g, "")
    df["page_content"] = df["page_content"].astype(str).str.replace("\n", " ")
    df["page_content"] = df["page_content"].astype(str).str.replace("\r", " ")
    # Prepend author and labels to the chunk text ("author | labs | text")
    # so that this metadata is part of the embedded content.
    cols = ["author", "draft_labs", "page_content"]
    df["page_content"] = df[cols].apply(lambda row: " | ".join(row.astype(str)), axis=1)
    df = df.rename(columns={"page_content": "content"})
    documents = []
    for _, row in df.iterrows():
        row_meta: dict[str, Any] = {}
        for column in df.columns:
            if column != "content":
                if column == "retriever_id":
                    # Coerced to str so downstream id comparisons are uniform.
                    row_meta[column] = str(row[column])
                else:
                    row_meta[column] = row[column]
        documents.append({"content": row["content"], "meta": row_meta})
    return documents
def get_document_store():
    """Build an in-memory document store populated from the INC CSV.

    Reads the dataset at ``INC_TEST_DATASET_PATH``, transforms and chunks it,
    and writes the resulting documents into a haystack
    ``InMemoryDocumentStore`` configured for ``EMBEDDING_DIMENSION``-dim
    embeddings with BM25 disabled.

    Returns:
        The populated ``InMemoryDocumentStore``.
    """
    raw_df = pd.read_csv(INC_TEST_DATASET_PATH)
    # Ensure the local database directory exists (side effect only).
    pathlib.Path("database").mkdir(parents=True, exist_ok=True)
    store = InMemoryDocumentStore(
        embedding_field="embedding",
        embedding_dim=EMBEDDING_DIMENSION,
        use_bm25=False,
    )
    store.write_documents(process_data(docs=transform_data(df=raw_df)))
    return store
|