File size: 6,414 Bytes
5d4054c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from haystack.document_stores import InMemoryDocumentStore
import pandas as pd
import os
import pathlib
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from langchain_community.document_loaders import DataFrameLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
)
from typing import Any


# Path to the CSV dataset ingested by get_document_store().
INC_TEST_DATASET_PATH = os.path.join("data", "inc_df.csv")
# Dimensionality of the embedding vectors held by the document store.
EMBEDDING_DIMENSION = 512

# Words containing apostrophes break ast.literal_eval on the stringified
# lists stored in the CSV, so they are temporarily swapped for apostrophe-free
# placeholders before parsing (see transform_to_list).
special_character_words_mapper = {
    "Côte D'Ivoire": "Côte DIvoire",
    "Ligue Camerounaise Des Droits De L'Homme": "Ligue Camerounaise Des Droits De LHomme",
    "Association Pour L'Integration Et La Developpement Durable Au Burundi": "Association Pour LIntegration Et La Developpement Durable Au Burundi",
}
# Inverse mapping (placeholder -> original) used to restore the words after
# parsing. Built with a dict comprehension instead of a manual loop.
special_character_words_reverse_mapper = {
    placeholder: original
    for original, placeholder in special_character_words_mapper.items()
}


def transform_to_list(row):
    """Parse a stringified Python list (e.g. "['a', 'b']") into a real list.

    Entries containing apostrophes (see special_character_words_mapper) are
    swapped for apostrophe-free placeholders before ast.literal_eval so the
    embedded quotes do not break parsing, then restored afterwards.
    Returns [] for empty-list strings and NaN values.
    """
    if str(row) in ("[]", "nan"):
        return []
    # Sanitize apostrophe-bearing words so literal_eval can parse the string.
    sanitized = False
    for original, placeholder in special_character_words_mapper.items():
        if original in row:
            row = row.replace(original, placeholder)
            sanitized = True
    parsed = ast.literal_eval(row)
    if sanitized:
        # Swap each placeholder entry back to its original spelling in place.
        for placeholder, original in special_character_words_reverse_mapper.items():
            if placeholder in parsed:
                parsed[parsed.index(placeholder)] = original
    return parsed


def transform_data(df: pd.DataFrame):
    """Clean the raw INC dataframe and convert each row to a langchain Document.

    Filters out non-indexed document (sub)types, normalizes text and author
    strings, one-hot encodes the multi-label ``draft_labs`` and ``author``
    columns (keeping the original stringified columns as well), and loads
    every row as a Document whose page content is the ``text`` column.

    Args:
        df: raw dataframe read from the INC CSV.

    Returns:
        list of langchain Document objects.
    """
    # Drop rows belonging to document categories we do not index.
    excluded_subtypes = [
        "Working documents",
        "Contact Groups",
        "Unsolicitated Submissions",
        "Stakeholder Dialogue",
    ]
    df = df[~df["doc_subtype"].isin(excluded_subtypes)]
    df = df[df["doc_type"] != "official document"]

    # Normalize text: remove Excel carriage-return artifacts and newlines.
    df["text"] = df["text"].astype(str).str.replace("_x000D_", " ")
    df["text"] = df["text"].astype(str).str.replace("\n", " ")
    # Normalize author names: non-breaking spaces, umlauts, stray \r.
    df["author"] = df["author"].str.replace("\xa0", " ")
    df["author"] = df["author"].str.replace("ü", "u")
    df["author"] = df["author"].str.strip()
    df["author"] = df["author"].astype(str).str.replace("\r", " ")

    # Keep only the columns the downstream pipeline consumes.
    df = df[
        [
            "author",
            "doc_type",
            "round",
            "text",
            "href",
            "draft_labs",
            "draft_cats",
            "retriever_id",
        ]
    ].copy()

    df = df.rename(columns={"text": "page_content"}).copy()

    # Back up the original stringified values so they can be restored after
    # the one-hot columns are derived from the parsed lists.
    df["draft_labs2"] = df["draft_labs"]
    df["author2"] = df["author"]

    df["draft_labs"] = df.apply(lambda x: transform_to_list(x["draft_labs"]), axis=1)
    df["author"] = df.apply(lambda x: transform_to_list(x["author"]), axis=1)

    # One-hot encode labels and authors into additional indicator columns.
    # (A previous dead `MultiLabelBinarizer(sparse_output=True)` assignment
    # was removed: it was immediately overwritten and never used.)
    mlb = MultiLabelBinarizer()
    df = df.join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("draft_labs")),
            columns=mlb.classes_,
            index=df.index,
        )
    ).join(
        pd.DataFrame(
            mlb.fit_transform(df.pop("author")), columns=mlb.classes_, index=df.index
        )
    )

    # Restore the original stringified columns and drop the backups.
    df["draft_labs"] = df["draft_labs2"]
    df = df.drop(columns=["draft_labs2"], axis=1)
    df["author"] = df["author2"]
    df = df.drop(columns=["author2"], axis=1)

    # Wrap every row as a langchain Document (page_content + metadata).
    loader = DataFrameLoader(df, page_content_column="page_content")
    return loader.load()


def process_data(docs):
    """Chunk langchain Documents and convert them into dicts for haystack.

    Splits each Document into ~512-character chunks with 10% overlap,
    recovers page content and metadata columns from the chunked Documents,
    prefixes each chunk's text with its author and draft_labs values, and
    returns a list of {"content": ..., "meta": ...} dicts suitable for
    InMemoryDocumentStore.write_documents.
    """

    chunk_size = 512
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),  # 10% overlap between chunks
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", " ", ""],
    )

    docs_chunked = text_splitter.transform_documents(docs)

    # NOTE(review): the section below recovers page_content/metadata by
    # string-manipulating the stringified form of each chunked Document, so
    # it depends on the exact langchain Document field/repr layout — verify
    # after any langchain upgrade.
    df = pd.DataFrame(docs_chunked, columns=["page_content", "metadata", "type"]).drop(
        "type", axis=1
    )
    df["page_content"] = df["page_content"].astype(str)
    # Strip the "'page_content'" key text, parentheses, and the leading
    # character left over from stringifying the Document field.
    df["page_content"] = df["page_content"].str.replace("'page_content'", "")
    df["page_content"] = df["page_content"].str.replace("(", "")
    df["page_content"] = df["page_content"].str.replace(")", "").str[1:]
    # Expand the metadata column into separate columns; the positional
    # columns 0/1 are renamed a/b and b is expanded again into the actual
    # metadata fields.
    df = pd.concat(
        [df.drop("metadata", axis=1), df["metadata"].apply(pd.Series)], axis=1
    )
    df = df.rename(columns={0: "a", 1: "b"})
    df = pd.concat([df.drop(["a", "b"], axis=1), df["b"].apply(pd.Series)], axis=1)

    # Flatten list-valued author/draft_labs cells into plain strings and
    # strip the list punctuation characters.
    cols = ["author", "draft_labs"]
    for c in cols:
        df[c] = df[c].apply(
            lambda x: "".join(x) if isinstance(x, (list, tuple)) else str(x)
        )
        chars = ["[", "]", "'"]
        for g in chars:
            df[c] = df[c].str.replace(g, "")

    # Remove residual line breaks from chunk text.
    df["page_content"] = df["page_content"].astype(str).str.replace("\n", " ")
    df["page_content"] = df["page_content"].astype(str).str.replace("\r", " ")

    # Prepend author and labels to the chunk text ("author | labs | text")
    # so this context is part of the embedded content.
    cols = ["author", "draft_labs", "page_content"]
    df["page_content"] = df[cols].apply(lambda row: " | ".join(row.astype(str)), axis=1)
    df = df.rename(columns={"page_content": "content"})

    # Build haystack-style dicts; every non-content column becomes metadata.
    # retriever_id is stringified so downstream comparisons are consistent.
    documents = []
    for _, row in df.iterrows():
        row_meta: dict[str, Any] = {}
        for column in df.columns:
            if column != "content":
                if column == "retriever_id":
                    row_meta[column] = str(row[column])
                else:
                    row_meta[column] = row[column]
        documents.append({"content": row["content"], "meta": row_meta})
    return documents


def get_document_store():
    """Read the INC CSV, transform and chunk it, and return a populated
    in-memory document store configured for embedding retrieval."""
    # Make sure the database directory exists before any writes happen.
    pathlib.Path("database").mkdir(parents=True, exist_ok=True)
    store = InMemoryDocumentStore(
        embedding_field="embedding",
        embedding_dim=EMBEDDING_DIMENSION,
        use_bm25=False,
    )
    raw_df = pd.read_csv(INC_TEST_DATASET_PATH)
    store.write_documents(process_data(docs=transform_data(df=raw_df)))
    return store