File size: 9,152 Bytes
dc5bb62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
"""
This file contains some functions used to analyze the data from requests and interventions.
"""

import re
import datetime as dt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F


# Keyword -> canonical trilingual (FR / EN / AR) category label.
# Used by classify_supplies_rule_based for substring matching, so keywords
# are lowercase. Several keywords map to the same category on purpose.
SUPPLIES_TAGS = {
        'alimentation': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
        'eau': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
        'food': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
        'water': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
        'nourriture': 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
        'medical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
        'médical': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
        'doctor': 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
        'vêtements': 'VÊTEMENTS / CLOTHES / الملابس',
        'clothes': 'VÊTEMENTS / CLOTHES / الملابس',
        'secours': 'SECOURS / RESCUE / الإنقاذ',
        'rescue': 'SECOURS / RESCUE / الإنقاذ',
        'refuge': 'REFUGE / SHELTER / المأوى',
        'shelter': 'REFUGE / SHELTER / المأوى',
        'couvertures': 'COUVERTURES / COVERS / البطانيات',
        'covers': 'COUVERTURES / COVERS / البطانيات',
        'pharmaceuticals': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
        'medicaments': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
        'pharmacy': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
        'medicine': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
        'blankets': 'COUVERTURES / COVERS / البطانيات',
        'tents': 'REFUGE / SHELTER / المأوى',
        # NOTE(review): 'couches' (FR: diapers) mapped to pharmaceuticals —
        # presumably intentional (closest available category); confirm.
        'couches': 'PHARMACEUTICALS / MEDICAMENTS / الأدوية'
    }

# Closed set of category labels used by the e5-based classifier; must stay in
# sync with the values of SUPPLIES_TAGS (plus the 'OTHER' fallback).
SUPPLIES_NEEDS_CATEGORIES = ['ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء',
                       'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية',
                       'VÊTEMENTS / CLOTHES / الملابس',
                       'SECOURS / RESCUE / الإنقاذ',
                       'REFUGE / SHELTER / المأوى',
                       'COUVERTURES / COVERS / البطانيات',
                       # 'KITCHEN TOOLS / USTENSILES DE CUISINE / أدوات المطبخ',
                       'PHARMACEUTICALS / MEDICAMENTS / الأدوية',
                       'OTHER']

# Ad-hoc Arabic -> English glossary for common supply terms seen in the data.
TRANSLATION_DICT = {
    'أغطية': 'covers',
    'أسرة': 'beds',
    'وسادات': 'pillows',
    'مصابح': 'lamps',
    'خيام': 'tents',
    'ألعاب أطفال': 'toys',
    'قليل من المواد الغذائية': 'food',
    'افرشة': 'covers',
    'جلباب': 'clothes',
    'ملابس': 'clothes',
    'لديهم كل شيء': 'unknown'
}


def clean_text(text):
    """
    Strip special characters from ``text``: left-to-right / right-to-left
    marks (U+200E / U+200F), non-breaking spaces (U+00A0) and parentheses.
    """
    return re.sub(r'[\u200e\xa0()\u200f]', '', text)


def contains_arabic(text):
    """
    Return True if ``text`` is a string containing at least one character
    in the Arabic Unicode block (U+0600–U+06FF).

    Non-string inputs (e.g. NaN coming from a pandas column) return False.
    """
    # Guard first: avoid doing any regex work on non-strings, and use
    # isinstance instead of the `type(x) != str` anti-pattern.
    if not isinstance(text, str):
        return False
    return re.search(r'[\u0600-\u06FF]', text) is not None


def arabic_to_latin_punctuation(text):
    """
    Replace Arabic punctuation marks in ``text`` with their Latin
    equivalents (comma, semicolon, underscore, question mark, percent,
    decimal point).
    """
    # One C-level pass with str.translate instead of six chained
    # .replace() calls; mapping is unchanged from the original.
    table = str.maketrans({
        '،': ',',
        '؛': ';',
        'ـ': '_',
        '؟': '?',
        '٪': '%',
        '٫': '.',
    })
    return text.translate(table)


def plot_timeline(df: pd.DataFrame, today: dt.datetime, date_col: str):
    """Plot the daily timeline of past and future requests/interventions.

    Parameters
    ----------
    df : pd.DataFrame
        One row per request/intervention; ``date_col`` holds date values
        (assumed comparable to ``today.date()`` — TODO confirm dtype).
    today : dt.datetime
        Reference date splitting past from future.
    date_col : str
        Name of the date column in ``df``.

    Returns
    -------
    go.Figure
        Line chart with past counts (blue), future counts (orange) and a
        dashed vertical marker at ``today``.
    """
    df_past = df[df[date_col] <= today.date()]
    df_future = df[df[date_col] > today.date()]

    # Daily counts for the past, reindexed so days with no rows show as 0.
    # After reset_index the date column is named 'index' and counts 'count'.
    count_past = (df_past
                  .groupby(date_col)
                  .size()
                  .rename('count')
                  .reset_index())
    past_date_range = pd.date_range(start=min(count_past[date_col]),
                                    end=today.date(),
                                    freq='D')
    count_past = (count_past
                  .set_index(date_col)
                  .reindex(past_date_range, fill_value=0)
                  .reset_index())

    if len(df_future) > 0:
        count_future = df_future.groupby(date_col).size().rename('count').reset_index()
        future_date_range = pd.date_range(start=today.date() + dt.timedelta(days=1),
                                          end=max(count_future[date_col]),
                                          freq='D')
        count_future = (count_future
                        .set_index(date_col)
                        .reindex(future_date_range, fill_value=0)
                        .reset_index())
    else:
        count_future = pd.DataFrame()

    # Prepend a bridge point at `today` so the future trace visually joins
    # the last past point. Bug fix: the count must go in the 'count' column
    # (it was previously written under 'form_date', producing a NaN gap).
    bridge_date = today.date()
    bridge_data = pd.DataFrame(
        {'index': bridge_date, 'count': count_past.iloc[-1]['count']}, index=[0])
    count_future = pd.concat([bridge_data, count_future], ignore_index=True)

    # Plot
    fig = go.Figure()
    # past
    fig.add_trace(go.Scatter(x=count_past['index'],
                             y=count_past['count'],
                             mode='lines',
                             name='Past Interventions',
                             line=dict(color='blue')))
    # future
    fig.add_trace(go.Scatter(x=count_future['index'],
                             y=count_future['count'],
                             mode='lines',
                             name='Future Interventions',
                             line=dict(color='orange')))

    fig.add_vline(x=today.date(), line_dash="dash", line_color="black")

    fig.update_layout(yaxis_title="#", xaxis_title='date')
    return fig


def classify_supplies_rule_based(text: str, keep_raw: bool = False):
    """Classify free text into supplies categories from SUPPLIES_TAGS
    using case-insensitive keyword (substring) matching.

    Parameters
    ----------
    text : str
        Free-text supplies description. (Bug fix: the annotation previously
        said ``pd.DataFrame``, but the function operates on a single string.)
    keep_raw : bool
        If True, also append the lowercased raw text to the result; in that
        case the 'OTHER' fallback is never added.

    Returns
    -------
    list
        De-duplicated list of matched categories, or ['OTHER'] when nothing
        matched and ``keep_raw`` is False. Order is unspecified (set-based).
    """
    lowercase_text = text.lower()  # case-insensitive matching

    classes = [category for keyword, category in SUPPLIES_TAGS.items()
               if keyword in lowercase_text]

    if keep_raw:
        classes.append(lowercase_text)
    elif not classes:
        classes.append('OTHER')

    return list(set(classes))


def classify_multilingual_field_e5(df: pd.DataFrame,
                      field_to_tag: str = 'supplies', 
                      categories: list = SUPPLIES_NEEDS_CATEGORIES):
    """
    Tag supplies/requests into categories using the multilingual-e5-large model.

    Each value of ``df[field_to_tag]`` is split into fragments (on '.', ','
    and the Arabic conjunction ' و'); every fragment is embedded and assigned
    the category with the highest cosine similarity. Returns a copy of ``df``
    with a new '<field_to_tag>_category' column holding the list of predicted
    categories per row.

    Requires CUDA (downloads the model on first use).
    """
    def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        # Mean-pool token embeddings, zeroing out padding positions first.
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')
    model.cuda()

    processed_df = df.copy()
    values_to_classify = processed_df[field_to_tag]

    mapped_inputs = dict()

    # Bug fix: the tokenize/embed/score steps below were previously OUTSIDE
    # this loop, so only the last text was ever classified and the final
    # column assignment length-mismatched for more than one row.
    for text in values_to_classify:
        gt = [f"{s}" for s in categories]
        qr = [f"{v}" for v in re.split(r"\.|,| و", text)]
        input_texts = qr + gt

        # Tokenize text fragments and category labels in one batch.
        batch_dict = tokenizer(
            input_texts, max_length=512, padding=True, truncation=True,
            return_tensors='pt')
        batch_dict = {k: v.cuda() for k, v in batch_dict.items()}

        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state,
                                  batch_dict['attention_mask'])

        # Normalize so the dot product below equals cosine similarity.
        embeddings = F.normalize(embeddings, p=2, dim=1)
        scores = (embeddings[:len(qr)] @ embeddings[len(qr):].T) * 100

        mapped_inputs[text] = list(
            set([categories[int(scores[i, :].argmax())] for i in range(len(qr))]))

    # Map per-text predictions back row by row (robust to duplicate texts,
    # which would otherwise collapse in the dict and misalign lengths).
    processed_df[f'{field_to_tag}_category'] = values_to_classify.map(mapped_inputs)

    return processed_df


def plot_categories_share(raw_df: pd.DataFrame, 
                          today: dt.datetime, 
                          field: str = 'supplies'):
    """
    Plot a pie chart of the share of each requests/supplies category.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Must contain ``field`` and ``'<field>_category'`` columns, the latter
        holding lists of categories (as produced by the classifiers above).
    today : dt.datetime
        Used only in the chart title.
    field : str
        Base column name, e.g. 'supplies'.

    Returns
    -------
    plotly figure.
    """
    # One row per (row, category) pair so each category is counted once.
    df = raw_df[[field, f'{field}_category']].explode(f'{field}_category')
    # Bug fix: with as_index=False, .size() returns a DataFrame and
    # DataFrame.rename('n') raises TypeError (a string is not a valid
    # mapper). Build a named count Series and reset the index instead.
    pie_data = (df.groupby(f'{field}_category')
                  .size()
                  .rename('n')
                  .reset_index())
    fig = px.pie(pie_data, 
                 names=f'{field}_category', 
                 values='n', 
                 title=f'# per {field} category up till {today.date()}',
                labels={f'{field}_category': f'{field}', 'n': '%'})
    return fig