File size: 7,093 Bytes
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import html

from pandas import DataFrame

from src.application.config import WORD_BREAK
from src.application.formatting import (
    color_text,
    format_entity_count,
)
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import extract_equal_text
from src.application.text.text import TextDetector


def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """Build the HTML comparison table between input news and source news.

    The first body row describes the image (see
    ``format_image_fact_checker_row``). Every following row is one aligned
    sentence pair; consecutive rows that share the same URL are rendered as
    one visual group whose "Forensic" and "Originality" cells span the group.

    Args:
        aligned_sentences_df: Frame read through the columns ``input``,
            ``source``, ``entities`` and ``url``. Rows whose ``input`` is
            ``None`` are skipped.
        text: Text detector. One entry per kept sentence is appended to
            ``text.fact_checker_table`` and later read back for formatting.
        image: Image detector forwarded to the image-row formatter.

    Returns:
        The table as an HTML string.
    """
    rows = [format_image_fact_checker_row(image)]

    # NOTE(review): entries are appended to the table already held by `text`,
    # so calling this twice with the same detector repeats the rows - confirm
    # the caller starts from an empty `fact_checker_table`.
    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            # Two distinct lists: the original `a = b = []` stored one shared
            # list object under both slots of the table entry.
            equal_idx_1, equal_idx_2 = [], []
        else:  # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        text.fact_checker_table.append(
            [
                row,  # aligned_sentences_df
                equal_idx_1,  # index of equal text in input
                equal_idx_2,  # index of equal text in source
                row["entities"],
                row["url"],
            ],
        )

    table_rows = text.fact_checker_table
    total_rows = len(table_rows)
    previous_url = None
    span_row = 1
    for index, row in enumerate(table_rows):
        current_url = row[4]

        # A group starts on the very first row or whenever the URL changes.
        first_url_row = index == 0 or current_url != previous_url
        if first_url_row:
            previous_url = current_url
            # Count the rows of this group: restart at 1 and extend while the
            # following rows keep the same URL.
            span_row = 1
            while (
                index + span_row < total_rows
                and table_rows[index + span_row][4] == current_url
            ):
                span_row += 1
        else:
            # Inside a group the counter holds the rows still left in it.
            span_row -= 1

        # Exactly one row left in the group means this row closes it.
        last_url_row = span_row == 1

        rows.append(
            format_text_fact_checker_row(
                text,
                row,
                first_url_row,
                last_url_row,
                span_row,
            ),
        )

    table = "\n".join(rows)
    # The original template ended with a dangling, never-closed "<style>" tag
    # after "</table>"; it is dropped so the fragment is well-formed HTML.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""


def _restore_span_markup(sentence: str) -> str:
    """Replace the underscore placeholders with the real inline-style text."""
    # Replace _ to get correct formatting
    # Original one having _ for correct word counting
    return sentence.replace("span_style", "span style").replace(
        "1px_4px",
        "1px 4px",
    )


def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """Render one aligned sentence pair as an HTML ``<tr>``.

    Args:
        text: Text detector; ``grouped_url_df`` (columns ``url``, ``label``,
            ``score``) is searched for the row's URL, with
            ``prediction_label[0]`` / ``prediction_score[0]`` as fallback.
        row: Table entry laid out as ``[aligned_row, equal_idx_input,
            equal_idx_source, entities, url]``.
        first_url_row: Whether this row opens a group of rows sharing a URL.
            Only such rows carry the "Forensic" and "Originality" cells.
        last_url_row: Whether this row closes its URL group.
        span_row: ``rowspan`` used for the group-wide cells.

    Returns:
        The HTML of the row, or ``""`` when the input sentence is ``None``.
    """
    aligned = row[0]
    if aligned["input"] is None:
        return ""

    entities = row[3]
    entity_count = 0
    if aligned["source"] is not None:  # source is not empty
        if entities is not None:
            entity_count = len(entities)
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                aligned["input"],
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                aligned["source"],
                entities,
                "source",
            )
        else:
            input_sentence = aligned["input"]
            source_sentence = aligned["source"]
            highlight_idx_input = []
            highlight_idx_source = []

        # Color overlapping words (text, index of equal words, highlights)
        input_sentence = _restore_span_markup(
            color_text(input_sentence, row[1], highlight_idx_input),
        )
        source_sentence = _restore_span_markup(
            color_text(source_sentence, row[2], highlight_idx_source),
        )
    else:
        input_sentence = aligned["input"]
        # A missing source used to be interpolated as the literal text "None".
        source_sentence = ""

    url = aligned["url"]

    # Displayed label and score by url
    filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
    if len(filterby_url) > 0:
        label = filterby_url["label"].values[0]
        score = filterby_url["score"].values[0]
    else:
        label = text.prediction_label[0]
        score = text.prediction_score[0]

    # Format displayed url. The URL comes from external data, so it is escaped
    # before being placed in an attribute and in the link text. A missing URL
    # yields an empty cell instead of a link pointing at "None".
    if isinstance(url, str) and url != "":
        safe_url = html.escape(url, quote=True)
        source_text_url = f"""<a href="{safe_url}">{safe_url}</a>"""
    else:
        source_text_url = ""

    # Format displayed entity count
    entity_count_text = format_entity_count(entity_count)

    # Borders between rows of the same URL group are made transparent so the
    # group reads as one block: hide the top border unless the row opens the
    # group, and the bottom border unless it closes the group.
    hidden_borders = []
    if not first_url_row:
        hidden_borders.append("border-top: 1px solid transparent;")
    if not last_url_row:
        hidden_borders.append("border-bottom: 1px solid transparent;")
    sentence_attr = (
        f' style="{" ".join(hidden_borders)}"' if hidden_borders else ""
    )

    # Only the first row of a group carries the cells that span the group.
    group_cells = ""
    if first_url_row:
        group_cells = f"""
    <td rowspan="{span_row}">{label}<br>
    ({score * 100:.2f}%)<br><br>
    {entity_count_text}</td>
    <td rowspan="{span_row}" style="{WORD_BREAK}">{source_text_url}</td>"""

    return f"""
<tr>
    <td{sentence_attr}>{input_sentence}</td>
    <td{sentence_attr}>{source_sentence}</td>{group_cells}
</tr>
"""


def format_image_fact_checker_row(image):
    """Render the image row of the fact-checker table as an HTML ``<tr>``.

    Args:
        image: Object exposing ``referent_url``, ``prediction_label`` and
            ``prediction_score``.

    Returns:
        The HTML of the row. When ``referent_url`` is ``None`` or empty the
        source cell reads "Image not found" and the URL cell is empty.
    """
    referent_url = image.referent_url
    # The original test used `or`, which is true for every value (None is
    # != ""), so the "Image not found" branch could never run.
    if referent_url is not None and referent_url != "":
        # The URL comes from external data: escape it before using it in
        # attributes and as link text.
        safe_url = html.escape(str(referent_url), quote=True)
        source_image = f"""<img src="{safe_url}" width="100" height="150">"""
        source_image_url = f"""<a href="{safe_url}">{safe_url}</a>"""
    else:
        source_image = "Image not found"
        source_image_url = ""

    # NOTE(review): the score is printed as-is with a "%" sign, whereas the
    # text rows multiply their score by 100 - confirm the expected scale.
    return f"""
<tr>
    <td>input image</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>
"""