File size: 5,025 Bytes
e58707f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
from pandas import DataFrame

from src.application.config import WORD_BREAK
from src.application.formatting import (
    color_text,
    format_entity_count,
)
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import extract_equal_text
from src.application.text.text import TextDetector


def create_governor_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
) -> str:
    """
    Build the HTML comparison table between the input news and its source.

    The table starts with one row for the image (built by
    `format_image_governor_row`) followed by one row for the text (built
    by `format_text_governor_row`).

    Side effect: for every row of `aligned_sentences_df` whose "input"
    value is not None, an entry
    `[row, equal_idx_input, equal_idx_source, row["entities"]]`
    is appended to `text.governor_table`. Existing entries are kept, so
    calling this function repeatedly with the same `text` accumulates
    entries.

    Args:
        aligned_sentences_df: Frame read through the columns "input",
            "source" and "entities" here (and "url" by the text row
            formatter).
        text: Text detector whose `governor_table` list is extended and
            then rendered.
        image: Image detector rendered as the first table row.

    Returns:
        str: HTML fragment made of a heading and the comparison table.
    """
    rows = [format_image_governor_row(image)]

    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            # Two distinct lists: a chained `a = b = []` would make both
            # names share one list object, so an in-place change to one
            # would silently show up in the other.
            equal_idx_1, equal_idx_2 = [], []
        else:
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        text.governor_table.append(
            [
                row,
                equal_idx_1,
                equal_idx_2,
                row["entities"],
            ],
        )

    rows.append(format_text_governor_row(text))

    table = "\n".join(rows)
    # NOTE(review): the markup below ends with an opening <style> tag that
    # is never closed. It is kept unchanged here - confirm whether it is
    # intentional before removing it.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>

<style>
        """


def format_text_governor_row(text):
    """
    Build the HTML table row that summarises the text analysis.

    Each entry of `text.governor_table` is read as
    `[row, equal_idx_input, equal_idx_source, entities]`. Entries whose
    "input" value is None are skipped. When a source sentence exists,
    both sentences are passed through `apply_highlight` and `color_text`;
    otherwise the input sentence is shown as-is next to an empty source
    cell.

    Args:
        text: Object exposing `governor_table`, `prediction_label` and
            `prediction_score`; only element 0 of the last two is shown.

    Returns:
        str: A `<tr>` element with four cells: input sentences, source
        sentences, prediction label/score with the entity count text, and
        the list of distinct source URLs.
    """
    input_parts = []
    source_parts = []
    url_links = []
    urls = []
    # Seeded with two zeros so that entity_count[-2] is valid from the
    # first iteration on.
    entity_count = [0, 0]
    for row in text.governor_table:
        sentence = row[0]
        if sentence["input"] is None:
            continue

        if sentence["source"] is not None:  # source is not empty
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                sentence["input"],
                row[3],  # entities_with_colors
                "input",  # key
                entity_count[-2],  # since the last one is for current counting
            )
            source_sentence, highlight_idx_source = apply_highlight(
                sentence["source"],
                row[3],  # entities_with_colors
                "source",  # key
                entity_count[-2],  # since the last one is for current counting
            )

            # Color overlapping words
            input_sentence = color_text(
                input_sentence,
                row[1],
                highlight_idx_input,
            )  # text, index of highlight words
            source_sentence = color_text(
                source_sentence,
                row[2],
                highlight_idx_source,
            )  # text, index of highlight words

            # Turn the underscore placeholders back into real spaces.
            input_sentence = input_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")
            source_sentence = source_sentence.replace(
                "span_style",
                "span style",
            ).replace("1px_4px", "1px 4px")

        else:
            # No source sentence: nothing to highlight or compare against.
            input_sentence = sentence["input"]
            source_sentence = ""

        input_parts.append(input_sentence + "<br><br>")
        source_parts.append(source_sentence + "<br><br>")

        # List each URL once; entity counts are only recorded together
        # with a newly seen URL.
        url = sentence["url"]
        if url not in urls:
            urls.append(url)
            url_links.append(f"""<a href="{url}">{url}</a><br><br>""")
            if row[3] is not None:
                entity_count.append(len(row[3]))

    input_sentences = "".join(input_parts)
    source_sentences = "".join(source_parts)
    source_text_urls = "".join(url_links)

    entity_count_text = format_entity_count(sum(entity_count))
    return f"""
<tr>
    <td>{input_sentences}</td>
    <td>{source_sentences}</td>
    <td>{text.prediction_label[0]}<br>
        ({text.prediction_score[0] * 100:.2f}%)<br><br>
        {entity_count_text}</td>
    <td style="{WORD_BREAK}";>{source_text_urls}</td>
</tr>
"""


def format_image_governor_row(image):
    """
    Build the HTML table row that summarises the image analysis.

    Args:
        image: Object exposing `referent_url`, `prediction_label` and
            `prediction_score`.

    Returns:
        str: A `<tr>` element with four cells: the fixed label
        "input image", the referent image (or "Image not found"), the
        prediction label with its score, and a link to the referent URL
        (empty when there is none).
    """
    referent_url = image.referent_url
    # Both conditions must hold. Joined with `or` the test was always
    # true (None passes `!= ""`, everything else passes `is not None`),
    # so the "Image not found" fallback could never be reached.
    if referent_url is not None and referent_url != "":
        source_image = (
            f"""<img src="{referent_url}" width="100" height="150">"""
        )
        source_image_url = f"""<a href="{referent_url}">{referent_url}</a>"""
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
<tr>
    <td>input image</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}";>{source_image_url}</td>
</tr>"""