# Spaces: pmkhanh7890 / news_verification
# Sleeping
# File size: 5,025 Bytes
# e58707f

from pandas import DataFrame

from src.application.config import WORD_BREAK
from src.application.formatting import (
    color_text,
    format_entity_count,
)
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import extract_equal_text
from src.application.text.text import TextDetector


def create_governor_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
) -> str:
    """Render the governor comparison table as an HTML fragment.

    The table has one row for the image (built by
    ``format_image_governor_row``) followed by one row for the text (built by
    ``format_text_governor_row``).

    For every row of ``aligned_sentences_df`` whose ``"input"`` is not
    ``None``, an entry ``[row, equal_idx_input, equal_idx_source,
    row["entities"]]`` is appended to ``text.governor_table``; the text row is
    then rendered from that list.

    Args:
        aligned_sentences_df: Frame whose rows are read through the keys
            ``"input"``, ``"source"`` and ``"entities"`` here.
        text: Detector whose ``governor_table`` list is extended in place.
        image: Detector passed through to ``format_image_governor_row``.

    Returns:
        An HTML string containing a heading and the ``<table>``.

    NOTE(review): ``text.governor_table`` is appended to but never cleared in
    this function, so calling it twice with the same ``text`` would duplicate
    entries unless the caller resets the list -- confirm against the caller.
    """
    rows = [format_image_governor_row(image)]

    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            # Two distinct lists: binding both names to one shared list would
            # let a later in-place change to one silently affect the other.
            equal_idx_1, equal_idx_2 = [], []
        else:
            # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        text.governor_table.append(
            [
                row,
                equal_idx_1,
                equal_idx_2,
                row["entities"],
            ],
        )

    rows.append(format_text_governor_row(text))

    table = "\n".join(rows)
    # The fragment ends right after </table>: a trailing, never-closed <style>
    # tag would make an HTML parser treat any markup placed after this
    # fragment as CSS text.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""


def _restore_span_markup(html: str) -> str:
    """Turn the ``span_style`` / ``1px_4px`` placeholders back into markup."""
    # Presumably the placeholders keep the highlight tags free of spaces while
    # the text is processed word by word -- verify against apply_highlight.
    return html.replace("span_style", "span style").replace(
        "1px_4px",
        "1px 4px",
    )


def format_text_governor_row(text) -> str:
    """Build the HTML ``<tr>`` summarising all aligned sentences.

    Each entry of ``text.governor_table`` is read positionally:
    ``entry[0]`` is indexed with ``"input"``, ``"source"`` and ``"url"``;
    ``entry[1]`` / ``entry[2]`` are handed to ``color_text`` for the input and
    source sentence respectively; ``entry[3]`` holds the entities handed to
    ``apply_highlight``.

    Entries without an input sentence are skipped. Entries without a source
    sentence contribute the raw input text and an empty source cell. Each
    distinct URL is linked once, in first-seen order; a missing (``None``) URL
    produces no link.

    Args:
        text: Object exposing ``governor_table``, ``prediction_label`` and
            ``prediction_score`` (the latter two are indexed with ``[0]``).

    Returns:
        The ``<tr>...</tr>`` markup for the text row.
    """
    input_parts = []
    source_parts = []
    link_parts = []
    urls = []
    # Seeded with two zeros so that entity_count[-2] exists on the first pass.
    entity_count = [0, 0]

    for row in text.governor_table:
        aligned = row[0]
        if aligned["input"] is None:
            continue

        if aligned["source"] is not None:
            # The second-to-last recorded count is used as the offset because
            # the last one belongs to the entry currently being counted.
            entity_offset = entity_count[-2]

            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                aligned["input"],
                row[3],  # entities_with_colors
                "input",  # key
                entity_offset,
            )
            source_sentence, highlight_idx_source = apply_highlight(
                aligned["source"],
                row[3],  # entities_with_colors
                "source",  # key
                entity_offset,
            )

            # Color overlapping words
            input_sentence = color_text(
                input_sentence,
                row[1],
                highlight_idx_input,
            )  # text, index of highlight words
            source_sentence = color_text(
                source_sentence,
                row[2],
                highlight_idx_source,
            )  # text, index of highlight words

            input_sentence = _restore_span_markup(input_sentence)
            source_sentence = _restore_span_markup(source_sentence)
        else:
            # No source found: show the input untouched next to an empty cell.
            input_sentence = aligned["input"]
            source_sentence = ""

        input_parts.append(input_sentence)
        source_parts.append(source_sentence)

        url = aligned["url"]
        if url not in urls:
            urls.append(url)
            if url is not None:
                # A missing URL must not be rendered as a link to "None".
                link_parts.append(f"""<a href="{url}">{url}</a><br><br>""")
            # Entities are only counted for the first entry of each URL.
            if row[3] is not None:
                entity_count.append(len(row[3]))

    input_sentences = "".join(part + "<br><br>" for part in input_parts)
    source_sentences = "".join(part + "<br><br>" for part in source_parts)
    source_text_urls = "".join(link_parts)

    entity_count_text = format_entity_count(sum(entity_count))
    return f"""
<tr>
    <td>{input_sentences}</td>
    <td>{source_sentences}</td>
    <td>{text.prediction_label[0]}<br>
        ({text.prediction_score[0] * 100:.2f}%)<br><br>
        {entity_count_text}</td>
    <td style="{WORD_BREAK}">{source_text_urls}</td>
</tr>
"""


def format_image_governor_row(image) -> str:
    """Build the HTML ``<tr>`` describing the input image.

    Args:
        image: Object exposing ``referent_url``, ``prediction_label`` and
            ``prediction_score``.

    Returns:
        The ``<tr>...</tr>`` markup. When ``referent_url`` is ``None`` or an
        empty string the source cell reads "Image not found" and the link
        cell is empty; otherwise the source image and a link to it are shown.

    NOTE(review): the score is printed as-is with a ``%`` sign, so it is
    presumably already on a 0-100 scale -- confirm against the detector.
    """
    referent_url = image.referent_url
    # Both checks must hold. Joined with ``or`` the test is true for every
    # value (None != "" and "" is not None), so the fallback never ran.
    if referent_url is not None and referent_url != "":
        source_image = f"""<img src="{referent_url}" width="100" height="150">"""  # noqa: E501
        source_image_url = f"""<a href="{referent_url}">{referent_url}</a>"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
<tr>
    <td>input image</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>"""