Spaces:

pmkhanh7890
/

news_verification

Sleeping

File size: 7,093 Bytes

e58707f

from pandas import DataFrame

from src.application.config import WORD_BREAK
from src.application.formatting import (
    color_text,
    format_entity_count,
)
from src.application.image.image import ImageDetector
from src.application.text.entity import apply_highlight
from src.application.text.helper import extract_equal_text
from src.application.text.text import TextDetector


def create_fact_checker_table(
    aligned_sentences_df: DataFrame,
    text: TextDetector,
    image: ImageDetector,
):
    """
    Build the HTML table comparing the input news with its sources.

    The first table row describes the image. After that, one entry per
    aligned sentence pair is appended to ``text.fact_checker_table`` and
    every entry of that list is rendered as a table row. Consecutive
    entries that share the same URL form a group: the first row of the
    group carries the "Forensic" and "Originality" cells with a
    ``rowspan`` covering the whole group.

    Args:
        aligned_sentences_df: Rows providing the "input", "source",
            "entities" and "url" values read below.
        text: Text detector; its ``fact_checker_table`` list is extended
            in place and then rendered.
        image: Image detector used for the image row.

    Returns:
        str: HTML made of a heading followed by the comparison table.
    """
    rows = [format_image_fact_checker_row(image)]

    for _, row in aligned_sentences_df.iterrows():
        if row["input"] is None:
            continue

        if row["source"] is None:
            # Two distinct lists, so the input/source indices never alias.
            equal_idx_1, equal_idx_2 = [], []
        else:  # Get index of equal phrases in input and source sentences
            equal_idx_1, equal_idx_2 = extract_equal_text(
                row["input"],
                row["source"],
            )

        # NOTE(review): the list is appended to, never cleared here, so
        # entries already present on `text` are rendered as well - confirm
        # that callers reset it between runs.
        text.fact_checker_table.append(
            [
                row,  # aligned_sentences_df
                equal_idx_1,  # index of equal text in input
                equal_idx_2,  # index of equal text in source
                row["entities"],
                row["url"],
            ],
        )

    table_rows = text.fact_checker_table
    previous_url = None
    span_row = 1
    for index, row in enumerate(table_rows):
        current_url = row[4]

        # First row or URL change: a new group of rows starts here.
        if index == 0 or current_url != previous_url:
            first_url_row = True
            previous_url = current_url
            # Count how many consecutive rows (this one included) share
            # the URL; that count is the rowspan of the merged cells.
            span_row = 1
            while (
                index + span_row < len(table_rows)
                and table_rows[index + span_row][4] == current_url
            ):
                span_row += 1
        else:
            first_url_row = False
            # Rows remaining in the current group, this one included.
            span_row -= 1

        # Exactly one row left in the group means this is its last row.
        last_url_row = span_row == 1

        rows.append(
            format_text_fact_checker_row(
                text,
                row,
                first_url_row,
                last_url_row,
                span_row,
            ),
        )

    table = "\n".join(rows)
    # The markup previously ended with an unclosed <style> tag, which makes
    # any HTML rendered after this fragment be parsed as CSS; it is omitted.
    return f"""
<h5>Comparison between input news and source news:</h5>
<table border="1" style="width:100%; text-align:left;">
<col style="width: 170px;">
<col style="width: 170px;">
<col style="width: 30px;">
<col style="width: 75px;">
    <thead>
        <tr>
            <th>Input news</th>
            <th>Source (URL in Originality)</th>
            <th>Forensic</th>
            <th>Originality</th>
        </tr>
    </thead>
    <tbody>
        {table}
    </tbody>
</table>
"""


def format_text_fact_checker_row(
    text: TextDetector,
    row: list,
    first_url_row: bool = True,
    last_url_row: bool = True,
    span_row: int = 1,
):
    """
    Render one sentence pair as an HTML ``<tr>`` of the fact-checker table.

    Args:
        text: Text detector providing ``grouped_url_df`` (looked up by
            "url" for "label" and "score") and the fallback
            ``prediction_label`` / ``prediction_score``.
        row: Entry of ``text.fact_checker_table``:
            ``[aligned_row, equal_idx_input, equal_idx_source, entities,
            url]``.
        first_url_row: Whether this is the first row of a group of
            consecutive rows sharing one URL. Only such rows carry the
            label/score and URL cells.
        last_url_row: Whether this is the last row of its URL group.
        span_row: ``rowspan`` applied to the label/score and URL cells.

    Returns:
        str: The ``<tr>`` markup, or an empty string when the row has no
        input sentence.
    """
    aligned = row[0]
    if aligned["input"] is None:
        return ""

    entity_count = 0
    if aligned["source"] is not None:  # source is not empty
        entities = row[3]
        if entities is not None:
            entity_count = len(entities)
            # highlight entities
            input_sentence, highlight_idx_input = apply_highlight(
                aligned["input"],
                entities,
                "input",
            )
            source_sentence, highlight_idx_source = apply_highlight(
                aligned["source"],
                entities,
                "source",
            )
        else:
            input_sentence = aligned["input"]
            source_sentence = aligned["source"]
            highlight_idx_input = []
            highlight_idx_source = []

        # Color overlapping words
        input_sentence = color_text(
            input_sentence,
            row[1],
            highlight_idx_input,
        )  # text, index of highlight words
        source_sentence = color_text(
            source_sentence,
            row[2],
            highlight_idx_source,
        )  # text, index of highlight words

        # Replace _ to get correct formatting
        # Original one having _ for correct word counting
        input_sentence = input_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
        source_sentence = source_sentence.replace(
            "span_style",
            "span style",
        ).replace("1px_4px", "1px 4px")
    else:
        input_sentence = aligned["input"]
        # NOTE(review): source is None here, so the cell shows the literal
        # text "None" - confirm whether an empty cell is wanted instead.
        source_sentence = aligned["source"]

    border_top = "border-top: 1px solid transparent;"
    border_bottom = "border-bottom: 1px solid transparent;"

    if not first_url_row:
        # Continuation rows only hold the two sentence cells; the other
        # columns are covered by the rowspan of the group's first row.
        # Last row: transparent top border; middle row: top and bottom.
        if last_url_row:
            borders = border_top
        else:
            borders = f"{border_top} {border_bottom}"
        return f"""
<tr>
    <td style="{borders}">{input_sentence}</td>
    <td style="{borders}">{source_sentence}</td>
</tr>
"""

    url = aligned["url"]

    # Displayed label and score by url
    filterby_url = text.grouped_url_df[text.grouped_url_df["url"] == url]
    if len(filterby_url) > 0:
        label = filterby_url["label"].values[0]
        score = filterby_url["score"].values[0]
    else:
        label = text.prediction_label[0]
        score = text.prediction_score[0]

    # Format displayed url
    source_text_url = f"""<a href="{url}">{url}</a>"""

    # Format displayed entity count
    entity_count_text = format_entity_count(entity_count)

    # A single-row group keeps its borders; otherwise the first row hides
    # its bottom border so the group reads as one block.
    cell_style = "" if last_url_row else f' style="{border_bottom}"'
    return f"""
<tr>
    <td{cell_style}>{input_sentence}</td>
    <td{cell_style}>{source_sentence}</td>
    <td rowspan="{span_row}">{label}<br>
    ({score * 100:.2f}%)<br><br>
    {entity_count_text}</td>
    <td rowspan="{span_row}" style="{WORD_BREAK}">{source_text_url}</td>
</tr>
"""


def format_image_fact_checker_row(image):
    """
    Render the image result as an HTML ``<tr>`` of the fact-checker table.

    Args:
        image: Object exposing ``referent_url``, ``prediction_label`` and
            ``prediction_score``.

    Returns:
        str: A ``<tr>`` with the source image (or "Image not found" when
        ``referent_url`` is ``None`` or empty), the predicted label with
        its score, and a link to the source image.
    """
    # The previous check joined "is not None" and '!= ""' with `or`, which
    # is true for every value, so the fallback branch was unreachable.
    if image.referent_url:
        source_image = f"""<img src="{image.referent_url}" width="100" height="150">"""  # noqa: E501
        source_image_url = f"""<a href="{image.referent_url}">{image.referent_url}</a>"""  # noqa: E501
    else:
        source_image = "Image not found"
        source_image_url = ""

    return f"""
<tr>
    <td>input image</td>
    <td>{source_image}</td>
    <td>{image.prediction_label}<br>({image.prediction_score:.2f}%)</td>
    <td style="{WORD_BREAK}">{source_image_url}</td>
</tr>
"""