"""Evidence localization & highlighting helpers.

This module provides one primary public entry point:

    highlight_html_with_evidence(html_text: str, evidence: str, *,
                                 n: int = 3, overlap_threshold: float = 0.5) -> str

It will:

1. Extract quoted segments from *evidence* – if any exist they are treated as
   exact strings that must appear verbatim in *html_text* (case-insensitive).
2. If the evidence string had **no quotes**, we first try an exact match of
   the raw evidence text.  When that fails, we fall back to an n-gram overlap
   heuristic (default n = 3): the window in *html_text* containing at least
   *overlap_threshold* of the evidence's n-grams is considered a match.
3. All matched character spans are wrapped in ``<mark>`` tags.

The helper is HTML-agnostic – it simply operates on the raw string.  That
means it may occasionally highlight inside an HTML attribute if the evidence
happens to occur there, but for our Gradio viewer the relevant text lives in
normal content nodes, so this trade-off keeps the implementation lightweight.

No try/excepts are used, in accordance with user guidelines; we prefer clear
errors.
"""

from __future__ import annotations

import re
from typing import List, Tuple

__all__ = [
    "localize_evidence",
    "highlight_html_with_evidence",
]

# ---------------------------------------------------------------------------
# Internal helpers -----------------------------------------------------------
# ---------------------------------------------------------------------------


def _extract_targets(evidence: str) -> List[str]:
    """Return the substrings extracted from *evidence* that we need to look for.

    1. If there are quoted regions – e.g. ``"foo"`` – each quoted region is
       returned separately **without** the quotes.
    2. Otherwise we return the full evidence string (stripped of whitespace).
    """
    if not evidence or evidence.strip() == "":
        return []
    # Pull out "quoted" substrings (handles escaped quotes inside them).
    quoted = re.findall(r'"([^"\\]*(?:\\.[^"\\]*)*)"', evidence)
    return quoted if quoted else [evidence.strip()]


def _tokenize(text: str) -> List[str]:
    """A very small tokenizer – splits on word boundaries."""
    return re.findall(r"\b\w+\b", text.lower())


def _ngrams(tokens: List[str], n: int) -> List[str]:
    return [" ".join(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]


# ---------------------------------------------------------------------------
# Public localisation logic --------------------------------------------------
# ---------------------------------------------------------------------------


def localize_evidence(
    text: str,
    evidence: str,
    *,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> List[Tuple[int, int]]:
    """Return a list of (start, end) indices where *evidence* occurs in *text*.

    If *evidence* contains quotes, we treat the quoted substrings as exact
    matches (case-insensitive).  When no exact match is found, we apply a
    simple n-gram overlap to approximate the location: among windows whose
    overlap is ≥ *overlap_threshold*, the one with the highest overlap is
    returned.  Only a single window is selected in that fuzzy path to keep
    things deterministic.
    """
    if not evidence or evidence in {"N/A", "None"}:
        return []

    matches: List[Tuple[int, int]] = []
    targets = _extract_targets(evidence)
    if not targets:  # evidence was only whitespace
        return []

    # ------------------------------------------------------------------
    # 1. Exact search for each target (quoted or the raw evidence string)
    # ------------------------------------------------------------------
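    # Illustrative sketch (hypothetical inputs): for
    #   evidence = 'the filing states "net revenue grew"'
    # _extract_targets returns ["net revenue grew"], so the loop below records
    # the span of every case-insensitive occurrence of that exact phrase in
    # *text*.  For unquoted evidence the single target is the whole string.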
    lowered_text = text.lower()
    for target in targets:
        lowered_target = target.lower()
        for m in re.finditer(re.escape(lowered_target), lowered_text):
            matches.append(m.span())

    if matches:
        return _merge_overlaps(matches)

    # ---------------------------------------------------------------
    # 2. Fuzzy n-gram overlap when no exact substring match was found
    #    above (whether or not *evidence* contained quotes).
    # ---------------------------------------------------------------
    raw_target = targets[0]  # first quoted string, or the full evidence text
    evid_tokens = _tokenize(raw_target)
    if not evid_tokens:
        return []

    n = min(n, len(evid_tokens))
    target_ngrams = set(_ngrams(evid_tokens, n))
    if not target_ngrams:
        return []

    # Tokenise *text* and keep the char offset of each token's start.
    token_spans: List[Tuple[str, int]] = [
        (m.group().lower(), m.start()) for m in re.finditer(r"\b\w+\b", text)
    ]
    if not token_spans:
        return []

    tokens_only = [tok for tok, _ in token_spans]
    window_size = len(evid_tokens)

    best_overlap = 0.0
    best_span: Tuple[int, int] | None = None
    for i in range(len(tokens_only) - window_size + 1):
        window_tokens = tokens_only[i : i + window_size]
        window_ngrams = set(_ngrams(window_tokens, n))
        if not window_ngrams:
            continue
        # Fraction of the evidence's n-grams that appear in this window.
        overlap = len(window_ngrams & target_ngrams) / len(target_ngrams)
        if overlap >= overlap_threshold and overlap > best_overlap:
            start_char = token_spans[i][1]
            last_tok, last_start = token_spans[i + window_size - 1]
            end_char = last_start + len(last_tok)
            best_overlap = overlap
            best_span = (start_char, end_char)

    if best_span is not None:
        matches.append(best_span)

    return _merge_overlaps(matches)


# ---------------------------------------------------------------------------
# Highlighting ---------------------------------------------------------------
# ---------------------------------------------------------------------------


def _merge_overlaps(spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping/adjacent spans."""
    if not spans:
        return []
    spans = sorted(spans, key=lambda x: x[0])
    merged = [spans[0]]
    for start, end in spans[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:  # overlapping or touching
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged


def _insert_tags(text: str, spans: List[Tuple[int, int]], tag: str = "mark") -> str:
    """Return *text* with each span wrapped in ``<tag>`` .. ``</tag>``.

    Assumes *spans* are non-overlapping **and sorted ascending**.
    """
    if not spans:
        return text
    parts: List[str] = []
    last_idx = 0
    for start, end in spans:
        parts.append(text[last_idx:start])
        parts.append(f"<{tag}>" + text[start:end] + f"</{tag}>")
        last_idx = end
    parts.append(text[last_idx:])
    return "".join(parts)


def highlight_html_with_evidence(
    html_text: str,
    evidence: str,
    *,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> str:
    """Return *html_text* with occurrences of *evidence* wrapped in ``<mark>`` tags."""
    if not evidence or evidence in {"N/A", "None"}:
        return html_text

    # Work on the raw HTML string directly – obtain the spans first.
    spans = localize_evidence(html_text, evidence, n=n, overlap_threshold=overlap_threshold)
    if not spans:
        return html_text  # nothing to highlight

    highlighted = _insert_tags(html_text, spans, tag="mark")

    # Inject tiny CSS once – the id attribute prevents duplicates.  The exact
    # rules are cosmetic; this is a representative yellow-highlight style.
    style_block = (
        '<style id="evidence-highlight-style">'
        "mark { background-color: #fff3b0; padding: 0 2px; }"
        "</style>\n"
    )
    if "evidence-highlight-style" not in html_text:
        highlighted = style_block + highlighted

    return highlighted
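

# ---------------------------------------------------------------------------
# Example usage ---------------------------------------------------------------
# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of both public functions.  The sample HTML
# and evidence strings below are invented for illustration only.

if __name__ == "__main__":
    sample = "<p>The quick brown fox jumps over the lazy dog.</p>"

    # Quoted evidence -> exact, case-insensitive matching of the quoted part;
    # the output is the sample HTML with the phrase wrapped in <mark> tags,
    # prefixed by the injected <style> block.
    print(highlight_html_with_evidence(sample, 'witness said "QUICK BROWN FOX"'))

    # Unquoted, paraphrased evidence ("leaped" vs "jumps") -> no exact match,
    # so the n-gram overlap heuristic picks the best-overlapping token window
    # and returns its character span.
    print(localize_evidence(sample, "quick brown fox leaped over the lazy dog"))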