"""Evidence localization & highlighting helpers.

This module provides one primary public entry point:

    highlight_html_with_evidence(html_text: str, evidence: str, *,
                                 n: int = 3, overlap_threshold: float = 0.5) -> str

It will:

1. Extract quoted segments from *evidence* – if any exist they are treated as
   exact strings that must appear verbatim in *html_text* (case-insensitive).
2. If the evidence string had **no quotes**, we first try an exact match of
   the raw evidence text.  When that fails, we fall back to an n-gram overlap
   heuristic (default n = 3): the window in *html_text* containing at least
   *overlap_threshold* of the evidence's n-grams is considered a match.
3. All matched character spans are wrapped in ``<mark>`` tags.

The helper is HTML-agnostic – it simply operates on the raw string.  That
means it may occasionally highlight inside an HTML attribute if the evidence
happens to occur there, but for our Gradio viewer the relevant text lives in
normal content nodes, so this trade-off keeps the implementation lightweight.

No try/excepts are used, in accordance with user guidelines; we prefer clear
errors.
"""

from __future__ import annotations

import re
from typing import List, Tuple

__all__ = [
    "localize_evidence",
    "highlight_html_with_evidence",
]

# ---------------------------------------------------------------------------
# Internal helpers -----------------------------------------------------------
# ---------------------------------------------------------------------------


def _extract_targets(evidence: str) -> List[str]:
    """Return the substrings extracted from *evidence* that we need to look for.

    1. If there are quoted regions – e.g. ``"foo"`` – each quoted region is
       returned separately **without** the quotes.
    2. Otherwise we return the full evidence string (stripped of whitespace).
    """
    if not evidence or evidence.strip() == "":
        return []
    # Pull out "quoted" substrings (handles escaped quotes inside them).
    quoted = re.findall(r'"([^"\\]*(?:\\.[^"\\]*)*)"', evidence)
    return quoted if quoted else [evidence.strip()]


def _tokenize(text: str) -> List[str]:
    """A very small tokenizer – splits on word boundaries."""
    return re.findall(r"\b\w+\b", text.lower())


def _ngrams(tokens: List[str], n: int) -> List[str]:
    return [" ".join(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]


# ---------------------------------------------------------------------------
# Public localisation logic --------------------------------------------------
# ---------------------------------------------------------------------------


def localize_evidence(
    text: str,
    evidence: str,
    *,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> List[Tuple[int, int]]:
    """Return a list of (start, end) indices where *evidence* occurs in *text*.

    If *evidence* contains quotes, we treat the quoted substrings as exact
    matches (case-insensitive).  When no exact match is found, we apply a
    simple n-gram overlap to approximate the location: among windows whose
    overlap is ≥ *overlap_threshold*, the one with the highest overlap is
    returned.  Only a single window is selected in that fuzzy path to keep
    things deterministic.
    """
    if not evidence or evidence in {"N/A", "None"}:
        return []

    matches: List[Tuple[int, int]] = []
    targets = _extract_targets(evidence)
    if not targets:  # evidence was only whitespace
        return []

    # ------------------------------------------------------------------
    # 1. Exact search for each target (quoted or the raw evidence string)
    # ------------------------------------------------------------------
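    # Illustrative sketch (hypothetical inputs): for
    #   evidence = 'the filing states "net revenue grew"'
    # _extract_targets returns ["net revenue grew"], so the loop below records
    # the span of every case-insensitive occurrence of that exact phrase in
    # *text*.  For unquoted evidence the single target is the whole string.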
    lowered_text = text.lower()
    for target in targets:
        lowered_target = target.lower()
        for m in re.finditer(re.escape(lowered_target), lowered_text):
            matches.append(m.span())

    if matches:
        return _merge_overlaps(matches)

    # ---------------------------------------------------------------
    # 2. Fuzzy n-gram overlap when no exact substring match was found
    #    above (whether or not *evidence* contained quotes).
    # ---------------------------------------------------------------
    raw_target = targets[0]  # first quoted string, or the full evidence text
    evid_tokens = _tokenize(raw_target)
    if not evid_tokens:
        return []

    n = min(n, len(evid_tokens))
    target_ngrams = set(_ngrams(evid_tokens, n))
    if not target_ngrams:
        return []

    # Tokenise *text* and keep the char offset of each token's start.
    token_spans: List[Tuple[str, int]] = [
        (m.group().lower(), m.start()) for m in re.finditer(r"\b\w+\b", text)
    ]
    if not token_spans:
        return []

    tokens_only = [tok for tok, _ in token_spans]
    window_size = len(evid_tokens)

    best_overlap = 0.0
    best_span: Tuple[int, int] | None = None
    for i in range(len(tokens_only) - window_size + 1):
        window_tokens = tokens_only[i : i + window_size]
        window_ngrams = set(_ngrams(window_tokens, n))
        if not window_ngrams:
            continue
        # Fraction of the evidence's n-grams that appear in this window.
        overlap = len(window_ngrams & target_ngrams) / len(target_ngrams)
        if overlap >= overlap_threshold and overlap > best_overlap:
            start_char = token_spans[i][1]
            last_tok, last_start = token_spans[i + window_size - 1]
            end_char = last_start + len(last_tok)
            best_overlap = overlap
            best_span = (start_char, end_char)

    if best_span is not None:
        matches.append(best_span)

    return _merge_overlaps(matches)


# ---------------------------------------------------------------------------
# Highlighting ---------------------------------------------------------------
# ---------------------------------------------------------------------------


def _merge_overlaps(spans: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Merge overlapping/adjacent spans."""
    if not spans:
        return []
    spans = sorted(spans, key=lambda x: x[0])
    merged = [spans[0]]
    for start, end in spans[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:  # overlapping or touching
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged


def _insert_tags(text: str, spans: List[Tuple[int, int]], tag: str = "mark") -> str:
    """Return *text* with each span wrapped in ``<tag>`` .. ``</tag>``.

    Assumes *spans* are non-overlapping **and sorted ascending**.
    """
    if not spans:
        return text
    parts: List[str] = []
    last_idx = 0
    for start, end in spans:
        parts.append(text[last_idx:start])
        parts.append(f"<{tag}>" + text[start:end] + f"</{tag}>")
        last_idx = end
    parts.append(text[last_idx:])
    return "".join(parts)


def highlight_html_with_evidence(
    html_text: str,
    evidence: str,
    *,
    n: int = 3,
    overlap_threshold: float = 0.5,
) -> str:
    """Return *html_text* with occurrences of *evidence* wrapped in ``<mark>`` tags."""
    if not evidence or evidence in {"N/A", "None"}:
        return html_text

    # Work on the raw HTML string directly – obtain the spans first.
    spans = localize_evidence(html_text, evidence, n=n, overlap_threshold=overlap_threshold)
    if not spans:
        return html_text  # nothing to highlight

    highlighted = _insert_tags(html_text, spans, tag="mark")

    # Inject tiny CSS once – the id attribute prevents duplicates.  The exact
    # rules are cosmetic; this is a representative yellow-highlight style.
    style_block = (
        '<style id="evidence-highlight-style">'
        "mark { background-color: #fff3b0; padding: 0 2px; }"
        "</style>\n"
    )
    if "evidence-highlight-style" not in html_text:
        highlighted = style_block + highlighted

    return highlighted
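

# ---------------------------------------------------------------------------
# Example usage ---------------------------------------------------------------
# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of both public functions.  The sample HTML
# and evidence strings below are invented for illustration only.

if __name__ == "__main__":
    sample = "<p>The quick brown fox jumps over the lazy dog.</p>"

    # Quoted evidence -> exact, case-insensitive matching of the quoted part;
    # the output is the sample HTML with the phrase wrapped in <mark> tags,
    # prefixed by the injected <style> block.
    print(highlight_html_with_evidence(sample, 'witness said "QUICK BROWN FOX"'))

    # Unquoted, paraphrased evidence ("leaped" vs "jumps") -> no exact match,
    # so the n-gram overlap heuristic picks the best-overlapping token window
    # and returns its character span.
    print(localize_evidence(sample, "quick brown fox leaped over the lazy dog"))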