Spaces:

iscc
/

iscc-sct

Running

File size: 13,172 Bytes

"""
Gradio demo showcasing ISCC Semantic Text Code.
"""

from loguru import logger as log
import gradio as gr
import iscc_sct as sct
import textwrap
import yaml


# Maps each line-breaking character to a visible stand-in glyph, so that line breaks
# inside a chunk can be displayed inline instead of breaking the rendered text.
newline_symbols = {
    "\u000a": "⏎",  # Line Feed - Represented by the 'Return' symbol
    "\u000b": "↨",  # Vertical Tab - Represented by the 'Up Down Arrow with Base' symbol
    "\u000c": "␌",  # Form Feed - Unicode Control Pictures representation
    "\u000d": "↵",  # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
    "\u0085": "⤓",  # Next Line - 'Downwards Arrow to Bar' symbol
    "\u2028": "↲",  # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
    "\u2029": "¶",  # Paragraph Separator - Represented by the 'Pilcrow' symbol
}


def no_nl(text):
    """Return *text* with every newline-like character swapped for its printable symbol.

    The characters and their replacements are taken from ``newline_symbols``.
    """
    # All keys are single characters and no symbol is itself a key, so a single
    # translate pass yields the same result as replacing them one after another.
    return text.translate(str.maketrans(newline_symbols))


def no_nl_inner(text):
    """Replace newline characters inside *text* with printable symbols.

    Only the part of the text between the first and the last non-whitespace
    character is rewritten (using ``newline_symbols``). Leading and trailing
    whitespace is returned exactly as it was passed in.

    Args:
        text: The string to process.

    Returns:
        The text with inner newline characters replaced. Empty or whitespace-only
        input is returned unchanged.
    """
    inner = text.strip()
    if not inner:
        # Nothing between the edges to rewrite. Returning early also avoids counting
        # the same whitespace as both "leading" and "trailing", which would double it.
        return text

    # Slice the original edges out verbatim instead of re-creating them as "\n" * count,
    # which would turn leading/trailing spaces and tabs into line breaks.
    lead_len = len(text) - len(text.lstrip())
    head = text[:lead_len]
    tail = text[lead_len + len(inner) :]

    for char, symbol in newline_symbols.items():
        inner = inner.replace(char, symbol)

    return head + inner + tail


def clean_chunk(chunk):
    """Collapse every pair of consecutive line feeds in *chunk* into a single one.

    Pairs are matched left to right without overlap in one pass, so a run of
    three or four line feeds becomes two, and a run of two becomes one.

    NOTE(review): an earlier description promised "a maximum of 2" consecutive
    line breaks, which this single pass does not guarantee for longer runs -
    confirm the intended behaviour.
    """
    pieces = chunk.split("\n\n")
    return "\n".join(pieces)


def compute_iscc_code(text1, text2, bit_length):
    """Generate semantic text codes for two texts and render their similarity.

    Args:
        text1: First text.
        text2: Second text.
        bit_length: Bit-length passed to ``sct.gen_text_code_semantic`` and used
            as the dimension for the similarity estimate.

    Returns:
        A tuple ``(iscc_1, iscc_2, similarity)`` where the first two items are the
        ``"iscc"`` entries of the generated results and the third is the value
        returned by ``compare_codes``.
    """
    first = sct.gen_text_code_semantic(text1, bits=bit_length)
    second = sct.gen_text_code_semantic(text2, bits=bit_length)
    iscc_1, iscc_2 = first["iscc"], second["iscc"]
    return iscc_1, iscc_2, compare_codes(iscc_1, iscc_2, bit_length)


def compare_codes(code_a, code_b, bits):
    """Render the similarity bar for two ISCC codes.

    Args:
        code_a: First ISCC code.
        code_b: Second ISCC code.
        bits: Dimension used to convert the code distance into a similarity.

    Returns:
        The HTML produced by ``generate_similarity_bar``, or ``None`` when either
        code is missing or empty.
    """
    if not (code_a and code_b):
        return None
    distance = sct.iscc_distance(code_a, code_b)
    return generate_similarity_bar(hamming_to_cosine(distance, bits))


def truncate_text(text, max_length=70):
    """Collapse whitespace in *text* and shorten it to at most *max_length* characters.

    Text that does not fit is cut at a word boundary and ends with ``"..."``.
    """
    shortened = textwrap.shorten(text, placeholder="...", width=max_length)
    return shortened


def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
    """Approximate the cosine similarity for a given hamming distance and dimension.

    A distance of 0 maps to 1.0, ``dim / 2`` maps to 0.0 and ``dim`` maps to -1.0.
    """
    differing_share = (2 * hamming_distance) / dim
    return 1 - differing_share


def generate_similarity_bar(similarity):
    """Build the HTML for a horizontal similarity bar.

    The bar grows from the centre of its container: to the right in green for
    values >= 0, to the left in red for negative values. The label shows the
    similarity as a percentage with two decimals.

    Args:
        similarity: Similarity value, expected in the range [-1, 1].

    Returns:
        An HTML snippet containing a heading and the bar.
    """
    percent = similarity * 100
    # A similarity of magnitude 1 fills one half (50%) of the container.
    width = int(abs(similarity) * 50)

    # Colour, anchor side and label placement all depend only on the sign.
    if similarity >= 0:
        color, anchor = "green", "left"
        label_anchor, label_shift = "left: 50%;", "transform: translateX(-50%);"
    else:
        color, anchor = "red", "right"
        label_anchor, label_shift = "right: 50%;", "transform: translateX(50%);"

    return f"""
    <h3>Semantic Similarity</h3>
    <div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
        <div style='height: 100%; width: {width}%; background-color: {color}; position: absolute; {anchor}: 50%;'>
            <span style='position: absolute; width: 100%; {label_anchor} top: 0; line-height: 30px; color: white; {label_shift}'>{percent:.2f}%</span>
        </div>
    </div>
    """


def load_samples():
    """Read ``iscc_sct/samples.yml`` and return the content of its ``samples`` key.

    The path is relative to the current working directory.
    """
    with open("iscc_sct/samples.yml", mode="r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)
    return document["samples"]


# Sample texts are loaded once at import time; groups "a" and "b" fill the two
# sample dropdowns and provide the text inserted into the text areas.
samples = load_samples()

# Placeholder for custom styling passed to gr.Blocks (currently just a newline).
custom_css = """
"""

# Default Gradio theme with custom fonts and square corners.
iscc_theme = gr.themes.Default(
    font=[gr.themes.GoogleFont("Readex Pro")],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
    radius_size=gr.themes.sizes.radius_none,
)

with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
    # Title banner
    with gr.Row(variant="panel"):
        gr.Markdown(
            """
        ## ✂️ ISCC Semantic Text-Code
        Demo of cross-lingual Semantic Text-Code (proof of concept)
        """,
        )
    # Sample selectors: one dropdown per sample group, with "None" meaning no sample
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            sample_dropdown_a = gr.Dropdown(
                choices=["None"] + [lang for lang in samples["a"]],
                label="Select sample for Text A",
                value="None",
            )
        with gr.Column(variant="panel"):
            sample_dropdown_b = gr.Dropdown(
                choices=["None"] + [lang for lang in samples["b"]],
                label="Select sample for Text B",
                value="None",
            )

    # Text inputs, each with the textbox that shows its generated ISCC code
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            in_text_a = gr.TextArea(
                label="Text A",
                placeholder="Choose sample text from the dropdown above or type or paste your text.",
                lines=12,
                max_lines=12,
            )
            out_code_a = gr.Textbox(label="ISCC Code for Text A")
        with gr.Column(variant="panel"):
            in_text_b = gr.TextArea(
                label="Text B",
                placeholder="Choose sample text from the dropdown above or type or paste your text.",
                lines=12,
                max_lines=12,
            )
            out_code_b = gr.Textbox(label="ISCC Code for Text B")

    # Similarity bar (HTML rendered by generate_similarity_bar via compare_codes)
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            out_similarity = gr.HTML(label="Similarity")

    # Bit-length selector: 64 to 256 in steps of 32, starting at 64
    with gr.Row(variant="panel"):
        in_iscc_bits = gr.Slider(
            label="ISCC Bit-Length",
            info="NUMBER OF BITS FOR OUTPUT ISCC",
            minimum=64,
            maximum=256,
            step=32,
            value=64,
        )

    # Read-only chunk views; each keeps its own element id ("chunked-text-a" / "-b")
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            out_chunks_a = gr.HighlightedText(
                label="Chunked Text A",
                interactive=False,
                elem_id="chunked-text-a",
            )
        with gr.Column(variant="panel"):
            out_chunks_b = gr.HighlightedText(
                label="Chunked Text B",
                interactive=False,
                elem_id="chunked-text-b",
            )

    def update_sample_text(choice, group):
        """Return the sample text for *choice* in *group*, or "" for the "None" entry."""
        return "" if choice == "None" else samples[group][choice]

    # Selecting a sample writes its text into the matching text area.
    # The lambdas bind the sample group ("a" or "b") for update_sample_text.
    sample_dropdown_a.change(
        lambda choice: update_sample_text(choice, "a"),
        inputs=[sample_dropdown_a],
        outputs=[in_text_a],
    )
    sample_dropdown_b.change(
        lambda choice: update_sample_text(choice, "b"),
        inputs=[sample_dropdown_b],
        outputs=[in_text_b],
    )

    def process_text(text, nbits, suffix):
        """Generate the ISCC code and the highlighted chunk view for one text input.

        Args:
            text: Current content of the text area.
            nbits: Bit-length forwarded to ``sct.gen_text_code_semantic``.
            suffix: ``"a"`` or ``"b"``; selects which pair of output components is updated.

        Returns:
            dict: Maps the selected code textbox and chunk view to their updated components.

        Raises:
            KeyError: If ``suffix`` is neither ``"a"`` nor ``"b"``.
        """
        # Direct lookup instead of ``globals().get(...)``: an unknown suffix fails loudly
        # rather than silently producing ``None`` keys in the returned update dict.
        out_code_func, out_chunks_func = {
            "a": (out_code_a, out_chunks_a),
            "b": (out_code_b, out_chunks_b),
        }[suffix]
        # Keep the element id the chunk view was created with ("chunked-text-a" / "-b").
        chunks_elem_id = f"chunked-text-{suffix}"

        # Empty (or missing) text clears both outputs; checked before slicing for the log.
        if not text:
            return {
                out_code_func: gr.Textbox(value=None),
                out_chunks_func: gr.HighlightedText(value=None, elem_id=chunks_elem_id),
            }
        log.debug(f"{text[:20]}")

        result = sct.gen_text_code_semantic(
            text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
        )
        iscc = sct.Metadata(**result).to_object_format()

        # Generate chunked text with simprints and overlaps
        features = iscc.features[0]
        overlaps = iscc.get_overlaps()
        highlighted_chunks = []

        for i, feature in enumerate(features.simprints):
            feature: sct.Feature
            content = feature.content

            # overlaps[i - 1] is treated as the text shared with the previous chunk;
            # it is cut from the start so it is not displayed twice.
            if i > 0 and overlaps[i - 1]:
                content = content[len(overlaps[i - 1]) :]

            # overlaps[i] is treated as the text shared with the next chunk; it is cut
            # from the end here and shown as its own "overlap" segment below.
            trailing = overlaps[i] if i < len(overlaps) else None
            if trailing:
                content = content[: -len(trailing)]

            label = f"{feature.size}:{feature.simprint}"
            highlighted_chunks.append((no_nl_inner(content), label))

            if trailing:
                highlighted_chunks.append((f"\n{no_nl(trailing)}\n", "overlap"))

        return {
            out_code_func: gr.Textbox(value=iscc.iscc),
            out_chunks_func: gr.HighlightedText(value=highlighted_chunks, elem_id=chunks_elem_id),
        }

    def recalculate_iscc(text_a, text_b, nbits):
        """Regenerate both ISCC codes and the similarity bar for a new bit-length.

        Args:
            text_a: Content of text area A.
            text_b: Content of text area B.
            nbits: Bit-length passed to ``sct.gen_text_code_semantic``.

        Returns:
            A tuple of two textbox components and the similarity HTML. A textbox is
            returned without a value when its text is empty, and the similarity is
            ``None`` unless both codes are available.
        """

        def _code_for(text):
            # Skip code generation entirely for empty input.
            return sct.gen_text_code_semantic(text, bits=nbits)["iscc"] if text else None

        code_a = _code_for(text_a)
        code_b = _code_for(text_b)

        similarity = compare_codes(code_a, code_b, nbits) if code_a and code_b else None

        box_a = gr.Textbox(value=code_a) if code_a else gr.Textbox()
        box_b = gr.Textbox(value=code_b) if code_b else gr.Textbox()
        return box_a, box_b, similarity

    # Editing a text recomputes its ISCC code and chunk view.
    # The lambdas bind the suffix that tells process_text which outputs to update.
    in_text_a.change(
        lambda text, nbits: process_text(text, nbits, "a"),
        inputs=[in_text_a, in_iscc_bits],
        outputs=[out_code_a, out_chunks_a],
        show_progress="full",
        trigger_mode="always_last",
    )
    in_text_b.change(
        lambda text, nbits: process_text(text, nbits, "b"),
        inputs=[in_text_b, in_iscc_bits],
        outputs=[out_code_b, out_chunks_b],
        show_progress="full",
        trigger_mode="always_last",
    )

    # Changing the bit-length regenerates both codes and the similarity bar.
    in_iscc_bits.change(
        recalculate_iscc,
        inputs=[in_text_a, in_text_b, in_iscc_bits],
        outputs=[out_code_a, out_code_b, out_similarity],
        show_progress="full",
    )

    # Whenever either code textbox changes, refresh the similarity bar.
    out_code_a.change(
        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
    )
    out_code_b.change(
        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
    )

    def reset_all():
        """Return the initial state for every component wired to the reset button.

        The order of the returned tuple must match the ``outputs`` list passed to
        ``reset_button.click``.

        Returns:
            tuple: Component updates for the slider, both sample dropdowns, both text
            areas, both code textboxes, the similarity HTML and both chunk views.
        """
        return (
            # Same value the slider is created with, so "reset" restores the initial state.
            gr.Slider(value=64),  # Reset ISCC Bit-Length
            # Choices must stay the plain keys of samples[group]: update_sample_text uses
            # the selected choice directly as a key, so prefixed labels would raise KeyError.
            gr.Dropdown(value="None", choices=["None"] + list(samples["a"])),  # Reset dropdown A
            gr.Dropdown(value="None", choices=["None"] + list(samples["b"])),  # Reset dropdown B
            gr.TextArea(value=""),  # Reset Text A
            gr.TextArea(value=""),  # Reset Text B
            gr.Textbox(value=""),  # Reset ISCC Code for Text A
            gr.Textbox(value=""),  # Reset ISCC Code for Text B
            gr.HTML(value=""),  # Reset Similarity
            gr.HighlightedText(value=[]),  # Reset Chunked Text A
            gr.HighlightedText(value=[]),  # Reset Chunked Text B
        )

    with gr.Row(variant="panel"):
        reset_button = gr.Button("Reset All")

    # The order of outputs must match the tuple returned by reset_all.
    reset_button.click(
        reset_all,
        outputs=[
            in_iscc_bits,
            sample_dropdown_a,
            sample_dropdown_b,
            in_text_a,
            in_text_b,
            out_code_a,
            out_code_b,
            out_similarity,
            out_chunks_a,
            out_chunks_b,
        ],
    )

    # Static explanatory text rendered as Markdown below the controls.
    # The text starts at column 0 inside the string literal.
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            gr.Markdown(
                """
## Understanding ISCC Semantic Text-Codes

### What is an ISCC Semantic Text-Code?
An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of the text,
not just the exact words.

### How does it work?
1. **Input**: You provide a text in any language.
2. **Processing**: Our system analyzes the meaning of the text.
3. **Output**: A unique code is generated that represents the text's content.

### What can it do?
- **Cross-language matching**: It can recognize similar content across different languages.
- **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
- **Content identification**: It can help identify texts with similar content, even if the wording is different.

### How to use this demo:
1. **Enter text**: Type or paste text into either or both text boxes.
2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more detailed).
3. **View results**: See the generated ISCC code for each text.
4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning.

### Why is this useful?
- **Content creators**: Find similar content across languages.
- **Researchers**: Quickly compare documents or find related texts in different languages.
- **Publishers**: Identify potential translations or similar works efficiently.

This technology opens up new possibilities for understanding and managing text content across language barriers!
"""
            )


# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":  # pragma: no cover
    demo.launch()