Spaces:

iscc
/

iscc-sct

Running

App Files Files Community

iscc-sct / iscc_sct /demo.py

titusz

Synced repo using 'sync_with_huggingface' Github Action

73ab668 verified 7 months ago

raw

history blame

13.2 kB

	"""
	Gradio demo showcasing ISCC Semantic Text Code.
	"""

	from loguru import logger as log
	import gradio as gr
	import iscc_sct as sct
	import textwrap
	import yaml


	# Maps each line-break character to a visible stand-in glyph. Used by
	# no_nl() and no_nl_inner() to make line breaks inside chunk text visible.
	newline_symbols = {
	"\u000a": "⏎", # Line Feed - Represented by the 'Return' symbol
	"\u000b": "↨", # Vertical Tab - Represented by the 'Up Down Arrow with Base' symbol
	"\u000c": "␌", # Form Feed - Unicode Control Pictures representation
	"\u000d": "↵", # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol
	"\u0085": "⤓", # Next Line - 'Downwards Arrow to Bar' symbol
	"\u2028": "↲", # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol
	"\u2029": "¶", # Paragraph Separator - Represented by the 'Pilcrow' symbol
	}


	def no_nl(text):
	    """Return *text* with every line-break character swapped for its printable symbol."""
	    # Every key of ``newline_symbols`` is a single character and no symbol is
	    # itself a key, so one translate pass equals the chained replacements.
	    return text.translate(str.maketrans(newline_symbols))


	def no_nl_inner(text):
	    """Replace line-break characters inside *text* with printable symbols.

	    Only the part between the first and the last non-whitespace character is
	    converted (using ``newline_symbols``). Leading and trailing whitespace is
	    returned exactly as it was passed in, so real line breaks around a chunk
	    still act as line breaks and leading spaces are not turned into newlines.

	    Args:
	        text: The string to convert.

	    Returns:
	        ``text`` with its inner line-break characters replaced. Empty or
	        whitespace-only input is returned unchanged.
	    """
	    core = text.strip()
	    if not core:
	        # Nothing but whitespace: there is no inner part. Counting the same
	        # characters once as "leading" and once as "trailing" would double them.
	        return text

	    # ``str.strip`` removes any whitespace, not just "\n", so slice the
	    # original edges back on instead of re-creating them as "\n" * count.
	    start = len(text) - len(text.lstrip())
	    end = start + len(core)

	    for char, symbol in newline_symbols.items():
	        core = core.replace(char, symbol)

	    return text[:start] + core + text[end:]


	def clean_chunk(chunk):
	    """Limit consecutive line breaks in *chunk* to a maximum of 2.

	    Runs of three or more "\\n" are collapsed to exactly two; single and
	    double line breaks are left as they are.

	    Args:
	        chunk: Text to clean.

	    Returns:
	        The text with no run of more than two consecutive "\\n" characters.
	    """
	    # Each pass removes one "\n" from every run of three, so the loop
	    # terminates once the longest remaining run has length two.
	    while "\n\n\n" in chunk:
	        chunk = chunk.replace("\n\n\n", "\n\n")
	    return chunk


	def compute_iscc_code(text1, text2, bit_length):
	    """Generate the Semantic Text-Code of two texts and compare them.

	    Returns:
	        A 3-tuple ``(iscc_of_text1, iscc_of_text2, similarity)`` where
	        ``similarity`` is whatever ``compare_codes`` returns for the two codes.
	    """
	    results = [sct.gen_text_code_semantic(text, bits=bit_length) for text in (text1, text2)]
	    first_iscc, second_iscc = (result["iscc"] for result in results)
	    return first_iscc, second_iscc, compare_codes(first_iscc, second_iscc, bit_length)


	def compare_codes(code_a, code_b, bits):
	    """Build the similarity bar for two ISCC codes.

	    Returns ``None`` when either code is missing or empty; otherwise the HTML
	    produced by ``generate_similarity_bar`` for the distance between the codes.
	    """
	    if not (code_a and code_b):
	        return None
	    distance = sct.iscc_distance(code_a, code_b)
	    return generate_similarity_bar(hamming_to_cosine(distance, bits))


	def truncate_text(text, max_length=70):
	    """Collapse whitespace in *text* and shorten it to fit *max_length* characters.

	    Text that has to be cut is ended with "...".
	    """
	    shortened = textwrap.shorten(text, max_length, placeholder="...")
	    return shortened


	def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
	    """Approximate the cosine similarity for a given hamming distance and dimension.

	    A distance of 0 maps to 1.0, ``dim / 2`` to 0.0 and ``dim`` to -1.0.
	    """
	    doubled_ratio = (2 * hamming_distance) / dim
	    return 1 - doubled_ratio


	def generate_similarity_bar(similarity):
	    """Render *similarity* (expected in [-1, 1]) as an HTML bar from -100% to +100%.

	    Non-negative values draw a green bar growing rightwards from the centre of
	    the container, negative values a red bar growing leftwards. The percentage
	    label is centred inside the coloured bar.
	    """
	    percent = similarity * 100

	    # The container centre is the zero point, so a full bar covers 50% of it.
	    half_width = int(abs(similarity) * 50)

	    if similarity >= 0:
	        fill, anchor = "green", "left"
	        label_anchor = "left: 50%;"
	        label_shift = "transform: translateX(-50%);"
	    else:
	        fill, anchor = "red", "right"
	        label_anchor = "right: 50%;"
	        label_shift = "transform: translateX(50%);"

	    return f"""
	<h3>Semantic Similarity</h3>
	<div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
	<div style='height: 100%; width: {half_width}%; background-color: {fill}; position: absolute; {anchor}: 50%;'>
	<span style='position: absolute; width: 100%; {label_anchor} top: 0; line-height: 30px; color: white; {label_shift}'>{percent:.2f}%</span>
	</div>
	</div>
	"""


	def load_samples():
	    """Read ``iscc_sct/samples.yml`` and return the value of its ``samples`` key."""
	    with open("iscc_sct/samples.yml", mode="r", encoding="utf-8") as handle:
	        document = yaml.safe_load(handle)
	    return document["samples"]


	# Sample texts are read once, at import time; the UI below indexes them as
	# samples["a"] and samples["b"].
	samples = load_samples()

	# Placeholder for custom CSS passed to gr.Blocks (currently no rules).
	custom_css = """
	"""

	# Default Gradio theme with custom text/monospace fonts and the
	# radius_none size preset.
	iscc_theme = gr.themes.Default(
	font=[gr.themes.GoogleFont("Readex Pro")],
	font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
	radius_size=gr.themes.sizes.radius_none,
	)

	# Build the demo UI. Components are created in display order; the event
	# handlers defined further down refer to them by these variable names.
	with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
	# Title row
	with gr.Row(variant="panel"):
	gr.Markdown(
	"""
	## ✂️ ISCC Semantic Text-Code
	Demo of cross-lingual Semantic Text-Code (proof of concept)
	""",
	)
	# Sample selectors: one dropdown per text, listing the keys of
	# samples["a"] / samples["b"] plus a "None" entry.
	with gr.Row(variant="panel"):
	with gr.Column(variant="panel"):
	sample_dropdown_a = gr.Dropdown(
	choices=["None"] + [lang for lang in samples["a"]],
	label="Select sample for Text A",
	value="None",
	)
	with gr.Column(variant="panel"):
	sample_dropdown_b = gr.Dropdown(
	choices=["None"] + [lang for lang in samples["b"]],
	label="Select sample for Text B",
	value="None",
	)

	# Text inputs, each with a read-out for its generated ISCC code
	with gr.Row(variant="panel"):
	with gr.Column(variant="panel"):
	in_text_a = gr.TextArea(
	label="Text A",
	placeholder="Choose sample text from the dropdown above or type or paste your text.",
	lines=12,
	max_lines=12,
	)
	out_code_a = gr.Textbox(label="ISCC Code for Text A")
	with gr.Column(variant="panel"):
	in_text_b = gr.TextArea(
	label="Text B",
	placeholder="Choose sample text from the dropdown above or type or paste your text.",
	lines=12,
	max_lines=12,
	)
	out_code_b = gr.Textbox(label="ISCC Code for Text B")

	# Similarity bar (HTML produced by generate_similarity_bar)
	with gr.Row(variant="panel"):
	with gr.Column(variant="panel"):
	out_similarity = gr.HTML(label="Similarity")

	# Bit-length selector: 64 to 256 in steps of 32, starting at 64
	with gr.Row(variant="panel"):
	in_iscc_bits = gr.Slider(
	label="ISCC Bit-Length",
	info="NUMBER OF BITS FOR OUTPUT ISCC",
	minimum=64,
	maximum=256,
	step=32,
	value=64,
	)

	# Chunk views, filled by process_text with (text, label) pairs
	with gr.Row(variant="panel"):
	with gr.Column(variant="panel"):
	out_chunks_a = gr.HighlightedText(
	label="Chunked Text A",
	interactive=False,
	elem_id="chunked-text-a",
	)
	with gr.Column(variant="panel"):
	out_chunks_b = gr.HighlightedText(
	label="Chunked Text B",
	interactive=False,
	elem_id="chunked-text-b",
	)

	def update_sample_text(choice, group):
	    """Return the sample stored under *choice* in ``samples[group]``.

	    The placeholder entry "None" yields an empty string.
	    """
	    return "" if choice == "None" else samples[group][choice]

	# Selecting a sample writes its text into the matching text area; the
	# lambdas pin the sample group ("a" or "b") for update_sample_text.
	sample_dropdown_a.change(
	lambda choice: update_sample_text(choice, "a"),
	inputs=[sample_dropdown_a],
	outputs=[in_text_a],
	)
	sample_dropdown_b.change(
	lambda choice: update_sample_text(choice, "b"),
	inputs=[sample_dropdown_b],
	outputs=[in_text_b],
	)

	def process_text(text, nbits, suffix):
	    """Generate the ISCC code and the highlighted chunk view for one text input.

	    Args:
	        text: Current content of the text area.
	        nbits: Bit length passed to ``sct.gen_text_code_semantic``.
	        suffix: "a" or "b"; selects which pair of output components is updated.

	    Returns:
	        A dict mapping the selected ``out_code_*`` and ``out_chunks_*``
	        components to their updated versions. For empty text both are cleared.

	    Raises:
	        KeyError: If *suffix* is neither "a" nor "b".
	    """
	    # Guard the slice: the emptiness check below accepts None, so logging must too.
	    log.debug(f"{(text or '')[:20]}")

	    # Explicit mapping instead of a globals() lookup: an unknown suffix fails
	    # here rather than producing a dict with ``None`` keys.
	    targets = {
	        "a": (out_code_a, out_chunks_a),
	        "b": (out_code_b, out_chunks_b),
	    }
	    out_code_func, out_chunks_func = targets[suffix]

	    # Keep the element id the component was created with ("chunked-text-a" /
	    # "chunked-text-b"); a shared id would give both outputs the same DOM id.
	    chunks_elem_id = f"chunked-text-{suffix}"

	    if not text:
	        return {
	            out_code_func: gr.Textbox(value=None),
	            out_chunks_func: gr.HighlightedText(value=None, elem_id=chunks_elem_id),
	        }

	    result = sct.gen_text_code_semantic(
	        text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
	    )
	    iscc = sct.Metadata(**result).to_object_format()

	    # Generate chunked text with simprints and overlaps
	    features = iscc.features[0]
	    overlaps = iscc.get_overlaps()
	    highlighted_chunks = []

	    for i, feature in enumerate(features.simprints):
	        content = feature.content

	        # Remove leading overlap (text shared with the previous chunk)
	        if i > 0 and overlaps[i - 1]:
	            content = content[len(overlaps[i - 1]) :]

	        # Remove trailing overlap (text shared with the next chunk), if any
	        trailing = overlaps[i] if i < len(overlaps) else None
	        if trailing:
	            content = content[: -len(trailing)]

	        label = f"{feature.size}:{feature.simprint}"
	        highlighted_chunks.append((no_nl_inner(content), label))

	        # Show the shared text once, between the two chunks it belongs to
	        if trailing:
	            highlighted_chunks.append((f"\n{no_nl(trailing)}\n", "overlap"))

	    return {
	        out_code_func: gr.Textbox(value=iscc.iscc),
	        out_chunks_func: gr.HighlightedText(value=highlighted_chunks, elem_id=chunks_elem_id),
	    }

	def recalculate_iscc(text_a, text_b, nbits):
	    """Recompute both ISCC codes and their similarity for a new bit length.

	    Returns:
	        A 3-tuple of updates for the two code text boxes and the similarity
	        output. A text box whose text is empty gets an update without a value;
	        the similarity is ``None`` unless both codes could be generated.
	    """

	    def encode(text):
	        # Empty text has no code.
	        return sct.gen_text_code_semantic(text, bits=nbits)["iscc"] if text else None

	    code_a = encode(text_a)
	    code_b = encode(text_b)

	    similarity = compare_codes(code_a, code_b, nbits) if code_a and code_b else None

	    box_a = gr.Textbox(value=code_a) if code_a else gr.Textbox()
	    box_b = gr.Textbox(value=code_b) if code_b else gr.Textbox()
	    return box_a, box_b, similarity

	# Editing a text regenerates its code and chunk view via process_text; the
	# lambdas pin the side ("a" or "b") whose outputs are updated.
	in_text_a.change(
	lambda text, nbits: process_text(text, nbits, "a"),
	inputs=[in_text_a, in_iscc_bits],
	outputs=[out_code_a, out_chunks_a],
	show_progress="full",
	trigger_mode="always_last",
	)
	in_text_b.change(
	lambda text, nbits: process_text(text, nbits, "b"),
	inputs=[in_text_b, in_iscc_bits],
	outputs=[out_code_b, out_chunks_b],
	show_progress="full",
	trigger_mode="always_last",
	)

	# Changing the bit length recomputes both codes and the similarity.
	in_iscc_bits.change(
	recalculate_iscc,
	inputs=[in_text_a, in_text_b, in_iscc_bits],
	outputs=[out_code_a, out_code_b, out_similarity],
	show_progress="full",
	)

	# Whenever either code changes, refresh the similarity bar from both codes.
	out_code_a.change(
	compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
	)
	out_code_b.change(
	compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
	)

	def reset_all():
	    """Return updates that put every input and output back into its initial state.

	    The tuple order matches the ``outputs`` list of the reset button: bit-length
	    slider, sample dropdowns A and B, text areas A and B, code boxes A and B,
	    similarity output, chunk views A and B.
	    """
	    return (
	        # Same value the slider is created with.
	        gr.Slider(value=64),  # Reset ISCC Bit-Length
	        # Choices must stay the plain keys of ``samples[...]``: the dropdown
	        # handlers use the selected choice directly as a lookup key.
	        gr.Dropdown(value="None", choices=["None"] + list(samples["a"])),  # Reset sample dropdown A
	        gr.Dropdown(value="None", choices=["None"] + list(samples["b"])),  # Reset sample dropdown B
	        gr.TextArea(value=""),  # Reset Text A
	        gr.TextArea(value=""),  # Reset Text B
	        gr.Textbox(value=""),  # Reset ISCC Code for Text A
	        gr.Textbox(value=""),  # Reset ISCC Code for Text B
	        gr.HTML(value=""),  # Reset Similarity
	        gr.HighlightedText(value=[]),  # Reset Chunked Text A
	        gr.HighlightedText(value=[]),  # Reset Chunked Text B
	    )

	# Reset button; the outputs list must stay in the same order as the tuple
	# returned by reset_all.
	with gr.Row(variant="panel"):
	reset_button = gr.Button("Reset All")

	reset_button.click(
	reset_all,
	outputs=[
	in_iscc_bits,
	sample_dropdown_a,
	sample_dropdown_b,
	in_text_a,
	in_text_b,
	out_code_a,
	out_code_b,
	out_similarity,
	out_chunks_a,
	out_chunks_b,
	],
	)

	# Static help text shown below the controls.
	with gr.Row(variant="panel"):
	with gr.Column(variant="panel"):
	gr.Markdown(
	"""
	## Understanding ISCC Semantic Text-Codes

	### What is an ISCC Semantic Text-Code?
	An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of the text,
	not just the exact words.

	### How does it work?
	1. Input: You provide a text in any language.
	2. Processing: Our system analyzes the meaning of the text.
	3. Output: A unique code is generated that represents the text's content.

	### What can it do?
	- Cross-language matching: It can recognize similar content across different languages.
	- Similarity detection: It can measure how similar two texts are in meaning, not just in words.
	- Content identification: It can help identify texts with similar content, even if the wording is different.

	### How to use this demo:
	1. Enter text: Type or paste text into either or both text boxes.
	2. Adjust bit length: Use the slider to change the detail level of the code (higher = more detailed).
	3. View results: See the generated ISCC code for each text.
	4. Compare: Look at the similarity bar to see how alike the two texts are in meaning.

	### Why is this useful?
	- Content creators: Find similar content across languages.
	- Researchers: Quickly compare documents or find related texts in different languages.
	- Publishers: Identify potential translations or similar works efficiently.

	This technology opens up new possibilities for understanding and managing text content across language barriers!
	"""
	)


	# Launch the demo only when this file is executed as a script.
	if __name__ == "__main__": # pragma: no cover
	demo.launch()