|
import streamlit as st |
|
from utils import get_tokenizer, get_tokenization, get_vocab_size |
|
import logging |
|
import pandas as pd |
|
|
|
|
|
# Configure root logging once at import time for the whole app.
# DEBUG level is verbose; presumably intended for development — confirm before deploying.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')



# JSON-Lines file backing the tokenizer leaderboard DataFrame used by the UI.
dataframe_path = "darija_tokenizers_leaderboard.jsonl"
|
|
|
|
|
def calculate_metrics(tokenizer_id, text):
    """Compute tokenization statistics for one tokenizer on *text*.

    Parameters
    ----------
    tokenizer_id : str
        Identifier accepted by ``get_tokenizer`` (e.g. a model name).
    text : str
        Text to tokenize; may be empty.

    Returns
    -------
    dict | None
        Keys ``"Tokenizer"``, ``"Vocabulary Size"``, ``"Token Count"``,
        ``"Tokens/Character Ratio"``; ``None`` if loading or tokenizing
        fails (the error is logged and shown in the Streamlit UI).
    """
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("Calculating metrics for tokenizer: %s", tokenizer_id)
    try:
        # Only the calls that can actually raise live inside the try.
        tokenizer = get_tokenizer(tokenizer_id)
        tokens = get_tokenization(tokenizer, text)
        vocab_size = get_vocab_size(tokenizer)
    except Exception as e:
        # Broad catch is deliberate: any load/tokenization failure is
        # surfaced to the user instead of crashing the app.
        # logging.exception records the full traceback, unlike logging.error.
        logging.exception("Error processing %s", tokenizer_id)
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None

    tokens_count = len(tokens)
    # Guard against division by zero on empty input.
    tokens_ratio = tokens_count / len(text) if len(text) > 0 else 0
    logging.debug(
        "Metrics calculated: vocab_size=%s, tokens_count=%s, tokens_ratio=%s",
        vocab_size, tokens_count, tokens_ratio,
    )
    return {
        "Tokenizer": tokenizer_id,
        "Vocabulary Size": vocab_size,
        "Token Count": tokens_count,
        "Tokens/Character Ratio": tokens_ratio,
    }
|
|
|
def _render_tokenizer_column(df, input_text, index, color):
    """Render one tokenizer column: source selection, metrics, token chips.

    Parameters
    ----------
    df : pandas.DataFrame
        Leaderboard frame with a "Tokenizer" column.
    input_text : str
        Text to tokenize and display.
    index : int
        Column number (1 or 2); drives labels and widget keys so the two
        columns keep independent Streamlit state.
    color : str
        CSS background color for the token highlight chips.
    """
    st.subheader(f"Tokenizer {index}")
    choice = st.radio(
        f"Select Tokenizer {index} Source",
        ["From Leaderboard", "Enter New Model"],
        key=f"tokenizer_{index}_source",
    )
    if choice == "From Leaderboard":
        model = st.selectbox(f"Select Tokenizer {index}", df["Tokenizer"].tolist(), key=f"model_{index}")
    else:
        model = st.text_input(f"Enter Tokenizer {index} Name", key=f"model_{index}_input")
    if input_text and model:
        with st.spinner(f"Tokenizing with {model}..."):
            metrics = calculate_metrics(model, input_text)
            if metrics:
                st.write(f"**Vocabulary Size:** {metrics['Vocabulary Size']}")
                st.write(f"**Token Count:** {metrics['Token Count']}")
                st.write(f"**Tokens/Character Ratio:** {metrics['Tokens/Character Ratio']:.4f}")
                # NOTE(review): the tokenizer is loaded a second time here
                # (calculate_metrics already loaded it) — presumably
                # get_tokenizer caches; confirm in utils.
                tokenizer = get_tokenizer(model)
                tokens = tokenizer.tokenize(input_text)
                tokens_html = ' '.join(
                    f'<span style="background-color: {color}; padding: 2px 5px; margin-right: 5px; border-radius: 3px;">{token}</span>'
                    for token in tokens
                )
                st.markdown(f'<div style="line-height: 2.5;">{tokens_html}</div>', unsafe_allow_html=True)


def comparison_tab(df):
    """Streamlit tab that compares two tokenizers side by side on one text.

    Parameters
    ----------
    df : pandas.DataFrame
        Leaderboard frame with a "Tokenizer" column used to populate the
        selection dropdowns.
    """
    st.header("Tokenizer Comparison")
    st.markdown("Compare two tokenizers side by side.")

    input_text = st.text_area("Enter text to compare:", "هذا مثال لنص بالدارجة المغربية")

    col1, col2 = st.columns(2)

    # The two columns are identical except for index, widget keys and chip
    # color, so a single helper renders both.
    with col1:
        _render_tokenizer_column(df, input_text, 1, "#00BFFF")
    with col2:
        _render_tokenizer_column(df, input_text, 2, "#FF1493")