File size: 5,143 Bytes
00062a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f0a734
00062a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import streamlit as st
import pandas as pd
from utils import get_tokenizer, get_tokenization, get_vocab_size, check_latin_support
import logging
import matplotlib.pyplot as plt

# Configure logging
# DEBUG level with timestamped records; applies process-wide since this runs at import.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# JSON-lines file that persists the leaderboard between sessions
# (read by the Refresh button, written after each successful submission).
dataframe_path = "darija_tokenizers_leaderboard.jsonl"


def calculate_metrics(tokenizer_id, text):
    """Benchmark one tokenizer on *text* and build its leaderboard row.

    Parameters
    ----------
    tokenizer_id : str
        Hugging Face model id used to load the tokenizer.
    text : str
        Sample text to tokenize.

    Returns
    -------
    dict | None
        A dict of leaderboard columns, or ``None`` when the tokenizer
        fails to load or run (the error is logged and shown in the UI).
    """
    logging.debug(f"Calculating metrics for tokenizer: {tokenizer_id}")
    try:
        tok = get_tokenizer(tokenizer_id)
        token_list = get_tokenization(tok, text)
        vocab = get_vocab_size(tok)
        n_tokens = len(token_list)
        # Guard the ratio against empty input text.
        ratio = n_tokens / len(text) if len(text) > 0 else 0
        latin_ok = check_latin_support(tok)
        cls_name = tok.__class__.__name__
        logging.debug(f"Metrics calculated: vocab_size={vocab}, tokens_count={n_tokens}, tokens_ratio={ratio}, latin_support={latin_ok}, tokenizer_class={cls_name}")
        return {
            "Tokenizer": tokenizer_id,
            "Vocabulary Size": vocab,
            "Token Count": n_tokens,
            "Tokens/Character Ratio": ratio,
            "Latin Support": latin_ok,
            "Tokenizer Class": cls_name
        }
    except Exception as e:
        logging.error(f"Error processing {tokenizer_id}: {e}")
        st.error(f"Error processing {tokenizer_id}: {e}")
        return None

# User-facing markdown shown above the leaderboard table.
# NOTE: the yes/no markers were previously mojibake ("โœ…"/"โŒ") from a
# mis-decoded UTF-8 round-trip; restored to the intended ✅ / ❌ emoji.
leaderboard_description = """
The `Tokens/Character Ratio` in this leaderboard is based on the number of tokens generated by the tokenizer divided by the number of characters in the input text.

**A tokenizer that scores low in this leaderboard is considered more efficient in parsing Darija text.**

## Notes:
1.  `Vocabulary Size` is the total number of tokens in the tokenizer's vocabulary.
2.  `Token Count` is the total number of tokens generated by the tokenizer for the input text.
3.  `Tokens/Character Ratio` is the ratio of the number of tokens to the number of characters in the input text.
4.  `Latin Support` indicates whether the tokenizer can handle Latin characters in addition to Arabic characters (`✅` for yes, `❌` for no).
5.  `Tokenizer Class` is the class of the tokenizer (e.g. `BertTokenizer` or `GPT2Tokenizer`).
"""

def leaderboard_tab(df):
    """Render the leaderboard tab: description, table, submission form,
    refresh button, and a ratio barplot.

    Parameters
    ----------
    df : pandas.DataFrame
        Current leaderboard, one row per tokenizer, with at least the
        "Tokenizer" and "Tokens/Character Ratio" columns.
    """
    # NOTE: the flag emoji was mojibake ("๐Ÿ‡ฒ๐Ÿ‡ฆ"); restored to 🇲🇦.
    st.header("Darija Tokenizers Leaderboard 🇲🇦")
    st.markdown("A comparison of different tokenizers for the Moroccan Darija dialect.")
    st.markdown(leaderboard_description)

    # Lower ratio = more efficient tokenizer, so sort ascending.
    df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
    st.dataframe(df, hide_index=True)  # Table first

    model_name = st.text_input("Enter a tokenizer name from Hugging Face (e.g. google/gemma-2-27b-it)")
    col1, col2 = st.columns([3, 1])
    with col1:
        submit_new_model_btn = st.button("Submit New Model", type="primary")
    with col2:
        refresh_btn = st.button("Refresh", type="secondary")

    if submit_new_model_btn:
        if not model_name:
            # Guard: previously an empty input fell through and we tried
            # to benchmark the empty string as a model id.
            st.warning("Please enter a tokenizer name before submitting.")
        elif "Tokenizer" in df.columns and model_name in df["Tokenizer"].values:
            st.warning("This model is already in the leaderboard.")
        else:
            with st.spinner(f"Benchmarking {model_name}..."):
                # Darija benchmark sentence. The original literal was
                # double-encoded mojibake; restored to the intended Arabic.
                input_text = "الللي اكتشفوا الساحل الغربي لأمريكا الشمالية"
                try:
                    tokenizer = get_tokenizer(model_name)
                    tokens = tokenizer.tokenize(input_text)
                    # If every token is <unk>, the vocab has no Arabic coverage.
                    if all(token == tokenizer.unk_token for token in tokens):
                        st.error(f"Tokenizer {model_name} does not support Arabic characters.")
                        return
                    benchmark_data = calculate_metrics(model_name, input_text)
                    if benchmark_data:
                        # pd.concat replaces DataFrame._append, a private
                        # method removed in pandas 2.x.
                        df = pd.concat([df, pd.DataFrame([benchmark_data])], ignore_index=True)
                        df = df.sort_values(by="Tokens/Character Ratio", ascending=True)
                        df.to_json(dataframe_path, lines=True, orient="records", force_ascii=False)
                        st.success(f"Model {model_name} added to the leaderboard.")
                        st.rerun()  # Refresh the page
                    else:
                        st.error(f"Failed to benchmark {model_name}")
                except Exception as e:
                    st.error(f"Error loading tokenizer {model_name}: {e}")
    if refresh_btn:
        try:
            df = pd.read_json(dataframe_path, lines=True)
            st.success("Leaderboard refreshed.")
        except Exception:
            # Narrowed from a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt.
            st.error("Failed to refresh leaderboard.")

    with st.expander("Tokens/Character Ratio Barplot (Lower is Better)", expanded=False):  # Barplot last
        fig = plt.figure(figsize=(10, 6))
        # Sort by ratio ascending since lower is better.
        sorted_df = df.sort_values("Tokens/Character Ratio", ascending=True)
        plt.bar(sorted_df["Tokenizer"], sorted_df["Tokens/Character Ratio"])
        plt.xticks(rotation=45, ha='right')
        plt.ylabel("Tokens/Character Ratio")
        plt.tight_layout()
        st.pyplot(fig)