import streamlit as st
from tokenizer import HindiTokenizer


# Initialize tokenizer
def load_tokenizer():
    return HindiTokenizer()


def format_token_ids(token_ids):
    # Format token IDs in a readable way, 10 per line
    lines = []
    for i in range(0, len(token_ids), 10):
        line = token_ids[i:i + 10]
        lines.append(' '.join(str(token_id) for token_id in line))
    return '\n'.join(lines)


def format_hindi_tokens(tokens):
    # Join tokens with double spaces
    return '  '.join(tokens)


def main():
    st.title("Hindi Text Tokenizer")
    tokenizer = load_tokenizer()

    # Create columns for metrics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.subheader("Word Count")
    with col2:
        st.subheader("Compression Ratio")
    with col3:
        st.subheader("BPE Tokens")  # Renamed to clarify these are post-BPE tokens

    # Text input
    st.subheader("Input Text:")
    text_input = st.text_area(
        label="Input Hindi text",
        height=150,
        key="input",
        label_visibility="collapsed"
    )

    if st.button("Tokenize"):
        if text_input:
            # Get tokens and IDs
            token_ids, original_tokens, decoded_tokens = tokenizer.tokenize(text_input)

            # Calculate metrics
            word_count = len(text_input.split())
            original_bytes = sum(len(token.encode('utf-8')) for token in original_tokens)
            compression_ratio = original_bytes / len(token_ids)

            # Update metrics
            col1.write(f"{word_count}")
            col2.write(f"{compression_ratio:.2f}X")
            col3.write(f"{len(token_ids)}")  # This is the post-BPE token count

            # Optional: display both token counts for comparison
            st.caption(f"Initial tokens (after regex): {len(original_tokens)}")
            st.caption(f"Final tokens (after BPE): {len(token_ids)}")

            # Display token IDs in a formatted way
            st.subheader("Token IDs:")
            st.text_area(
                label="Generated token IDs",
                value=format_token_ids(token_ids),
                height=150,
                key="ids",
                label_visibility="collapsed"
            )

            # Display decoded tokens with tab separation
            st.subheader("Tokenized Text:")
            st.text_area(
                label="Tokenized output",
                value='\t'.join(decoded_tokens),
                height=150,
                key="tokens",
                label_visibility="collapsed"
            )
        else:
            st.warning("Please enter some text to tokenize.")


if __name__ == "__main__":
    main()
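The script imports HindiTokenizer from a local tokenizer module that is not shown here. From its use in main(), tokenize() is expected to return a tuple of (token_ids, original_tokens, decoded_tokens): the BPE token IDs, the pre-tokenized pieces produced by the regex split, and the decoded subword strings. The sketch below is only a hypothetical stand-in for that interface so the app can be exercised; the whitespace split and incremental ID assignment are placeholders, not the Space's actual regex pre-tokenization or trained BPE merges.

# tokenizer.py -- hypothetical interface sketch, not the Space's real implementation
class HindiTokenizer:
    def __init__(self):
        # A real implementation would load a trained BPE vocabulary and merge table here.
        self.vocab = {}

    def tokenize(self, text):
        # Placeholder logic: whitespace split instead of regex pre-tokenization + BPE merges.
        original_tokens = text.split()
        decoded_tokens = list(original_tokens)  # a trained BPE tokenizer would return subword pieces
        token_ids = [self.vocab.setdefault(tok, len(self.vocab)) for tok in decoded_tokens]
        return token_ids, original_tokens, decoded_tokens

To try the app locally, save the script above as app.py next to tokenizer.py and launch it with: streamlit run app.py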