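# Streamlit front-end for HindiTokenizer: takes Hindi text as input, tokenizes it,
# and displays word count, compression ratio, BPE token IDs, and the decoded tokens.
# Launch with `streamlit run <this file>` (use whatever name this module is saved under).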
import streamlit as st
from tokenizer import HindiTokenizer

# Load the tokenizer once and cache it across Streamlit reruns
@st.cache_resource
def load_tokenizer():
    return HindiTokenizer()

def format_token_ids(token_ids):
    # Format token IDs in a readable way, 10 per line
    lines = []
    for i in range(0, len(token_ids), 10):
        chunk = token_ids[i:i+10]
        lines.append(' '.join(str(token_id) for token_id in chunk))
    return '\n'.join(lines)

def format_hindi_tokens(tokens):
    # Join tokens with double spaces (alternative display helper; not used in main below)
    return '  '.join(tokens)

def main():
    st.title("Hindi Text Tokenizer")
    
    tokenizer = load_tokenizer()
    
    # Create columns for metrics
    col1, col2, col3 = st.columns(3)
    
    with col1:
        st.subheader("Word Count")
    with col2:
        st.subheader("Compression Ratio")
    with col3:
        st.subheader("BPE Tokens")  # Renamed to clarify these are post-BPE tokens
    
    # Text input
    st.subheader("Input Text:")
    text_input = st.text_area(
        label="Input Hindi text", 
        height=150, 
        key="input",
        label_visibility="collapsed"
    )
    
    if st.button("Tokenize"):
        if text_input:
            # Get tokens and IDs; tokenize() is expected to return
            # (BPE token IDs, pre-tokenization tokens, decoded token strings)
            token_ids, original_tokens, decoded_tokens = tokenizer.tokenize(text_input)
            
            # Calculate metrics
            word_count = len(text_input.split())
            original_bytes = sum(len(token.encode('utf-8')) for token in original_tokens)
            # Bytes of input per BPE token; guard against an empty token list
            compression_ratio = original_bytes / len(token_ids) if token_ids else 0.0
            
            # Update metrics
            col1.write(f"{word_count}")
            col2.write(f"{compression_ratio:.2f}X")
            col3.write(f"{len(token_ids)}")  # This is post-BPE token count
            
            # Optional: Display both token counts for comparison
            st.caption(f"Initial tokens (after regex): {len(original_tokens)}")
            st.caption(f"Final tokens (after BPE): {len(token_ids)}")
            
            # Display token IDs in a formatted way
            st.subheader("Token IDs:")
            st.text_area(
                label="Generated token IDs",
                value=format_token_ids(token_ids), 
                height=150, 
                key="ids",
                label_visibility="collapsed"
            )
            
            # Display decoded tokens with tab separation
            st.subheader("Tokenized Text:")
            st.text_area(
                label="Tokenized output",
                value='\t'.join(decoded_tokens), 
                height=150, 
                key="tokens",
                label_visibility="collapsed"
            )
        else:
            st.warning("Please enter some text to tokenize.")

if __name__ == "__main__":
    main()