import gradio as gr
import pkg_resources

from turkish_tokenizer import TokenType, TurkishTokenizer

# Get the version from the installed package
try:
    VERSION = pkg_resources.get_distribution("turkish-tokenizer").version
except pkg_resources.DistributionNotFound:
    VERSION = "unknown"

tokenizer = TurkishTokenizer()
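
# Quick sanity sketch of the tokenizer API this app relies on. The token dict
# shape ({"token": ..., "type": TokenType...}) is inferred from how it is
# consumed below; treat it as an assumption about turkish_tokenizer, not a spec:
#
#   tokens, _ = tokenizer.tokenize_text("kitaplar")
#   # e.g. tokens -> [{"token": "kitap", "type": TokenType.ROOT},
#   #                 {"token": "lar", "type": TokenType.SUFFIX}]
#   ids = tokenizer.encode("kitaplar")
#   assert tokenizer.decode(ids) == "kitaplar"  # expected round-trip property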

# Define colors for each token type
color_map = {
    TokenType.ROOT.name: "#FF6B6B",    # Red
    TokenType.SUFFIX.name: "#4ECDC4",  # Teal
    TokenType.BPE.name: "#FFE66D",     # Yellow
}


def tokenize_and_display(text):
    """
    Tokenize the input text and prepare it for display in Gradio's
    HighlightedText component.
    """
    if not text:
        # Return a structure that matches all outputs to avoid errors
        return [], "", "", ""

    tokens, _ = tokenizer.tokenize_text(text)

    # Create the list of (token, label) pairs for HighlightedText
    highlighted_tokens = []
    token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
    for t in tokens:
        token_text = t["token"]
        token_type = t["type"].name
        # Count token types for statistics
        token_stats[token_type] = token_stats.get(token_type, 0) + 1
        highlighted_tokens.append((token_text, token_type))

    encoded_ids = tokenizer.encode(text)
    decoded_text = tokenizer.decode(encoded_ids)

    # Calculate statistics
    total_tokens = len(tokens)
    total_chars = len(text)
    compression_ratio = (1 - total_tokens / total_chars) * 100 if total_chars > 0 else 0
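    # Worked example of the metric above: 44 characters split into 11 tokens
    # gives (1 - 11 / 44) * 100 = 75.0% compression; 0% means one token per
    # character, and negative values mean more tokens than characters.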

    # Define colors for the stats block
    bg_col, text_col, card_col, border_col = ("#f8f9fa", "#2d3748", "#ffffff", "#e2e8f0")

    # Create statistics HTML
    stats_html = f"""
    <div style="background:{bg_col};padding:20px;border-radius:12px;margin:20px 0;">
      <h4 style="color:{text_col};margin-bottom:15px;">📊 Tokenization Statistics</h4>
      <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:15px;margin-bottom:20px;">
        <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#3b82f6;">{total_chars}</div><div style="color:#64748b;font-size:14px;">Characters</div></div>
        <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#10b981;">{total_tokens}</div><div style="color:#64748b;font-size:14px;">Tokens</div></div>
        <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#f59e0b;">{compression_ratio:.1f}%</div><div style="color:#64748b;font-size:14px;">Compression</div></div>
      </div>
      <div>
        <h5 style="color:{text_col};margin-bottom:10px;">Token Type Distribution:</h5>
        <div style="display:flex;gap:15px;flex-wrap:wrap;">
          <div style="background:#FFADAD;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔴 Roots: {token_stats['ROOT']}</div>
          <div style="background:#A0C4FF;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔵 Suffixes: {token_stats['SUFFIX']}</div>
          <div style="background:#FDFFB6;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🟡 BPE: {token_stats['BPE']}</div>
        </div>
      </div>
    </div>"""

    return highlighted_tokens, str(encoded_ids), decoded_text, stats_html
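
# Illustrative direct call (hypothetical REPL/test usage; actual token IDs
# depend on the installed vocabulary):
#
#   highlighted, ids, decoded, stats = tokenize_and_display("Merhaba Dünya!")
#   # highlighted -> [("Merhaba", "ROOT"), ...]; decoded should equal the input.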

# Custom CSS for better styling
custom_css = """
.gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
.custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
.custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
.input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
.input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
"""

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("""
            # Turkish Tokenizer
            ### Advanced Turkish Text Tokenization with Visual Analysis
            Enter text to see how it's tokenized. Tokens are color-coded by type.
            """)

    input_text = gr.Textbox(
        label="📝 Input Text",
        placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
        lines=4,
        elem_classes=["input-textbox"],
    )

    with gr.Row():
        process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
        clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")

    gr.Markdown("---")
    gr.Markdown("### 🔄 Encoded & Decoded Output")
    with gr.Row():
        encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
        decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)

    gr.Markdown("### 💡 Example Texts")
    gr.Examples(
        examples=[
            ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
            ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
            ["KitapOkumak çok güzeldir ve bilgi verir."],
            ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
            ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
        ],
        inputs=input_text,
        label="Try these examples:",
    )

    gr.Markdown("---")
    gr.Markdown("### 🎨 Tokenization Output")
    highlighted_output = gr.HighlightedText(
        label="Colorized Tokens",
        color_map=color_map,
        show_legend=True,
    )
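
    # Note (Gradio behavior): the keys in color_map must match the label
    # strings emitted by tokenize_and_display (TokenType.<NAME>.name here);
    # labels without a matching key fall back to auto-assigned colors.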

    gr.Markdown("---")
    gr.Markdown("### 📊 Statistics")
    stats_output = gr.HTML(label="")

    gr.Markdown(f"--- \n **Turkish Tokenizer v{VERSION}** - Advanced tokenization for Turkish text.")

    # --- Event Handlers ---
    def process_with_theme(text):
        return tokenize_and_display(text)

    def clear_all():
        return "", [], "", "", ""

    # Connect the buttons to the functions
    process_button.click(
        fn=process_with_theme,
        inputs=[input_text],
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output],
    )
    clear_button.click(
        fn=clear_all,
        outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output],
    )

    # Auto-process on load with a default example
    demo.load(
        fn=lambda: tokenize_and_display("Merhaba Dünya!"),
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output],
    )


if __name__ == "__main__":
    demo.launch(show_error=True)
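    # When running locally rather than on Spaces, demo.launch(show_error=True,
    # share=True) would additionally expose a temporary public URL (share is a
    # standard gr.Blocks.launch() option).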