# coding=utf-8
"""Streamlit app for identifying South African languages in text.

The app offers three tabs: single-text analysis, bulk analysis of an uploaded
``.txt``/``.csv`` file, and information about the underlying research paper.
Predictions come from Hugging Face ``text-classification`` pipelines listed in
``MODEL_CONFIGS``.
"""
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from transformers import pipeline
import fasttext
from huggingface_hub import hf_hub_download
import json
import os
import re
import string
import base64
from typing import List, Tuple, Dict, Optional
import logging

logger = logging.getLogger(__name__)

# Configure page (must run before any other Streamlit command).
st.set_page_config(
    page_title="South African Language Identification",
    page_icon="🇿🇦",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS for better styling.
# NOTE(review): the style sheet body is blank in this copy of the file;
# the call is kept so the original hook stays in place.
st.markdown(""" """, unsafe_allow_html=True)

# Constants and Configuration
MODEL_CONFIGS = {
    "za-bert": {
        "name": "ZA-BERT",
        "model_id": "dsfsi/za-lid-bert",
        "description": "Lightweight BERT-based model trained on South African languages",
        "recommended": True,
    },
    "xlmr-large": {
        "name": "XLM-R Large",
        "model_id": "dsfsi/za-xlmrlarge-lid",
        "description": "XLM-RoBERTa Large model fine-tuned for SA languages",
    },
    "serengeti": {
        "name": "Serengeti",
        "model_id": "dsfsi/za-serengeti-lid",
        "description": "Afri-centric model with superior performance",
    },
    "afriberta": {
        "name": "AfriBERTa",
        "model_id": "dsfsi/za-afriberta-lid",
        "description": "African-focused BERT model",
    },
    "afro-xlmr": {
        "name": "Afro-XLM-R",
        "model_id": "dsfsi/za-afro-xlmr-base-lid",
        "description": "African-centric XLM-RoBERTa model",
    },
    "afrolm": {
        "name": "AfroLM",
        "model_id": "dsfsi/za-afrolm-lid",
        "description": "African language model",
    },
}

# Used when assets/language_names.json is not present.
FALLBACK_LANGUAGE_NAMES = {
    "afr": "Afrikaans",
    "eng": "English",
    "nso": "Northern Sotho",
    "sot": "Sesotho",
    "ssw": "Siswati",
    "tsn": "Setswana",
    "tso": "Xitsonga",
    "ven": "Tshivenda",
    "xho": "isiXhosa",
    "zul": "isiZulu",
    "nbl": "isiNdebele",
    "und": "Undetermined",
}

# Entries shown in the sidebar's "Supported Languages" list.
SUPPORTED_LANGUAGES = (
    "🇿🇦 Afrikaans",
    "🇬🇧 English",
    "🌍 Northern Sotho",
    "🌍 Sesotho",
    "🌍 Siswati",
    "🌍 Setswana",
    "🌍 Xitsonga",
    "🌍 Tshivenda",
    "🌍 isiXhosa",
    "🌍 isiZulu",
    "🌍 isiNdebele",
)

# Characters replaced by a space before classification: a few punctuation
# marks, the bullet (written as an escape so the source encoding cannot
# corrupt it) and all ASCII digits.
_STRIP_TABLE = str.maketrans(
    dict.fromkeys(":\u2022#{|}" + string.digits, " ")
)
_WHITESPACE_RE = re.compile(r"\s+")

# Session-state key of the single-text input, shared with the Clear callback.
_TEXT_INPUT_KEY = "single_text_input"

PAPER_SUMMARY_MD = """\
**"From N-grams to Pre-trained Multilingual Models For Language Identification"**

*Authors: Thapelo Andrew Sindane, Vukosi Marivate*

Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)

This research investigates N-gram models and large pre-trained multilingual models for Language Identification across 11 South African languages, showing that Serengeti performs best across all model types.
"""

PAPER_LINKS_MD = """\
**Links:**
- [📖 Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
- [🤗 HuggingFace](https://huggingface.co/dsfsi)
- [💻 GitHub](https://github.com/dsfsi/za-lid)
"""

CITATION_BIBTEX = """\
@inproceedings{sindane-marivate-2024-n,
    title = "From N-grams to Pre-trained Multilingual Models For Language Identification",
    author = "Sindane, Thapelo Andrew and Marivate, Vukosi",
    editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
    booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
    month = nov,
    year = "2024",
    address = "Miami, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.nlp4dh-1.22/",
    doi = "10.18653/v1/2024.nlp4dh-1.22",
    pages = "229--239"
}"""

ACKNOWLEDGMENTS_MD = """\
### 🏛️ Acknowledgments

This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.

**Contact:**
- 📧 Email: vukosi.marivate@cs.up.ac.za
- 🐦 Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
- 🌐 Website: [dsfsi.github.io](https://dsfsi.github.io)
"""


# Utility Functions
@st.cache_data
def load_language_names() -> Dict[str, str]:
    """Load the mapping from language code to display name.

    Returns:
        The contents of ``assets/language_names.json`` when that file exists,
        otherwise a built-in mapping for the South African languages.
    """
    try:
        with open("assets/language_names.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Return a copy so callers can never mutate the module constant.
        return dict(FALLBACK_LANGUAGE_NAMES)


@st.cache_resource
def _load_pipeline(model_key: str):
    """Build the text-classification pipeline for *model_key* (cached).

    Raises on failure so that Streamlit does not cache a failed load.
    """
    config = MODEL_CONFIGS[model_key]
    return pipeline("text-classification", model=config["model_id"])


def load_model(model_key: str):
    """Load (and cache) the model registered under *model_key*.

    Args:
        model_key: A key of ``MODEL_CONFIGS``.

    Returns:
        The Hugging Face pipeline, or ``None`` if loading failed. Failures
        are reported with ``st.error`` and are not cached, so a later call
        retries the load.
    """
    try:
        return _load_pipeline(model_key)
    except Exception as e:  # UI boundary: report instead of crashing the app
        logger.exception("Error loading model %s", model_key)
        st.error(f"Error loading model {model_key}: {str(e)}")
        return None


def preprocess_text(text: str) -> str:
    """Clean *text* before it is passed to a model.

    Newlines, digits and the characters ``: • # { | }`` become spaces, runs
    of whitespace collapse to a single space, and the result is stripped.

    Returns:
        The cleaned text, or ``""`` when *text* is empty or only whitespace.
    """
    if not text or not text.strip():
        return ""

    cleaned = text.replace("\n", " ").translate(_STRIP_TABLE)
    return _WHITESPACE_RE.sub(" ", cleaned).strip()


def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
    """Return the display name for a model *label*.

    Only the part of *label* before the first underscore is looked up in
    *lang_names*; the full label is returned when that code is unknown.
    """
    iso_code = label.split("_")[0]
    return lang_names.get(iso_code, label)


def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
    """Predict the language of *text*.

    Args:
        text: Raw input text.
        model: Callable returning a list of ``{"label", "score"}`` dicts.
        lang_names: Mapping from language code to display name.

    Returns:
        ``(label, confidence, language_name)``. ``("und", 0.0,
        "Undetermined")`` is returned when there is no model, no usable text
        or no prediction; ``("und", 0.0, "Error")`` when prediction raised.
    """
    undetermined = ("und", 0.0, "Undetermined")
    if model is None or not text.strip():
        return undetermined

    try:
        processed_text = preprocess_text(text)
        if not processed_text:
            return undetermined

        result = model(processed_text)
        if isinstance(result, list) and result:
            prediction = result[0]
            label = prediction["label"]
            return label, prediction["score"], get_language_name(label, lang_names)
        return undetermined
    except Exception as e:  # UI boundary: show the problem, keep the app alive
        logger.exception("Prediction error")
        st.error(f"Prediction error: {str(e)}")
        return "und", 0.0, "Error"


def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
    """Create a horizontal bar visualising *confidence* for *language*.

    The caller owns the returned figure and should close it after rendering.
    """
    fig, ax = plt.subplots(figsize=(10, 2))

    # Colors
    primary_color = "#ff6b35"
    bg_color = "#f8f9fa"
    text_color = "#2c3e50"

    # Filled part of the bar, followed by the faint remainder up to 1.0.
    ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8)
    ax.barh([0], [1 - confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3)

    # Styling
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
    ax.set_title(
        f"Language: {language} (Confidence: {confidence:.3f})",
        fontsize=14,
        fontweight="bold",
        color=text_color,
        pad=20,
    )

    # Remove y-axis and spines
    ax.set_yticks([])
    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)

    # Percentage label centred on the filled part of the bar.
    ax.text(confidence / 2, 0, f"{confidence:.1%}", ha="center", va="center",
            fontweight="bold", color="white")

    plt.tight_layout()
    return fig


def render_paper_info():
    """Render paper information and links."""
    st.markdown("### 📄 Research Paper")

    col1, col2 = st.columns([2, 1])
    with col1:
        st.markdown(PAPER_SUMMARY_MD)
    with col2:
        st.markdown(PAPER_LINKS_MD)


def render_citation():
    """Render BibTeX citation."""
    st.code(CITATION_BIBTEX, language="bibtex")


def _format_model_option(model_key: str) -> str:
    """Return the select-box label for a model, starring recommended ones."""
    config = MODEL_CONFIGS[model_key]
    prefix = "⭐ " if config.get("recommended") else ""
    return f"{prefix}{config['name']}"


def _clear_text_input() -> None:
    """Button callback: empty the single-text input before the next rerun."""
    st.session_state[_TEXT_INPUT_KEY] = ""


def _render_metric(value: str, caption: str) -> None:
    """Render one result card: a value with its caption underneath."""
    st.markdown(f"\n\n{value}\n\n{caption}\n\n", unsafe_allow_html=True)


def _render_sidebar() -> str:
    """Render the sidebar and return the key of the selected model."""
    with st.sidebar:
        st.header("⚙️ Model Configuration")

        # Model selection
        selected_model = st.selectbox(
            "Choose Model:",
            options=list(MODEL_CONFIGS.keys()),
            format_func=_format_model_option,
            index=0,
            help="Select the language identification model",
        )

        # Model info
        model_config = MODEL_CONFIGS[selected_model]
        _render_metric(model_config["name"], model_config["description"])

        # Supported languages
        st.subheader("📋 Supported Languages")
        for lang in SUPPORTED_LANGUAGES:
            st.write(f"• {lang}")

    return selected_model


def _render_single_text_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Render the tab that classifies one piece of typed or pasted text."""
    st.header("Single Text Analysis")

    user_text = st.text_area(
        "Enter text to identify language:",
        placeholder="Type or paste your text here...",
        height=100,
        help="Enter text in any South African language",
        key=_TEXT_INPUT_KEY,
    )

    col1, col2, _ = st.columns([1, 1, 2])
    with col1:
        analyze_button = st.button("🔍 Analyze", type="primary", use_container_width=True)
    with col2:
        # The callback resets the widget state; a bare rerun would keep the text.
        st.button("🗑️ Clear", use_container_width=True, on_click=_clear_text_input)

    if not analyze_button:
        return
    if not user_text.strip():
        st.warning("Please enter some text to analyze.")
        return

    with st.spinner("Analyzing language..."):
        model = load_model(selected_model)
        if model is None:
            st.error("Failed to load the model. Please try again.")
            return
        label, confidence, language_name = predict_language(user_text, model, lang_names)

    st.markdown("### 📊 Results")
    col1, col2, col3 = st.columns(3)
    with col1:
        _render_metric(language_name, "Detected Language")
    with col2:
        _render_metric(f"{confidence:.1%}", "Confidence")
    with col3:
        _render_metric(label, "Language Code")

    st.markdown("### 📈 Confidence Visualization")
    fig = create_confidence_plot(language_name, confidence)
    st.pyplot(fig)
    # pyplot keeps a global reference to every figure; release it so
    # figures do not accumulate across reruns.
    plt.close(fig)


def _read_uploaded_texts(uploaded_file) -> Optional[List[str]]:
    """Extract the non-empty texts from an uploaded ``.csv`` or text file.

    Returns:
        The list of stripped, non-empty texts, or ``None`` when a CSV lacks
        the required ``text`` column (an error is shown in that case).
    """
    if uploaded_file.name.lower().endswith(".csv"):
        df = pd.read_csv(uploaded_file)
        if "text" not in df.columns:
            st.error("CSV file must contain a 'text' column")
            return None
        # Drop missing cells; astype(str) alone would turn them into "nan".
        raw_texts = df["text"].dropna().astype(str).tolist()
    else:
        # utf-8-sig also strips a leading BOM, which would otherwise end up
        # inside the first line.
        raw_texts = uploaded_file.read().decode("utf-8-sig").splitlines()

    stripped = (raw.strip() for raw in raw_texts)
    return [text for text in stripped if text]


def _analyze_texts(texts: List[str], model, lang_names: Dict[str, str]) -> pd.DataFrame:
    """Classify every text, updating a progress bar, and tabulate the results."""
    results = []
    progress_bar = st.progress(0)
    total = len(texts)
    for i, text in enumerate(texts, start=1):
        label, confidence, language_name = predict_language(text, model, lang_names)
        results.append({
            "Text": text[:100] + "..." if len(text) > 100 else text,
            "Language": language_name,
            "Code": label,
            "Confidence": confidence,
        })
        progress_bar.progress(i / total)
    return pd.DataFrame(results)


def _render_bulk_results(results_df: pd.DataFrame) -> None:
    """Show the results table, summary charts and the CSV download button."""
    st.markdown("### 📊 Analysis Results")
    st.dataframe(results_df, use_container_width=True)

    # Summary statistics
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("### 📈 Language Distribution")
        st.bar_chart(results_df["Language"].value_counts())
    with col2:
        st.markdown("### 📊 Average Confidence by Language")
        avg_conf = (
            results_df.groupby("Language")["Confidence"]
            .mean()
            .sort_values(ascending=False)
        )
        st.bar_chart(avg_conf)

    st.download_button(
        label="📥 Download Results (CSV)",
        data=results_df.to_csv(index=False),
        file_name="language_identification_results.csv",
        mime="text/csv",
    )


def _render_bulk_tab(selected_model: str, lang_names: Dict[str, str]) -> None:
    """Render the tab that classifies every line/row of an uploaded file."""
    st.header("Bulk Text Analysis")

    uploaded_file = st.file_uploader(
        "Upload a text file",
        type=["txt", "csv"],
        help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column",
    )
    if not uploaded_file:
        return

    try:
        texts = _read_uploaded_texts(uploaded_file)
        if texts is None:
            # Error already shown. Return instead of st.stop() so the
            # remaining tabs still render.
            return
        if not texts:
            st.warning("The uploaded file contains no text to analyze.")
            return

        st.success(f"Loaded {len(texts)} texts for analysis")
        if not st.button("🚀 Analyze All", type="primary"):
            return

        model = load_model(selected_model)
        if model is None:
            st.error("Failed to load the model.")
            return

        _render_bulk_results(_analyze_texts(texts, model, lang_names))
    except Exception as e:  # UI boundary: malformed uploads must not crash the app
        logger.exception("Error processing uploaded file")
        st.error(f"Error processing file: {str(e)}")


def _render_about_tab() -> None:
    """Render paper information, citation and acknowledgments."""
    render_paper_info()
    st.markdown("---")
    st.markdown("### 📖 Citation")
    render_citation()
    st.markdown("---")
    st.markdown(ACKNOWLEDGMENTS_MD)


def main():
    """Build the page: header, sidebar and the three content tabs."""
    # Header
    st.markdown(
        "\n\n🇿🇦 South African Language Identification"
        "\n\nMultilingual Language Detection for South African Languages\n\n",
        unsafe_allow_html=True,
    )

    lang_names = load_language_names()
    selected_model = _render_sidebar()

    # Main content
    tab1, tab2, tab3 = st.tabs(["🔍 Single Text", "📁 Bulk Analysis", "📄 About"])
    with tab1:
        _render_single_text_tab(selected_model, lang_names)
    with tab2:
        _render_bulk_tab(selected_model, lang_names)
    with tab3:
        _render_about_tab()


if __name__ == "__main__":
    main()