dsfsi-lid-space

Running

File size: 16,495 Bytes

# coding=utf-8
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
from transformers import pipeline
import fasttext
from huggingface_hub import hf_hub_download
import json
import os
import re
import string
import base64
from typing import List, Tuple, Dict, Optional
import logging

# Configure page
# This call sits at module level ahead of every other st.* call in this file.
st.set_page_config(
    page_title="South African Language Identification",
    page_icon="🇿🇦",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling
# Injected once as raw HTML. The classes .main-header, .model-card and
# .metric-card are used by the HTML snippets emitted from main();
# .result-container is defined here but not referenced elsewhere in this file.
st.markdown("""
<style>
    .main-header {
        text-align: center;
        padding: 1rem 0;
        background: linear-gradient(90deg, #ff6b35, #f7931e);
        color: white;
        border-radius: 10px;
        margin-bottom: 2rem;
    }
    .model-card {
        background: #f8f9fa;
        padding: 1rem;
        border-radius: 8px;
        border-left: 4px solid #ff6b35;
        margin: 1rem 0;
    }
    .result-container {
        background: white;
        padding: 1.5rem;
        border-radius: 10px;
        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        margin: 1rem 0;
    }
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 1rem;
        border-radius: 8px;
        text-align: center;
    }
</style>
""", unsafe_allow_html=True)

# Constants and Configuration
# Registry of the selectable models, keyed by a short internal identifier.
# Each entry provides:
#   "name"        - label shown in the sidebar selectbox and model card
#   "model_id"    - identifier handed to transformers.pipeline() by load_model()
#   "description" - blurb rendered in the sidebar model card
#   "recommended" - optional; when truthy the selectbox label gets a star prefix
# Insertion order matters: main() lists the keys in this order and preselects
# index 0.
MODEL_CONFIGS = {
    "za-bert": {
        "name": "ZA-BERT",
        "model_id": "dsfsi/za-lid-bert",
        "description": "Lightweight BERT-based model trained on South African languages",
        "recommended": True
    },
    "xlmr-large": {
        "name": "XLM-R Large",
        "model_id": "dsfsi/za-xlmrlarge-lid",
        "description": "XLM-RoBERTa Large model fine-tuned for SA languages"
    },
    "serengeti": {
        "name": "Serengeti",
        "model_id": "dsfsi/za-serengeti-lid",
        "description": "Afri-centric model with superior performance"
    },
    "afriberta": {
        "name": "AfriBERTa",
        "model_id": "dsfsi/za-afriberta-lid",
        "description": "African-focused BERT model"
    },
    "afro-xlmr": {
        "name": "Afro-XLM-R",
        "model_id": "dsfsi/za-afro-xlmr-base-lid",
        "description": "African-centric XLM-RoBERTa model"
    },
    "afrolm": {
        "name": "AfroLM",
        "model_id": "dsfsi/za-afrolm-lid",
        "description": "African language model"
    }
}

# Utility Functions
@st.cache_data
def load_language_names() -> Dict[str, str]:
    """Return the mapping from language code to display name.

    The mapping is read from ``assets/language_names.json`` (resolved against
    the current working directory). When that file does not exist, a built-in
    table covering the South African languages plus ``"und"`` is returned.

    Returns:
        Dict mapping a language code (e.g. ``"zul"``) to its display name
        (e.g. ``"isiZulu"``).
    """
    try:
        # Decode explicitly as UTF-8: without ``encoding`` open() falls back to
        # the platform locale, which can mangle non-ASCII language names.
        with open("assets/language_names.json", "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # Fallback mapping for common South African languages
        return {
            "afr": "Afrikaans",
            "eng": "English",
            "nso": "Northern Sotho",
            "sot": "Sesotho",
            "ssw": "Siswati",
            "tsn": "Setswana",
            "tso": "Xitsonga",
            "ven": "Tshivenda",
            "xho": "isiXhosa",
            "zul": "isiZulu",
            "nbl": "isiNdebele",
            "und": "Undetermined"
        }

@st.cache_resource
def _load_pipeline(model_id: str):
    """Build (once per ``model_id``) the text-classification pipeline.

    Failures are allowed to propagate so that only successfully built
    pipelines end up in the resource cache.
    """
    return pipeline("text-classification", model=model_id)


def load_model(model_key: str):
    """Return the classification pipeline for ``model_key``, or ``None``.

    Args:
        model_key: A key of ``MODEL_CONFIGS``.

    Returns:
        The cached pipeline for the configured ``model_id``; ``None`` when the
        key is unknown or the pipeline cannot be built, in which case the
        error is also shown on the page.
    """
    try:
        config = MODEL_CONFIGS[model_key]
        # The caching lives in _load_pipeline, not here: if this wrapper were
        # cached, a transient failure would store None and every later attempt
        # for the same key would return that None without retrying.
        return _load_pipeline(config["model_id"])
    except Exception as e:
        st.error(f"Error loading model {model_key}: {str(e)}")
        return None

def preprocess_text(text: str) -> str:
    """Normalize raw input before it is handed to a model.

    Newlines, digits and the characters ``: • # { | }`` are turned into
    spaces, runs of whitespace are collapsed to a single space, and the
    result is trimmed. Empty or whitespace-only input yields ``""``.
    """
    if not (text and text.strip()):
        return ""

    # One translation pass blanks out newlines together with the noisy symbols.
    noisy_chars = '\n' + ':•#{|}' + string.digits
    blank_out = str.maketrans(noisy_chars, ' ' * len(noisy_chars))
    blanked = text.translate(blank_out)

    # Squeeze whitespace runs, then trim both ends.
    squeezed = re.sub(r'\s+', ' ', blanked)
    return squeezed.strip()

def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
    """Translate a model label into a display name.

    Only the part of ``label`` before its first underscore is looked up in
    ``lang_names``; when that code is unknown the full, unmodified label is
    returned.
    """
    # partition() leaves the whole label in slot 0 when there is no underscore.
    iso_code = label.partition('_')[0]
    return lang_names.get(iso_code, label)

def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
    """Predict the language of ``text`` with ``model``.

    Args:
        text: Raw input; it is cleaned with ``preprocess_text`` first.
        model: Callable classifier as returned by ``load_model`` (a
            text-classification pipeline), or ``None``.
        lang_names: Mapping from language code to display name.

    Returns:
        ``(label, confidence, language_name)`` taken from the first prediction.
        ``("und", 0.0, "Undetermined")`` is returned when there is no model,
        no usable text, or no prediction; ``("und", 0.0, "Error")`` when the
        model call raises (the error is also shown on the page).
    """
    undetermined = ("und", 0.0, "Undetermined")

    # Explicit None check for the model; ``not text`` also covers text=None.
    if model is None or not text or not text.strip():
        return undetermined

    try:
        processed_text = preprocess_text(text)
        if not processed_text:
            return undetermined

        # Ask the tokenizer to truncate: inputs longer than the model's
        # maximum sequence length otherwise make the pipeline raise.
        result = model(processed_text, truncation=True)
        if isinstance(result, list) and len(result) > 0:
            prediction = result[0]
            label = prediction['label']
            confidence = prediction['score']
            language_name = get_language_name(label, lang_names)
            return label, confidence, language_name

        return undetermined

    except Exception as e:
        # Keep the traceback in the server log; the page only gets the message.
        logging.exception("Language prediction failed")
        st.error(f"Prediction error: {str(e)}")
        return "und", 0.0, "Error"

def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
    """Draw a single horizontal bar visualizing a prediction's confidence.

    Args:
        language: Name shown in the plot title.
        confidence: Score drawn as the filled part of a 0-1 axis and printed
            as a percentage inside the bar.

    Returns:
        A standalone Matplotlib figure (not registered with pyplot).
    """
    # Build the figure directly rather than through plt.subplots(): pyplot
    # keeps every figure it creates in a process-wide registry until it is
    # explicitly closed, and its "current figure" state is shared between
    # threads. A standalone Figure avoids both problems.
    fig = plt.Figure(figsize=(10, 2))
    ax = fig.subplots()

    # Colors
    primary_color = "#ff6b35"
    bg_color = "#f8f9fa"
    text_color = "#2c3e50"

    # Filled part of the bar, then the remainder up to 1.0 stacked after it
    ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8)
    ax.barh([0], [1-confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3)

    # Styling
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
    ax.set_title(f"Language: {language} (Confidence: {confidence:.3f})",
                fontsize=14, fontweight='bold', color=text_color, pad=20)

    # Remove y-axis and spines
    ax.set_yticks([])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    # Percentage label centered on the filled part of the bar
    ax.text(confidence/2, 0, f"{confidence:.1%}",
           ha='center', va='center', fontweight='bold', color='white')

    # Lay out this figure specifically instead of pyplot's "current" figure
    fig.tight_layout()
    return fig

def render_paper_info():
    """Show the research-paper summary beside a column of related links."""
    st.markdown("### 📄 Research Paper")

    # Wide column (2/3) for the summary, narrow column (1/3) for the links.
    summary_col, links_col = st.columns([2, 1])

    summary_col.markdown("""
        **"From N-grams to Pre-trained Multilingual Models For Language Identification"**
        
        *Authors: Thapelo Andrew Sindane, Vukosi Marivate*
        
        Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)
        
        This research investigates N-gram models and large pre-trained multilingual models for Language Identification 
        across 11 South African languages, showing that Serengeti performs best across all model types.
        """)

    links_col.markdown("""
        **Links:**
        - [📖 Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
        - [🤗 HuggingFace](https://huggingface.co/dsfsi)
        - [💻 GitHub](https://github.com/dsfsi/za-lid)
        """)

def render_citation():
    """Display the paper's BibTeX entry in a code block."""
    bibtex_entry = """@inproceedings{sindane-marivate-2024-n,
    title = "From N-grams to Pre-trained Multilingual Models For Language Identification",
    author = "Sindane, Thapelo Andrew and Marivate, Vukosi",
    editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
    booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
    month = nov,
    year = "2024",
    address = "Miami, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.nlp4dh-1.22/",
    doi = "10.18653/v1/2024.nlp4dh-1.22",
    pages = "229--239"
}"""

    # Rendered verbatim, tagged as BibTeX.
    st.code(bibtex_entry, language='bibtex')

def _clear_user_text() -> None:
    """Clear-button callback: empty the single-text input box."""
    st.session_state["user_text"] = ""


def _read_uploaded_texts(uploaded_file) -> List[str]:
    """Extract the texts to analyze from an uploaded ``.csv`` or ``.txt`` file.

    A CSV must contain a ``text`` column (otherwise an error is shown and the
    script run is stopped); one text per row is returned. Any other file is
    decoded as UTF-8 and yields one text per non-blank line.
    """
    # Case-insensitive so that e.g. "DATA.CSV" is not parsed as plain text.
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
        if 'text' not in df.columns:
            st.error("CSV file must contain a 'text' column")
            st.stop()
        # fillna first: astype(str) alone turns missing cells into the literal
        # text "nan", which would then be classified as if it were input.
        return df['text'].fillna('').astype(str).tolist()

    # utf-8-sig also reads plain UTF-8 and drops a leading byte-order mark.
    content = uploaded_file.read().decode('utf-8-sig')
    return [line.strip() for line in content.split('\n') if line.strip()]


def main():
    """Build the page: header, sidebar model picker and the three tabs.

    The tabs are: single-text analysis, bulk analysis of an uploaded file,
    and the About tab with the paper information and citation.
    """
    # Header
    st.markdown("""
    <div class="main-header">
        <h1>🇿🇦 South African Language Identification</h1>
        <p>Multilingual Language Detection for South African Languages</p>
    </div>
    """, unsafe_allow_html=True)
    
    # Load language names
    lang_names = load_language_names()
    
    # Sidebar
    with st.sidebar:
        st.header("⚙️ Model Configuration")
        
        # Model selection
        selected_model = st.selectbox(
            "Choose Model:",
            options=list(MODEL_CONFIGS.keys()),
            format_func=lambda x: f"{'⭐ ' if MODEL_CONFIGS[x].get('recommended') else ''}{MODEL_CONFIGS[x]['name']}",
            index=0,
            help="Select the language identification model"
        )
        
        # Model info
        model_config = MODEL_CONFIGS[selected_model]
        st.markdown(f"""
        <div class="model-card">
            <h4>{model_config['name']}</h4>
            <p>{model_config['description']}</p>
        </div>
        """, unsafe_allow_html=True)
        
        # Supported languages
        st.subheader("📋 Supported Languages")
        supported_langs = [
            "🏴󠁺󠁡󠁺󠁡󠁿 Afrikaans", "🇬🇧 English", "🌍 Northern Sotho", 
            "🌍 Sesotho", "🌍 Siswati", "🌍 Setswana",
            "🌍 Xitsonga", "🌍 Tshivenda", "🌍 isiXhosa", 
            "🌍 isiZulu", "🌍 isiNdebele"
        ]
        for lang in supported_langs:
            st.write(f"• {lang}")
    
    # Main content
    tab1, tab2, tab3 = st.tabs(["🔍 Single Text", "📁 Bulk Analysis", "📄 About"])
    
    with tab1:
        st.header("Single Text Analysis")
        
        # Text input. The key exposes the value in st.session_state so the
        # Clear button's callback can reset it.
        user_text = st.text_area(
            "Enter text to identify language:",
            placeholder="Type or paste your text here...",
            height=100,
            help="Enter text in any South African language",
            key="user_text"
        )
        
        # Third column is only a spacer that keeps the buttons narrow.
        col1, col2, _spacer = st.columns([1, 1, 2])
        
        with col1:
            analyze_button = st.button("🔍 Analyze", type="primary", use_container_width=True)
        
        with col2:
            # A bare rerun keeps the widget's value, so the box is emptied in
            # a callback, which runs before the script is re-executed.
            st.button("🗑️ Clear", use_container_width=True, on_click=_clear_user_text)
        
        if analyze_button and user_text.strip():
            with st.spinner("Analyzing language..."):
                # Load model
                model = load_model(selected_model)
                
                if model:
                    # Predict
                    label, confidence, language_name = predict_language(user_text, model, lang_names)
                    
                    # Results
                    st.markdown("### 📊 Results")
                    
                    # Metrics
                    col1, col2, col3 = st.columns(3)
                    
                    with col1:
                        st.markdown(f"""
                        <div class="metric-card">
                            <h3>{language_name}</h3>
                            <p>Detected Language</p>
                        </div>
                        """, unsafe_allow_html=True)
                    
                    with col2:
                        st.markdown(f"""
                        <div class="metric-card">
                            <h3>{confidence:.1%}</h3>
                            <p>Confidence</p>
                        </div>
                        """, unsafe_allow_html=True)
                    
                    with col3:
                        st.markdown(f"""
                        <div class="metric-card">
                            <h3>{label}</h3>
                            <p>Language Code</p>
                        </div>
                        """, unsafe_allow_html=True)
                    
                    # Confidence visualization
                    st.markdown("### 📈 Confidence Visualization")
                    fig = create_confidence_plot(language_name, confidence)
                    st.pyplot(fig)
                    # Release the figure once rendered; harmless if pyplot is
                    # not tracking it.
                    plt.close(fig)
                    
                else:
                    st.error("Failed to load the model. Please try again.")
        
        elif analyze_button:
            st.warning("Please enter some text to analyze.")
    
    with tab2:
        st.header("Bulk Text Analysis")
        
        uploaded_file = st.file_uploader(
            "Upload a text file",
            type=['txt', 'csv'],
            help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column"
        )
        
        if uploaded_file:
            try:
                # Read file
                texts = _read_uploaded_texts(uploaded_file)
                
                if texts:
                    st.success(f"Loaded {len(texts)} texts for analysis")
                else:
                    # Without this guard the empty results frame has no
                    # 'Language' column and the summary below raises KeyError.
                    st.warning("No text found in the uploaded file.")
                
                # NOTE(review): results are rendered only in the run where
                # this button was clicked, so they disappear when another
                # widget (e.g. the download button) triggers a rerun.
                if texts and st.button("🚀 Analyze All", type="primary"):
                    model = load_model(selected_model)
                    
                    if model:
                        results = []
                        progress_bar = st.progress(0)
                        
                        for i, text in enumerate(texts):
                            label, confidence, language_name = predict_language(text, model, lang_names)
                            results.append({
                                'Text': text[:100] + '...' if len(text) > 100 else text,
                                'Language': language_name,
                                'Code': label,
                                'Confidence': confidence
                            })
                            progress_bar.progress((i + 1) / len(texts))
                        
                        # Results DataFrame
                        results_df = pd.DataFrame(results)
                        
                        # Display results
                        st.markdown("### 📊 Analysis Results")
                        st.dataframe(results_df, use_container_width=True)
                        
                        # Summary statistics
                        col1, col2 = st.columns(2)
                        
                        with col1:
                            st.markdown("### 📈 Language Distribution")
                            lang_counts = results_df['Language'].value_counts()
                            st.bar_chart(lang_counts)
                        
                        with col2:
                            st.markdown("### 📊 Average Confidence by Language")
                            avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
                            st.bar_chart(avg_conf)
                        
                        # Download button
                        csv_data = results_df.to_csv(index=False)
                        st.download_button(
                            label="📥 Download Results (CSV)",
                            data=csv_data,
                            file_name="language_identification_results.csv",
                            mime="text/csv"
                        )
                    
                    else:
                        st.error("Failed to load the model.")
            
            except Exception as e:
                st.error(f"Error processing file: {str(e)}")
    
    with tab3:
        render_paper_info()
        
        st.markdown("---")
        
        st.markdown("### 📖 Citation")
        render_citation()
        
        st.markdown("---")
        
        # NOTE(review): the e-mail entry below reads "[email protected]", which
        # looks like an address-obfuscation placeholder rather than a real
        # address; restore the intended address.
        st.markdown("""
        ### 🏛️ Acknowledgments
        
        This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.
        
        **Contact:**
        - 📧 Email: [email protected]
        - 🐦 Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
        - 🌐 Website: [dsfsi.github.io](https://dsfsi.github.io)
        """)

# Build the page only when this file is run as the main module, not on import.
if __name__ == "__main__":
    main()