dsfsi-lid-space

Running

App Files Files Community

vukosi committed 4 days ago

Commit

590ee2f

verified ·

1 Parent(s): 9ac1327

Update app.py

Browse files

Files changed (1) hide show

app.py +440 -443

app.py CHANGED Viewed

@@ -1,477 +1,474 @@
 # coding=utf-8
-# Copyright 2023 The GlotLID Authors.
-# Lint as: python3
-# This space is built based on AMR-KELEG/ALDi space.
-# GlotLID Space
-import string
-import constants
-import pandas as pd
 import streamlit as st
-from huggingface_hub import hf_hub_download
-from GlotScript import get_script_predictor
-import matplotlib
-from matplotlib import pyplot as plt
-import fasttext
 import altair as alt
-from altair import X, Y, Scale
-import base64
 import json
 import os
 import re
-import transformers
-from transformers import pipeline
-@st.cache_resource
-def load_sp():
-    sp = get_script_predictor()
-    return sp
-sp = load_sp()
-def get_script(text):
-    """Get the writing systems of given text.
-    Args:
-        text: The text to be preprocessed.
-    Returns:
-        The main script and list of all scripts.
-    """
-    res = sp(text)
-    main_script = res[0] if res[0] else 'Zyyy'
-    all_scripts_dict = res[2]['details']
-    if all_scripts_dict:
-        all_scripts = list(all_scripts_dict.keys())
-    else:
-        all_scripts = 'Zyyy'
-    for ws in all_scripts:
-        if ws in ['Kana', 'Hrkt', 'Hani', 'Hira']:
-            all_scripts.append('Jpan')
-    all_scripts = list(set(all_scripts))
-    return main_script, all_scripts
-def preprocess_text(text):
-    """Apply preprocessing to the given text.
-    Args:
-        text: Thetext to be preprocessed.
-    Returns:
-        The preprocessed text.
-    """
-    # remove \n
-    text = text.replace('\n', ' ')
-    # get rid of characters that are ubiquitous
-    replace_by = " "
-    replacement_map = {
-        ord(c): replace_by
-        for c in ':•#{|}' + string.digits
     }
-    text = text.translate(replacement_map)
-    # make multiple space one space
-    text = re.sub(r'\s+', ' ', text)
-    # strip the text
-    text = text.strip()
-    return text
-@st.cache_data
-def language_names(json_path):
-    with open(json_path, 'r') as json_file:
-        data = json.load(json_file)
-    return data
-label2name = language_names("assets/language_names.json")
-def get_name(label):
-    """Get the name of language from label"""
-    iso_3 = label.split('_')[0]
-    name = label2name[iso_3]
-    return name
-@st.cache_data
-def render_svg(svg):
-    """Renders the given svg string."""
-    b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
-    html = rf'<p align="center"> <img src="data:image/svg+xml;base64,{b64}", width="40%"/></p>'
-    c = st.container()
-    c.write(html, unsafe_allow_html=True)
-@st.cache_data
-def render_metadata():
-    """Renders the metadata."""
-    html = r"""<p align="center">
-        <a href="https://huggingface.co/dsfsi/za-lid"><img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-8A2BE2"></a>
-        <a href="https://github.com/dsfsi/za-lid"><img alt="GitHub" src="https://img.shields.io/badge/%F0%9F%93%A6%20GitHub-orange"></a>
-        <a href="https://github.com/dsfsi/za-lid/blob/master/LICENSE.md"><img alt="GitHub license" src="https://img.shields.io/badge/Github%20Licence-blue"></a>
-        <a href="https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform" target="_blank"><img alt="Feedback Form" src="https://img.shields.io/badge/Feedback-Form-brightgreen"></a>
-        <a href="https://arxiv.org/abs/2410.08728" target="_blank"><img alt="arxiv" src="https://img.shields.io/badge/arxiv-2410.08728-blue"></a></p>"""
-    c = st.container()
-    c.write(html, unsafe_allow_html=True)
-@st.cache_data
-def citation():
-    """Renders the metadata."""
-    _CITATION  = """
-    @inproceedings{
-      kargaran2023glotlid,
-      title={GlotLID: Language Identification for Low-Resource Languages},
-      author={Kargaran, Amir Hossein and Imani, Ayyoob and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
-      booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
-      year={2023},
-      url={https://openreview.net/forum?id=dl4e3EBz5j}
-    }"""
-    st.code(_CITATION, language="python", line_numbers=False)
 @st.cache_data
-def convert_df(df):
-    # IMPORTANT: Cache the conversion to prevent computation on every rerun
-    return df.to_csv(index=None).encode("utf-8")
 @st.cache_resource
-def load_model(model_name, file_name):
-    model_path = hf_hub_download(repo_id=model_name, filename=file_name)
-    model = fasttext.load_model(model_path)
-    return model
-@st.cache_resource
-def load_model_pipeline(model_name, file_name):
-    model = pipeline("text-classification", model=model_name)
-    return model
-# model_1 = load_model(constants.MODEL_NAME, "model_v1.bin")
-# model_2 = load_model(constants.MODEL_NAME, "model_v2.bin")
-# model_3 = load_model(constants.MODEL_NAME, "model_v3.bin")
-# openlid = load_model('laurievb/OpenLID', "model.bin")
-# nllb = load_model('facebook/fasttext-language-identification', "model.bin")
-# MODELS
-model_xlmr_large = load_model_pipeline('dsfsi/za-xlmrlarge-lid', "model.bin")
-model_serengeti = load_model_pipeline('dsfsi/za-serengeti-lid', "model.bin")
-model_afriberta = load_model_pipeline('dsfsi/za-afriberta-lid', "model.bin")
-model_afroxlmr_base = load_model_pipeline('dsfsi/za-afro-xlmr-base-lid', "model.bin")
-model_afrolm        = load_model_pipeline('dsfsi/za-afrolm-lid', "model.bin")
-za_lid = load_model_pipeline('dsfsi/za-lid-bert', "model.bin")
-openlid = load_model('laurievb/OpenLID', "model.bin")
-glotlid_3 = load_model(constants.MODEL_NAME, "model_v3.bin")
-# @st.cache_resource
-def plot(label, prob):
-    ORANGE_COLOR = "#FF8000"
-    BLACK_COLOR = "#31333F"
-    fig, ax = plt.subplots(figsize=(8, 1))
-    fig.patch.set_facecolor("none")
-    ax.set_facecolor("none")
-    ax.spines["left"].set_color(BLACK_COLOR)
-    ax.spines["bottom"].set_color(BLACK_COLOR)
-    ax.tick_params(axis="x", colors=BLACK_COLOR)
-    ax.spines[["right", "top"]].set_visible(False)
-    ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
-    ax.set_xlim(0, 1)
-    ax.set_ylim(-1, 1)
-    ax.set_title(f"Label: {label}, Language: {get_name(label)}", color=BLACK_COLOR)
-    ax.get_yaxis().set_visible(False)
-    ax.set_xlabel("Confidence", color=BLACK_COLOR)
-    st.pyplot(fig)
-# @st.cache_resource
-def plot_multiples(models, labels, probs):
-    ORANGE_COLOR = "#FF8000"
-    BLACK_COLOR = "#31333F"
-    fig, ax = plt.subplots(figsize=(12, len(models)))
-    fig.patch.set_facecolor("none")
-    ax.set_facecolor("none")
-    ax.spines["left"].set_color(BLACK_COLOR)
-    ax.spines["bottom"].set_color(BLACK_COLOR)
-    ax.tick_params(axis="x", colors=BLACK_COLOR)
-    ax.spines[["right", "top"]].set_visible(False)
-    # Plot bars for each model, label, and probability
-    y_positions = range(len(models))  # Y positions for each model
-    ax.barh(y=y_positions, width=probs, color=ORANGE_COLOR)
-    # Add labels next to each bar
-    for i, (prob, label) in enumerate(zip(probs, labels)):
-        ax.text(prob + 0.01, i, f"{label} ({prob:.2f})", va='center', color=BLACK_COLOR)
-    # Set y-ticks and labels
-    ax.set_yticks(y_positions)
-    ax.set_yticklabels(models, color=BLACK_COLOR)
     ax.set_xlim(0, 1)
-    ax.set_xlabel("Confidence", color=BLACK_COLOR)
-    ax.set_title("Model Predictions", color=BLACK_COLOR)
-    st.pyplot(fig)
-def compute(sentences, version = 'v3'):
-    """Computes the language probablities and labels for the given sentences.
-    Args:
-        sentences: A list of sentences.
-    Returns:
-        A list of language probablities and labels for the given sentences.
-    """
-    progress_text = "Computing Language..."
-    if version == 'xlmrlarge':
-        model_choice = model_xlmr_large
-    elif version == 'serengeti':
-        model_choice = model_serengeti
-    elif version == 'afriberta':
-        model_choice = model_afriberta
-    elif version == 'afroxlmrbase':
-        model_choice = model_afroxlmr_base
-    elif version=='afrolm':
-        model_choice = model_afrolm
-    elif version == 'BERT':
-        model_choice = za_lid
-    elif version == 'openlid-201':
-        model_choice = openlid
-    elif version == 'GlotLID v3':
-          model_choice = glotlid_3
-    else:
-         model_choice = [(model_xlmr_large, "xlmrlarge"),(model_serengeti,"serengeti"), (model_afriberta,"afriberta"), (model_afroxlmr_base,"afroxlmrbase"), (model_afrolm,"afrolm"), (za_lid,"BERT"), (openlid,"openlid-201"),  (glotlid_3,"GlotLID v3")]
-    my_bar = st.progress(0, text=progress_text)
-    probs = []
-    labels = []
-    sentences = [preprocess_text(sent) for sent in sentences]
-    for index, sent in enumerate(sentences):
-        if type(model_choice) == list:
-                 all_models_pred = []
-                 for model_version in model_choice:
-                            m_version = model_version[1]
-                            model     = model_version[0]
-                            if m_version not in  ["openlid-201", "GlotLID v3"]:
-                                    output = model.predict(sent)
-                                    output_label = output[index]['label']
-                                    output_prob =  output[index]['score']
-                                    output_label_language = output[index]['label']
-                                    labels = labels + [output_label]
-                                    probs = probs + [output_prob]
-                                    my_bar.progress(
-                                        min((index) / len(sentences), 1),
-                                        text=progress_text,
-                                    )
-                            else:
-                                    output = model.predict(sent)
-                                    output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
-                                    output_prob = max(min(output[1][0], 1), 0)
-                                    output_label_language = output_label.split('_')[0]
-                                    # script control
-                                    if version in ['GlotLID v3', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
-                                        main_script, all_scripts = get_script(sent)
-                                        output_label_script = output_label.split('_')[1]
-                                        if output_label_script not in all_scripts:
-                                            output_label_script = main_script
-                                            output_label = f"und_{output_label_script}"
-                                            output_prob = 0
-                                    labels = labels + [output_label]
-                                    probs = probs + [output_prob]
-                                    my_bar.progress(
-                                        min((index) / len(sentences), 1),
-                                        text=progress_text,
-                                    )
-        else:
-                if version not in ["openlid-201", "GlotLID v3"]:
-                        output = model_choice.predict(sent)
-                        output_label = output[index]['label']
-                        output_prob =  output[index]['score']
-                        output_label_language = output[index]['label']
-                        labels = labels + [output_label]
-                        probs = probs + [output_prob]
-                        my_bar.progress(
-                                min((index) / len(sentences), 1),
-                                text=progress_text,
-                            )
-                else:
-                            output = model_choice.predict(sent)
-                            output_label  = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
-                            output_prob = max(min(output[1][0], 1), 0)
-                            output_label_language = output_label.split('_')[0]
-                            # script control
-                            if version in ['GlotLID v3', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
-                                main_script, all_scripts = get_script(sent)
-                                output_label_script = output_label.split('_')[1]
-                                if output_label_script not in all_scripts:
-                                    output_label_script = main_script
-                                    output_label = f"und_{output_label_script}"
-                                    output_prob = 0
-                            labels = labels + [output_label]
-                            probs = probs + [output_prob]
-                            my_bar.progress(
-                                min((index) / len(sentences), 1),
-                                text=progress_text,
-                )
-    my_bar.empty()
-    return probs, labels
-#  st.markdown("[![Duplicate Space](https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14)](https://huggingface.co/spaces/cis-lmu/glotlid-space?duplicate=true)")
-#  render_svg(open("assets/glotlid_logo.svg").read())
-render_metadata()
-img1, img2, img3 = st.columns(3)
-with img2:
-    with st.container():
-        st.image("logo_transparent_small.png")
-st.markdown("**DSFSI** Language Identification (LID) Inference Endpoint Created with **HuggingFace Spaces**.")
-with st.expander("More information about the space"):
-    st.write('''
-        Authors: Thapelo Sindane, Vukosi Marivate
-    ''')
-tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
-with tab1:
-    # choice = st.radio(
-    #     "Set granularity level",
-    #     ["default", "merge", "individual"],
-    #     captions=["enable both macrolanguage and its varieties (default)", "merge macrolanguage and its varieties into one label", "remove macrolanguages - only shows individual langauges"],
-    # )
-    version = st.radio(
-        "Choose model",
-        ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "openlid-201", "GlotLID v3", "All-Models"],
-        captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "OpenLID", "GlotLID v3",'All-Models'],
-        index = 4,
-        key = 'version_tab1',
-        horizontal = True
-    )
-    sent = st.text_input(
-        "Sentence:", placeholder="Enter a sentence.", on_change=None
-    )
-    # TODO: Check if this is needed!
-    clicked = st.button("Submit")
-    if sent:
-        probs, labels = compute([sent], version=version)
-        prob = probs[0]
-        label = labels[0]
-        # Check if the file exists
-        if not os.path.exists('logs.txt'):
-            with open('logs.txt', 'w') as file:
-                pass
-        print(f"{sent}, {label}: {prob}")
-        with open("logs.txt", "a") as f:
-            f.write(f"{sent}, {label}: {prob}\n")
-        # plot
-        if version == "All-Models":
-               plot_multiples(["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "OpenLID", "GlotLID v3"], labels, probs)
-        else:
-               plot(label, prob)
-with tab2:
-    version = st.radio(
-        "Choose model",
-        ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT","openlid-201", "GlotLID v3", "All-Models"],
-        captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "OpenLID", "GlotLID v3", "All-Models"],
-        index = 4,
-        key = 'version_tab2',
-        horizontal = True
-    )
-    file = st.file_uploader("Upload a file", type=["txt"])
-    if file is not None:
-        df = pd.read_csv(file, sep="¦\t¦", header=None, engine='python')
-        df.columns = ["Sentence"]
-        df.reset_index(drop=True, inplace=True)
-        # TODO: Run the model
-        df['Prob'], df["Label"] = compute(df["Sentence"].tolist(), version= version)
-        df['Language'] = df["Label"].apply(get_name)
-        # A horizontal rule
-        st.markdown("""---""")
-        chart = (
-            alt.Chart(df.reset_index())
-            .mark_area(color="darkorange", opacity=0.5)
-            .encode(
-                x=X(field="index", title="Sentence Index"),
-                y=Y("Prob", scale=Scale(domain=[0, 1])),
-            )
-        )
-        st.altair_chart(chart.interactive(), use_container_width=True)
-        col1, col2 = st.columns([4, 1])
-        with col1:
-            # Display the output
-            st.table(
-                df,
-            )
-        with col2:
-            # Add a download button
-            csv = convert_df(df)
-            st.download_button(
-                label=":file_folder: Download predictions as CSV",
-                data=csv,
-                file_name="GlotLID.csv",
-                mime="text/csv",
-            )
-# citation()

 # coding=utf-8
 import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
 import altair as alt
+from transformers import pipeline
+import fasttext
+from huggingface_hub import hf_hub_download
 import json
 import os
 import re
+import string
+import base64
+from typing import List, Tuple, Dict, Optional
+import logging
+# Configure page
+# Runs at import time, ahead of every other Streamlit call below.
+st.set_page_config(
+    page_title="South African Language Identification",
+    page_icon="🇿🇦",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Custom CSS for better styling
+# The classes are consumed by the raw-HTML snippets rendered in main():
+# .main-header (page banner), .model-card (sidebar model summary) and
+# .metric-card (result tiles). .result-container is defined here but no
+# markup in the added side of this diff references it.
+st.markdown("""
+<style>
+    .main-header {
+        text-align: center;
+        padding: 1rem 0;
+        background: linear-gradient(90deg, #ff6b35, #f7931e);
+        color: white;
+        border-radius: 10px;
+        margin-bottom: 2rem;
     }
+    .model-card {
+        background: #f8f9fa;
+        padding: 1rem;
+        border-radius: 8px;
+        border-left: 4px solid #ff6b35;
+        margin: 1rem 0;
+    }
+    .result-container {
+        background: white;
+        padding: 1.5rem;
+        border-radius: 10px;
+        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
+        margin: 1rem 0;
+    }
+    .metric-card {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        padding: 1rem;
+        border-radius: 8px;
+        text-align: center;
+    }
+</style>
+""", unsafe_allow_html=True)
+# Constants and Configuration
+# Registry of the selectable classifiers, keyed by the identifier that is
+# passed to load_model(). Fields of each entry:
+#   "name"        - label shown in the sidebar selectbox and model card
+#   "model_id"    - identifier handed to transformers.pipeline(model=...)
+#   "description" - blurb shown in the sidebar model card
+#   "recommended" - optional flag; main() reads it with .get() and prefixes
+#                   the displayed name with a star when it is truthy
+# main() preselects index 0, so the first entry is the default model.
+MODEL_CONFIGS = {
+    "za-bert": {
+        "name": "ZA-BERT",
+        "model_id": "dsfsi/za-lid-bert",
+        "description": "Lightweight BERT-based model trained on South African languages",
+        "recommended": True
+    },
+    "xlmr-large": {
+        "name": "XLM-R Large",
+        "model_id": "dsfsi/za-xlmrlarge-lid",
+        "description": "XLM-RoBERTa Large model fine-tuned for SA languages"
+    },
+    "serengeti": {
+        "name": "Serengeti",
+        "model_id": "dsfsi/za-serengeti-lid",
+        "description": "Afri-centric model with superior performance"
+    },
+    "afriberta": {
+        "name": "AfriBERTa",
+        "model_id": "dsfsi/za-afriberta-lid",
+        "description": "African-focused BERT model"
+    },
+    "afro-xlmr": {
+        "name": "Afro-XLM-R",
+        "model_id": "dsfsi/za-afro-xlmr-base-lid",
+        "description": "African-centric XLM-RoBERTa model"
+    },
+    "afrolm": {
+        "name": "AfroLM",
+        "model_id": "dsfsi/za-afrolm-lid",
+        "description": "African language model"
+    }
+}
+# Utility Functions
 @st.cache_data
+def load_language_names() -> Dict[str, str]:
+    """Return the mapping from language code to display name.
+
+    The mapping is read from ``assets/language_names.json``. When that file
+    does not exist, a built-in table covering the South African languages
+    (plus ``und`` for "Undetermined") is returned instead. The result is
+    cached by ``st.cache_data``.
+
+    Returns:
+        Dict mapping a language code (e.g. ``"zul"``) to its display name.
+    """
+    try:
+        # Explicit UTF-8: the platform default encoding is locale-dependent
+        # and could mis-decode non-ASCII language names in the JSON file.
+        with open("assets/language_names.json", 'r', encoding="utf-8") as f:
+            return json.load(f)
+    except FileNotFoundError:
+        # Fallback mapping for common South African languages
+        return {
+            "afr": "Afrikaans",
+            "eng": "English",
+            "nso": "Northern Sotho",
+            "sot": "Sesotho",
+            "ssw": "Siswati",
+            "tsn": "Setswana",
+            "tso": "Xitsonga",
+            "ven": "Tshivenda",
+            "xho": "isiXhosa",
+            "zul": "isiZulu",
+            "nbl": "isiNdebele",
+            "und": "Undetermined"
+        }
 @st.cache_resource
+def _load_pipeline(model_id: str):
+    """Build the text-classification pipeline for ``model_id``.
+
+    Cached per ``model_id``. Exceptions deliberately propagate: a call that
+    raises stores nothing in the cache, so a transient failure (e.g. a
+    network error during download) can be retried on the next run.
+    """
+    return pipeline("text-classification", model=model_id)
+def load_model(model_key: str):
+    """Return the classifier registered under ``model_key``, or ``None``.
+
+    Args:
+        model_key: A key of ``MODEL_CONFIGS``.
+
+    Returns:
+        The loaded pipeline, or ``None`` after showing an error in the UI
+        when the key is unknown or the model cannot be loaded.
+    """
+    try:
+        config = MODEL_CONFIGS[model_key]
+        return _load_pipeline(config["model_id"])
+    except Exception as e:
+        # UI boundary: report the failure instead of crashing the page.
+        # Handling it here, outside the cached function, keeps the None
+        # result from being cached for the lifetime of the process.
+        st.error(f"Error loading model {model_key}: {str(e)}")
+        return None
+def preprocess_text(text: str) -> str:
+    """Normalise raw input before it is handed to a classifier.
+
+    Newlines, digits and the characters ``: • # { | }`` are replaced by
+    spaces, runs of whitespace are collapsed to a single space, and the
+    result is stripped.
+
+    Args:
+        text: Raw user text; may be empty or ``None``.
+
+    Returns:
+        The cleaned text, or ``""`` when the input is empty/whitespace-only.
+    """
+    # Guard clause: nothing to clean.
+    if not (text and text.strip()):
+        return ""
+    noise_chars = ':•#{|}' + string.digits
+    blank_out = dict.fromkeys(map(ord, noise_chars), ' ')
+    flattened = text.replace('\n', ' ').translate(blank_out)
+    return re.sub(r'\s+', ' ', flattened).strip()
+def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
+    """Translate a model label into a display name.
+
+    Only the part of ``label`` before the first underscore is looked up
+    (the whole label when it has no underscore).
+
+    Args:
+        label: Label emitted by a classifier, e.g. ``"zul"`` or ``"zul_Latn"``.
+        lang_names: Mapping from language code to display name.
+
+    Returns:
+        The mapped name, or ``label`` unchanged when the code is unknown.
+    """
+    iso_code, _, _ = label.partition('_')
+    return lang_names.get(iso_code, label)
+def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
+    """Predict the language of ``text`` with ``model``.
+
+    Args:
+        text: Raw input text; it is cleaned with ``preprocess_text`` first.
+        model: Callable classifier (the pipeline returned by ``load_model``)
+            or ``None``.
+        lang_names: Mapping from language code to display name.
+
+    Returns:
+        ``(label, confidence, language_name)``. ``("und", 0.0,
+        "Undetermined")`` is returned when there is no model, no usable text,
+        or no prediction; ``("und", 0.0, "Error")`` when prediction raises
+        (the error is also shown in the UI).
+    """
+    undetermined = ("und", 0.0, "Undetermined")
+    # Explicit None test: do not rely on the truthiness of a pipeline object.
+    if model is None or not text:
+        return undetermined
+    try:
+        processed_text = preprocess_text(text)
+        if not processed_text:
+            return undetermined
+        # truncation=True is forwarded to the tokenizer so inputs longer than
+        # the model's maximum sequence length are cut instead of raising.
+        result = model(processed_text, truncation=True)
+        if isinstance(result, list) and len(result) > 0:
+            prediction = result[0]
+            label = prediction['label']
+            confidence = float(prediction['score'])
+            language_name = get_language_name(label, lang_names)
+            return label, confidence, language_name
+        return undetermined
+    except Exception as e:
+        # UI boundary: surface the failure and keep the caller's loop going.
+        st.error(f"Prediction error: {str(e)}")
+        return "und", 0.0, "Error"
+def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
+    """Draw a single horizontal bar visualising a prediction's confidence.
+
+    Args:
+        language: Display name shown in the title.
+        confidence: Score of the prediction; clamped to ``[0, 1]``.
+
+    Returns:
+        A Matplotlib figure that is not registered with pyplot, so it is
+        released once the caller drops it (no ``plt.close`` required).
+    """
+    # Local import: the figure is built directly instead of via plt.subplots,
+    # because pyplot keeps every figure it creates alive until it is closed,
+    # which accumulates across reruns in a long-lived server process.
+    from matplotlib.figure import Figure
+    # Keep the bar geometry valid even for a score outside [0, 1].
+    confidence = min(max(float(confidence), 0.0), 1.0)
+    fig = Figure(figsize=(10, 2))
+    ax = fig.subplots()
+    # Colors
+    primary_color = "#ff6b35"
+    bg_color = "#f8f9fa"
+    text_color = "#2c3e50"
+    # Create horizontal bar: filled part plus the remainder up to 1.0
+    ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8)
+    ax.barh([0], [1 - confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3)
+    # Styling
     ax.set_xlim(0, 1)
+    ax.set_ylim(-0.5, 0.5)
+    ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
+    ax.set_title(f"Language: {language} (Confidence: {confidence:.3f})",
+                fontsize=14, fontweight='bold', color=text_color, pad=20)
+    # Remove y-axis and spines
+    ax.set_yticks([])
+    ax.spines['top'].set_visible(False)
+    ax.spines['right'].set_visible(False)
+    ax.spines['left'].set_visible(False)
+    # Add confidence text. White text is only legible on top of the coloured
+    # bar, so for a short bar the label goes to its right in the dark colour.
+    if confidence >= 0.15:
+        ax.text(confidence / 2, 0, f"{confidence:.1%}",
+               ha='center', va='center', fontweight='bold', color='white')
+    else:
+        ax.text(confidence + 0.01, 0, f"{confidence:.1%}",
+               ha='left', va='center', fontweight='bold', color=text_color)
+    fig.tight_layout()
+    return fig
+def render_paper_info():
+    """Show the paper summary (wide column) next to its links (narrow column)."""
+    summary_md = """
+        **"From N-grams to Pre-trained Multilingual Models For Language Identification"**
+        *Authors: Thapelo Andrew Sindane, Vukosi Marivate*
+        Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)
+        This research investigates N-gram models and large pre-trained multilingual models for Language Identification
+        across 11 South African languages, showing that Serengeti performs best across all model types.
+        """
+    links_md = """
+        **Links:**
+        - [📖 Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
+        - [🤗 HuggingFace](https://huggingface.co/dsfsi)
+        - [💻 GitHub](https://github.com/dsfsi/za-lid)
+        """
+    st.markdown("### 📄 Research Paper")
+    # 2:1 width split between the summary and the link list.
+    summary_col, links_col = st.columns([2, 1])
+    summary_col.markdown(summary_md)
+    links_col.markdown(links_md)
+def render_citation():
+    """Show the paper's BibTeX entry in a code block."""
+    entry_lines = (
+        '@inproceedings{sindane-marivate-2024-n,',
+        '    title = "From N-grams to Pre-trained Multilingual Models For Language Identification",',
+        '    author = "Sindane, Thapelo Andrew and Marivate, Vukosi",',
+        '    editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",',
+        '    booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",',
+        '    month = nov,',
+        '    year = "2024",',
+        '    address = "Miami, USA",',
+        '    publisher = "Association for Computational Linguistics",',
+        '    url = "https://aclanthology.org/2024.nlp4dh-1.22/",',
+        '    doi = "10.18653/v1/2024.nlp4dh-1.22",',
+        '    pages = "229--239"',
+        '}',
+    )
+    st.code("\n".join(entry_lines), language='bibtex')
+def _clear_single_text() -> None:
+    """Empty the single-text input box (``on_click`` callback of Clear)."""
+    st.session_state["single_text_input"] = ""
+def _read_uploaded_texts(uploaded_file) -> Optional[List[str]]:
+    """Extract the texts to classify from an uploaded ``.txt`` or ``.csv`` file.
+
+    Returns:
+        The list of texts (possibly empty), or ``None`` when a CSV upload has
+        no ``text`` column.
+    """
+    # Rewind defensively in case the buffer was already consumed by an
+    # earlier read of the same upload object.
+    uploaded_file.seek(0)
+    if uploaded_file.name.lower().endswith('.csv'):
+        df = pd.read_csv(uploaded_file)
+        if 'text' not in df.columns:
+            return None
+        # Drop missing cells so they are not classified as the string "nan".
+        return df['text'].dropna().astype(str).tolist()
+    # utf-8-sig strips a leading byte-order mark if the file has one.
+    content = uploaded_file.getvalue().decode('utf-8-sig')
+    return [line.strip() for line in content.split('\n') if line.strip()]
+def _run_bulk_analysis(texts: List[str], model, lang_names: Dict[str, str]) -> None:
+    """Classify every text and render the table, charts and CSV download."""
+    results = []
+    progress_bar = st.progress(0)
+    total = len(texts)
+    for i, text in enumerate(texts, start=1):
+        label, confidence, language_name = predict_language(text, model, lang_names)
+        results.append({
+            # Long texts are shortened to 100 characters for display.
+            'Text': text[:100] + '...' if len(text) > 100 else text,
+            'Language': language_name,
+            'Code': label,
+            'Confidence': confidence
+        })
+        progress_bar.progress(i / total)
+    # Results DataFrame
+    results_df = pd.DataFrame(results)
+    # Display results
+    st.markdown("### 📊 Analysis Results")
+    st.dataframe(results_df, use_container_width=True)
+    # Summary statistics
+    col1, col2 = st.columns(2)
+    with col1:
+        st.markdown("### 📈 Language Distribution")
+        lang_counts = results_df['Language'].value_counts()
+        st.bar_chart(lang_counts)
+    with col2:
+        st.markdown("### 📊 Average Confidence by Language")
+        avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
+        st.bar_chart(avg_conf)
+    # Download button
+    csv_data = results_df.to_csv(index=False)
+    st.download_button(
+        label="📥 Download Results (CSV)",
+        data=csv_data,
+        file_name="language_identification_results.csv",
+        mime="text/csv"
+    )
+def main():
+    """Render the page: header, model sidebar and the three tabs.
+
+    The tabs are "Single Text" (classify one input and plot its confidence),
+    "Bulk Analysis" (classify every line/row of an uploaded file) and
+    "About" (paper information, citation and contact details).
+    """
+    # Header
+    st.markdown("""
+    <div class="main-header">
+        <h1>🇿🇦 South African Language Identification</h1>
+        <p>Multilingual Language Detection for South African Languages</p>
+    </div>
+    """, unsafe_allow_html=True)
+    # Load language names
+    lang_names = load_language_names()
+    # Sidebar
+    with st.sidebar:
+        st.header("⚙️ Model Configuration")
+        # Model selection
+        selected_model = st.selectbox(
+            "Choose Model:",
+            options=list(MODEL_CONFIGS.keys()),
+            format_func=lambda x: f"{'⭐ ' if MODEL_CONFIGS[x].get('recommended') else ''}{MODEL_CONFIGS[x]['name']}",
+            index=0,
+            help="Select the language identification model"
+        )
+        # Model info
+        model_config = MODEL_CONFIGS[selected_model]
+        st.markdown(f"""
+        <div class="model-card">
+            <h4>{model_config['name']}</h4>
+            <p>{model_config['description']}</p>
+        </div>
+        """, unsafe_allow_html=True)
+        # Supported languages
+        st.subheader("📋 Supported Languages")
+        supported_langs = [
+            "🏴󠁺󠁡󠁺󠁡󠁿 Afrikaans", "🇬🇧 English", "🌍 Northern Sotho",
+            "🌍 Sesotho", "🌍 Siswati", "🌍 Setswana",
+            "🌍 Xitsonga", "🌍 Tshivenda", "🌍 isiXhosa",
+            "🌍 isiZulu", "🌍 isiNdebele"
+        ]
+        for lang in supported_langs:
+            st.write(f"• {lang}")
+    # Main content
+    tab1, tab2, tab3 = st.tabs(["🔍 Single Text", "📁 Bulk Analysis", "📄 About"])
+    with tab1:
+        st.header("Single Text Analysis")
+        # Text input. The key exposes the value in st.session_state so the
+        # Clear button's callback can reset it.
+        user_text = st.text_area(
+            "Enter text to identify language:",
+            placeholder="Type or paste your text here...",
+            height=100,
+            help="Enter text in any South African language",
+            key="single_text_input"
+        )
+        col1, col2, _ = st.columns([1, 1, 2])
+        with col1:
+            analyze_button = st.button("🔍 Analyze", type="primary", use_container_width=True)
+        with col2:
+            # A plain rerun keeps widget state, so it never emptied the box;
+            # the callback resets the stored value before the script reruns.
+            st.button("🗑️ Clear", use_container_width=True, on_click=_clear_single_text)
+        if analyze_button and user_text.strip():
+            with st.spinner("Analyzing language..."):
+                # Load model
+                model = load_model(selected_model)
+                if model is not None:
+                    # Predict
+                    label, confidence, language_name = predict_language(user_text, model, lang_names)
+                    # Results
+                    st.markdown("### 📊 Results")
+                    # Metrics
+                    col1, col2, col3 = st.columns(3)
+                    with col1:
+                        st.markdown(f"""
+                        <div class="metric-card">
+                            <h3>{language_name}</h3>
+                            <p>Detected Language</p>
+                        </div>
+                        """, unsafe_allow_html=True)
+                    with col2:
+                        st.markdown(f"""
+                        <div class="metric-card">
+                            <h3>{confidence:.1%}</h3>
+                            <p>Confidence</p>
+                        </div>
+                        """, unsafe_allow_html=True)
+                    with col3:
+                        st.markdown(f"""
+                        <div class="metric-card">
+                            <h3>{label}</h3>
+                            <p>Language Code</p>
+                        </div>
+                        """, unsafe_allow_html=True)
+                    # Confidence visualization
+                    st.markdown("### 📈 Confidence Visualization")
+                    fig = create_confidence_plot(language_name, confidence)
+                    st.pyplot(fig)
+                    # Release the figure once rendered; figures created via
+                    # pyplot otherwise stay alive and pile up across reruns.
+                    plt.close(fig)
+                else:
+                    st.error("Failed to load the model. Please try again.")
+        elif analyze_button:
+            st.warning("Please enter some text to analyze.")
+    with tab2:
+        st.header("Bulk Text Analysis")
+        uploaded_file = st.file_uploader(
+            "Upload a text file",
+            type=['txt', 'csv'],
+            help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column"
+        )
+        if uploaded_file:
+            try:
+                # Read file
+                texts = _read_uploaded_texts(uploaded_file)
+                if texts is None:
+                    # Reported without st.stop(): stopping the script here
+                    # would also keep the About tab below from rendering.
+                    st.error("CSV file must contain a 'text' column")
+                elif not texts:
+                    st.warning("The uploaded file contains no text to analyze.")
+                else:
+                    st.success(f"Loaded {len(texts)} texts for analysis")
+                    if st.button("🚀 Analyze All", type="primary"):
+                        model = load_model(selected_model)
+                        if model is not None:
+                            _run_bulk_analysis(texts, model, lang_names)
+                        else:
+                            st.error("Failed to load the model.")
+            except Exception as e:
+                # UI boundary: show unreadable/undecodable uploads as an error.
+                st.error(f"Error processing file: {str(e)}")
+    with tab3:
+        render_paper_info()
+        st.markdown("---")
+        st.markdown("### 📖 Citation")
+        render_citation()
+        st.markdown("---")
+        st.markdown("""
+        ### 🏛️ Acknowledgments
+        This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.
+        **Contact:**
+        - 📧 Email: [email protected].za
+        - 🐦 Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
+        - 🌐 Website: [dsfsi.github.io](https://dsfsi.github.io)
+        """)
+# Build the page only when this file is run as the entry script, not when
+# it is imported as a module.
+if __name__ == "__main__":
+    main()