vukosi committed on
Commit
590ee2f
·
verified ·
1 Parent(s): 9ac1327

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +440 -443
app.py CHANGED
@@ -1,477 +1,474 @@
1
  # coding=utf-8
2
- # Copyright 2023 The GlotLID Authors.
3
- # Lint as: python3
4
-
5
-
6
- # This space is built based on AMR-KELEG/ALDi space.
7
- # GlotLID Space
8
-
9
- import string
10
- import constants
11
- import pandas as pd
12
  import streamlit as st
13
- from huggingface_hub import hf_hub_download
14
- from GlotScript import get_script_predictor
15
- import matplotlib
16
- from matplotlib import pyplot as plt
17
- import fasttext
18
  import altair as alt
19
- from altair import X, Y, Scale
20
- import base64
 
21
  import json
22
  import os
23
  import re
24
- import transformers
25
- from transformers import pipeline
26
-
27
- @st.cache_resource
28
- def load_sp():
29
- sp = get_script_predictor()
30
- return sp
31
-
32
-
33
- sp = load_sp()
34
-
35
- def get_script(text):
36
- """Get the writing systems of given text.
37
-
38
- Args:
39
- text: The text to be preprocessed.
40
-
41
- Returns:
42
- The main script and list of all scripts.
43
- """
44
- res = sp(text)
45
- main_script = res[0] if res[0] else 'Zyyy'
46
- all_scripts_dict = res[2]['details']
47
- if all_scripts_dict:
48
- all_scripts = list(all_scripts_dict.keys())
49
- else:
50
- all_scripts = 'Zyyy'
51
-
52
- for ws in all_scripts:
53
- if ws in ['Kana', 'Hrkt', 'Hani', 'Hira']:
54
- all_scripts.append('Jpan')
55
-
56
- all_scripts = list(set(all_scripts))
57
- return main_script, all_scripts
58
-
59
-
60
- def preprocess_text(text):
61
- """Apply preprocessing to the given text.
62
- Args:
63
- text: Thetext to be preprocessed.
64
- Returns:
65
- The preprocessed text.
66
- """
67
-
68
- # remove \n
69
- text = text.replace('\n', ' ')
70
-
71
- # get rid of characters that are ubiquitous
72
- replace_by = " "
73
- replacement_map = {
74
- ord(c): replace_by
75
- for c in ':•#{|}' + string.digits
76
  }
77
- text = text.translate(replacement_map)
78
-
79
- # make multiple space one space
80
- text = re.sub(r'\s+', ' ', text)
81
-
82
- # strip the text
83
- text = text.strip()
84
-
85
- return text
86
-
87
-
88
- @st.cache_data
89
- def language_names(json_path):
90
- with open(json_path, 'r') as json_file:
91
- data = json.load(json_file)
92
- return data
93
-
94
- label2name = language_names("assets/language_names.json")
95
-
96
- def get_name(label):
97
- """Get the name of language from label"""
98
- iso_3 = label.split('_')[0]
99
- name = label2name[iso_3]
100
- return name
101
-
102
-
103
- @st.cache_data
104
- def render_svg(svg):
105
- """Renders the given svg string."""
106
- b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
107
- html = rf'<p align="center"> <img src="data:image/svg+xml;base64,{b64}", width="40%"/></p>'
108
- c = st.container()
109
- c.write(html, unsafe_allow_html=True)
110
-
111
-
112
- @st.cache_data
113
- def render_metadata():
114
- """Renders the metadata."""
115
- html = r"""<p align="center">
116
- <a href="https://huggingface.co/dsfsi/za-lid"><img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-8A2BE2"></a>
117
- <a href="https://github.com/dsfsi/za-lid"><img alt="GitHub" src="https://img.shields.io/badge/%F0%9F%93%A6%20GitHub-orange"></a>
118
- <a href="https://github.com/dsfsi/za-lid/blob/master/LICENSE.md"><img alt="GitHub license" src="https://img.shields.io/badge/Github%20Licence-blue"></a>
119
- <a href="https://docs.google.com/forms/d/e/1FAIpQLSf7S36dyAUPx2egmXbFpnTBuzoRulhL5Elu-N1eoMhaO7v10w/viewform" target="_blank"><img alt="Feedback Form" src="https://img.shields.io/badge/Feedback-Form-brightgreen"></a>
120
- <a href="https://arxiv.org/abs/2410.08728" target="_blank"><img alt="arxiv" src="https://img.shields.io/badge/arxiv-2410.08728-blue"></a></p>"""
121
- c = st.container()
122
- c.write(html, unsafe_allow_html=True)
123
-
124
- @st.cache_data
125
- def citation():
126
- """Renders the metadata."""
127
- _CITATION = """
128
- @inproceedings{
129
- kargaran2023glotlid,
130
- title={GlotLID: Language Identification for Low-Resource Languages},
131
- author={Kargaran, Amir Hossein and Imani, Ayyoob and Yvon, Fran{\c{c}}ois and Sch{\"u}tze, Hinrich},
132
- booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
133
- year={2023},
134
- url={https://openreview.net/forum?id=dl4e3EBz5j}
135
- }"""
136
- st.code(_CITATION, language="python", line_numbers=False)
137
-
138
 
 
139
  @st.cache_data
140
- def convert_df(df):
141
- # IMPORTANT: Cache the conversion to prevent computation on every rerun
142
- return df.to_csv(index=None).encode("utf-8")
143
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  @st.cache_resource
146
- def load_model(model_name, file_name):
147
- model_path = hf_hub_download(repo_id=model_name, filename=file_name)
148
- model = fasttext.load_model(model_path)
149
- return model
150
-
151
- @st.cache_resource
152
- def load_model_pipeline(model_name, file_name):
153
- model = pipeline("text-classification", model=model_name)
154
- return model
 
 
 
 
 
155
 
156
-
157
-
158
- # model_1 = load_model(constants.MODEL_NAME, "model_v1.bin")
159
- # model_2 = load_model(constants.MODEL_NAME, "model_v2.bin")
160
- # model_3 = load_model(constants.MODEL_NAME, "model_v3.bin")
161
- # openlid = load_model('laurievb/OpenLID', "model.bin")
162
- # nllb = load_model('facebook/fasttext-language-identification', "model.bin")
163
-
164
-
165
- # MODELS
166
- model_xlmr_large = load_model_pipeline('dsfsi/za-xlmrlarge-lid', "model.bin")
167
- model_serengeti = load_model_pipeline('dsfsi/za-serengeti-lid', "model.bin")
168
- model_afriberta = load_model_pipeline('dsfsi/za-afriberta-lid', "model.bin")
169
- model_afroxlmr_base = load_model_pipeline('dsfsi/za-afro-xlmr-base-lid', "model.bin")
170
- model_afrolm = load_model_pipeline('dsfsi/za-afrolm-lid', "model.bin")
171
- za_lid = load_model_pipeline('dsfsi/za-lid-bert', "model.bin")
172
- openlid = load_model('laurievb/OpenLID', "model.bin")
173
- glotlid_3 = load_model(constants.MODEL_NAME, "model_v3.bin")
174
-
175
-
176
- # @st.cache_resource
177
- def plot(label, prob):
178
-
179
- ORANGE_COLOR = "#FF8000"
180
- BLACK_COLOR = "#31333F"
181
- fig, ax = plt.subplots(figsize=(8, 1))
182
- fig.patch.set_facecolor("none")
183
- ax.set_facecolor("none")
184
-
185
- ax.spines["left"].set_color(BLACK_COLOR)
186
- ax.spines["bottom"].set_color(BLACK_COLOR)
187
- ax.tick_params(axis="x", colors=BLACK_COLOR)
188
-
189
- ax.spines[["right", "top"]].set_visible(False)
190
-
191
- ax.barh(y=[0], width=[prob], color=ORANGE_COLOR)
192
- ax.set_xlim(0, 1)
193
- ax.set_ylim(-1, 1)
194
- ax.set_title(f"Label: {label}, Language: {get_name(label)}", color=BLACK_COLOR)
195
- ax.get_yaxis().set_visible(False)
196
- ax.set_xlabel("Confidence", color=BLACK_COLOR)
197
- st.pyplot(fig)
198
-
199
- # @st.cache_resource
200
- def plot_multiples(models, labels, probs):
201
- ORANGE_COLOR = "#FF8000"
202
- BLACK_COLOR = "#31333F"
203
 
204
- fig, ax = plt.subplots(figsize=(12, len(models)))
205
- fig.patch.set_facecolor("none")
206
- ax.set_facecolor("none")
207
-
208
- ax.spines["left"].set_color(BLACK_COLOR)
209
- ax.spines["bottom"].set_color(BLACK_COLOR)
210
- ax.tick_params(axis="x", colors=BLACK_COLOR)
211
-
212
- ax.spines[["right", "top"]].set_visible(False)
213
-
214
- # Plot bars for each model, label, and probability
215
- y_positions = range(len(models)) # Y positions for each model
216
 
217
- ax.barh(y=y_positions, width=probs, color=ORANGE_COLOR)
 
 
 
 
 
 
 
218
 
219
- # Add labels next to each bar
220
- for i, (prob, label) in enumerate(zip(probs, labels)):
221
- ax.text(prob + 0.01, i, f"{label} ({prob:.2f})", va='center', color=BLACK_COLOR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
- # Set y-ticks and labels
224
- ax.set_yticks(y_positions)
225
- ax.set_yticklabels(models, color=BLACK_COLOR)
226
 
 
 
 
 
 
 
 
 
 
 
227
  ax.set_xlim(0, 1)
228
- ax.set_xlabel("Confidence", color=BLACK_COLOR)
229
- ax.set_title("Model Predictions", color=BLACK_COLOR)
230
-
231
- st.pyplot(fig)
232
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- def compute(sentences, version = 'v3'):
235
- """Computes the language probablities and labels for the given sentences.
236
-
237
- Args:
238
- sentences: A list of sentences.
239
-
240
- Returns:
241
- A list of language probablities and labels for the given sentences.
242
- """
243
- progress_text = "Computing Language..."
244
-
245
- if version == 'xlmrlarge':
246
- model_choice = model_xlmr_large
247
- elif version == 'serengeti':
248
- model_choice = model_serengeti
249
- elif version == 'afriberta':
250
- model_choice = model_afriberta
251
- elif version == 'afroxlmrbase':
252
- model_choice = model_afroxlmr_base
253
- elif version=='afrolm':
254
- model_choice = model_afrolm
255
- elif version == 'BERT':
256
- model_choice = za_lid
257
- elif version == 'openlid-201':
258
- model_choice = openlid
259
- elif version == 'GlotLID v3':
260
- model_choice = glotlid_3
261
- else:
262
- model_choice = [(model_xlmr_large, "xlmrlarge"),(model_serengeti,"serengeti"), (model_afriberta,"afriberta"), (model_afroxlmr_base,"afroxlmrbase"), (model_afrolm,"afrolm"), (za_lid,"BERT"), (openlid,"openlid-201"), (glotlid_3,"GlotLID v3")]
263
 
264
- my_bar = st.progress(0, text=progress_text)
265
-
266
- probs = []
267
- labels = []
268
-
269
- sentences = [preprocess_text(sent) for sent in sentences]
 
270
 
271
- for index, sent in enumerate(sentences):
272
- if type(model_choice) == list:
273
- all_models_pred = []
274
- for model_version in model_choice:
275
- m_version = model_version[1]
276
- model = model_version[0]
277
- if m_version not in ["openlid-201", "GlotLID v3"]:
278
- output = model.predict(sent)
279
- output_label = output[index]['label']
280
- output_prob = output[index]['score']
281
- output_label_language = output[index]['label']
282
- labels = labels + [output_label]
283
- probs = probs + [output_prob]
284
-
285
- my_bar.progress(
286
- min((index) / len(sentences), 1),
287
- text=progress_text,
288
- )
289
- else:
290
- output = model.predict(sent)
291
- output_label = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
292
- output_prob = max(min(output[1][0], 1), 0)
293
- output_label_language = output_label.split('_')[0]
294
-
295
- # script control
296
- if version in ['GlotLID v3', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
297
- main_script, all_scripts = get_script(sent)
298
- output_label_script = output_label.split('_')[1]
299
-
300
- if output_label_script not in all_scripts:
301
- output_label_script = main_script
302
- output_label = f"und_{output_label_script}"
303
- output_prob = 0
304
-
305
-
306
- labels = labels + [output_label]
307
- probs = probs + [output_prob]
308
-
309
- my_bar.progress(
310
- min((index) / len(sentences), 1),
311
- text=progress_text,
312
- )
313
-
314
- else:
315
- if version not in ["openlid-201", "GlotLID v3"]:
316
- output = model_choice.predict(sent)
317
- output_label = output[index]['label']
318
- output_prob = output[index]['score']
319
- output_label_language = output[index]['label']
320
- labels = labels + [output_label]
321
- probs = probs + [output_prob]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
- my_bar.progress(
324
- min((index) / len(sentences), 1),
325
- text=progress_text,
326
- )
327
- else:
328
- output = model_choice.predict(sent)
329
- output_label = output[0][0].split('__')[-1].replace('_Hans', '_Hani').replace('_Hant', '_Hani')
330
- output_prob = max(min(output[1][0], 1), 0)
331
- output_label_language = output_label.split('_')[0]
332
 
333
- # script control
334
- if version in ['GlotLID v3', 'openlid-201', 'nllb-218'] and output_label_language!= 'zxx':
335
- main_script, all_scripts = get_script(sent)
336
- output_label_script = output_label.split('_')[1]
337
 
338
- if output_label_script not in all_scripts:
339
- output_label_script = main_script
340
- output_label = f"und_{output_label_script}"
341
- output_prob = 0
 
 
 
342
 
343
-
344
- labels = labels + [output_label]
345
- probs = probs + [output_prob]
 
 
 
 
346
 
347
- my_bar.progress(
348
- min((index) / len(sentences), 1),
349
- text=progress_text,
350
- )
351
- my_bar.empty()
352
- return probs, labels
353
-
354
- # st.markdown("[![Duplicate Space](https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14)](https://huggingface.co/spaces/cis-lmu/glotlid-space?duplicate=true)")
355
-
356
- # render_svg(open("assets/glotlid_logo.svg").read())
357
-
358
- render_metadata()
359
-
360
- img1, img2, img3 = st.columns(3)
361
- with img2:
362
- with st.container():
363
- st.image("logo_transparent_small.png")
364
 
365
- st.markdown("**DSFSI** Language Identification (LID) Inference Endpoint Created with **HuggingFace Spaces**.")
366
-
367
- with st.expander("More information about the space"):
368
- st.write('''
369
- Authors: Thapelo Sindane, Vukosi Marivate
370
- ''')
371
-
372
-
373
- tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
374
-
375
- with tab1:
376
 
377
- # choice = st.radio(
378
- # "Set granularity level",
379
- # ["default", "merge", "individual"],
380
- # captions=["enable both macrolanguage and its varieties (default)", "merge macrolanguage and its varieties into one label", "remove macrolanguages - only shows individual langauges"],
381
- # )
382
-
383
- version = st.radio(
384
- "Choose model",
385
- ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "openlid-201", "GlotLID v3", "All-Models"],
386
- captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "OpenLID", "GlotLID v3",'All-Models'],
387
- index = 4,
388
- key = 'version_tab1',
389
- horizontal = True
390
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
 
392
- sent = st.text_input(
393
- "Sentence:", placeholder="Enter a sentence.", on_change=None
394
- )
395
-
396
- # TODO: Check if this is needed!
397
-
398
- clicked = st.button("Submit")
399
-
400
- if sent:
401
 
402
- probs, labels = compute([sent], version=version)
403
- prob = probs[0]
404
- label = labels[0]
405
-
406
 
407
- # Check if the file exists
408
- if not os.path.exists('logs.txt'):
409
- with open('logs.txt', 'w') as file:
410
- pass
411
-
412
- print(f"{sent}, {label}: {prob}")
413
- with open("logs.txt", "a") as f:
414
- f.write(f"{sent}, {label}: {prob}\n")
415
 
416
- # plot
417
- if version == "All-Models":
418
- plot_multiples(["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT", "OpenLID", "GlotLID v3"], labels, probs)
419
- else:
420
- plot(label, prob)
421
 
422
-
423
- with tab2:
424
-
425
- version = st.radio(
426
- "Choose model",
427
- ["xlmrlarge", "serengeti", "afriberta", "afroxlmrbase", "afrolm", "BERT","openlid-201", "GlotLID v3", "All-Models"],
428
- captions=["za-XLMR-Large", "za-Serengeti", "za-AfriBERTa", "za-Afro-XLMR-BASE", "za-AfroLM", "za-BERT", "OpenLID", "GlotLID v3", "All-Models"],
429
- index = 4,
430
- key = 'version_tab2',
431
- horizontal = True
432
- )
433
-
434
- file = st.file_uploader("Upload a file", type=["txt"])
435
- if file is not None:
436
- df = pd.read_csv(file, sep="¦\t¦", header=None, engine='python')
437
- df.columns = ["Sentence"]
438
- df.reset_index(drop=True, inplace=True)
439
-
440
- # TODO: Run the model
441
- df['Prob'], df["Label"] = compute(df["Sentence"].tolist(), version= version)
442
- df['Language'] = df["Label"].apply(get_name)
443
-
444
- # A horizontal rule
445
- st.markdown("""---""")
446
-
447
- chart = (
448
- alt.Chart(df.reset_index())
449
- .mark_area(color="darkorange", opacity=0.5)
450
- .encode(
451
- x=X(field="index", title="Sentence Index"),
452
- y=Y("Prob", scale=Scale(domain=[0, 1])),
453
- )
454
- )
455
- st.altair_chart(chart.interactive(), use_container_width=True)
456
-
457
- col1, col2 = st.columns([4, 1])
458
-
459
- with col1:
460
- # Display the output
461
- st.table(
462
- df,
463
- )
464
-
465
- with col2:
466
- # Add a download button
467
- csv = convert_df(df)
468
- st.download_button(
469
- label=":file_folder: Download predictions as CSV",
470
- data=csv,
471
- file_name="GlotLID.csv",
472
- mime="text/csv",
473
- )
474
-
475
-
476
-
477
- # citation()
 
1
  # coding=utf-8
 
 
 
 
 
 
 
 
 
 
2
  import streamlit as st
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
 
 
 
5
  import altair as alt
6
+ from transformers import pipeline
7
+ import fasttext
8
+ from huggingface_hub import hf_hub_download
9
  import json
10
  import os
11
  import re
12
+ import string
13
+ import base64
14
+ from typing import List, Tuple, Dict, Optional
15
+ import logging
16
+
17
+ # Configure page
18
+ st.set_page_config(
19
+ page_title="South African Language Identification",
20
+ page_icon="🇿🇦",
21
+ layout="wide",
22
+ initial_sidebar_state="expanded"
23
+ )
24
+
25
+ # Custom CSS for better styling
26
+ st.markdown("""
27
+ <style>
28
+ .main-header {
29
+ text-align: center;
30
+ padding: 1rem 0;
31
+ background: linear-gradient(90deg, #ff6b35, #f7931e);
32
+ color: white;
33
+ border-radius: 10px;
34
+ margin-bottom: 2rem;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  }
36
+ .model-card {
37
+ background: #f8f9fa;
38
+ padding: 1rem;
39
+ border-radius: 8px;
40
+ border-left: 4px solid #ff6b35;
41
+ margin: 1rem 0;
42
+ }
43
+ .result-container {
44
+ background: white;
45
+ padding: 1.5rem;
46
+ border-radius: 10px;
47
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
48
+ margin: 1rem 0;
49
+ }
50
+ .metric-card {
51
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
52
+ color: white;
53
+ padding: 1rem;
54
+ border-radius: 8px;
55
+ text-align: center;
56
+ }
57
+ </style>
58
+ """, unsafe_allow_html=True)
59
+
60
+ # Constants and Configuration
61
+ MODEL_CONFIGS = {
62
+ "za-bert": {
63
+ "name": "ZA-BERT",
64
+ "model_id": "dsfsi/za-lid-bert",
65
+ "description": "Lightweight BERT-based model trained on South African languages",
66
+ "recommended": True
67
+ },
68
+ "xlmr-large": {
69
+ "name": "XLM-R Large",
70
+ "model_id": "dsfsi/za-xlmrlarge-lid",
71
+ "description": "XLM-RoBERTa Large model fine-tuned for SA languages"
72
+ },
73
+ "serengeti": {
74
+ "name": "Serengeti",
75
+ "model_id": "dsfsi/za-serengeti-lid",
76
+ "description": "Afri-centric model with superior performance"
77
+ },
78
+ "afriberta": {
79
+ "name": "AfriBERTa",
80
+ "model_id": "dsfsi/za-afriberta-lid",
81
+ "description": "African-focused BERT model"
82
+ },
83
+ "afro-xlmr": {
84
+ "name": "Afro-XLM-R",
85
+ "model_id": "dsfsi/za-afro-xlmr-base-lid",
86
+ "description": "African-centric XLM-RoBERTa model"
87
+ },
88
+ "afrolm": {
89
+ "name": "AfroLM",
90
+ "model_id": "dsfsi/za-afrolm-lid",
91
+ "description": "African language model"
92
+ }
93
+ }
 
 
 
94
 
95
+ # Utility Functions
96
  @st.cache_data
97
+ def load_language_names() -> Dict[str, str]:
98
+ """Load language names mapping"""
99
+ try:
100
+ with open("assets/language_names.json", 'r') as f:
101
+ return json.load(f)
102
+ except FileNotFoundError:
103
+ # Fallback mapping for common South African languages
104
+ return {
105
+ "afr": "Afrikaans",
106
+ "eng": "English",
107
+ "nso": "Northern Sotho",
108
+ "sot": "Sesotho",
109
+ "ssw": "Siswati",
110
+ "tsn": "Setswana",
111
+ "tso": "Xitsonga",
112
+ "ven": "Tshivenda",
113
+ "xho": "isiXhosa",
114
+ "zul": "isiZulu",
115
+ "nbl": "isiNdebele",
116
+ "und": "Undetermined"
117
+ }
118
 
119
  @st.cache_resource
120
+ def load_model(model_key: str):
121
+ """Load and cache models"""
122
+ try:
123
+ config = MODEL_CONFIGS[model_key]
124
+ model = pipeline("text-classification", model=config["model_id"])
125
+ return model
126
+ except Exception as e:
127
+ st.error(f"Error loading model {model_key}: {str(e)}")
128
+ return None
129
+
130
+ def preprocess_text(text: str) -> str:
131
+ """Clean and preprocess input text"""
132
+ if not text or not text.strip():
133
+ return ""
134
 
135
+ # Basic cleaning
136
+ text = text.replace('\n', ' ')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
+ # Remove problematic characters
139
+ replacement_map = {ord(c): ' ' for c in ':•#{|}' + string.digits}
140
+ text = text.translate(replacement_map)
141
+
142
+ # Normalize whitespace
143
+ text = re.sub(r'\s+', ' ', text).strip()
144
+
145
+ return text
 
 
 
 
146
 
147
+ def get_language_name(label: str, lang_names: Dict[str, str]) -> str:
148
+ """Get language name from label"""
149
+ if '_' in label:
150
+ iso_code = label.split('_')[0]
151
+ else:
152
+ iso_code = label
153
+
154
+ return lang_names.get(iso_code, label)
155
 
156
+ def predict_language(text: str, model, lang_names: Dict[str, str]) -> Tuple[str, float, str]:
157
+ """Predict language for given text"""
158
+ if not model or not text.strip():
159
+ return "und", 0.0, "Undetermined"
160
+
161
+ try:
162
+ processed_text = preprocess_text(text)
163
+ if not processed_text:
164
+ return "und", 0.0, "Undetermined"
165
+
166
+ result = model(processed_text)
167
+ if isinstance(result, list) and len(result) > 0:
168
+ prediction = result[0]
169
+ label = prediction['label']
170
+ confidence = prediction['score']
171
+ language_name = get_language_name(label, lang_names)
172
+ return label, confidence, language_name
173
+
174
+ return "und", 0.0, "Undetermined"
175
+
176
+ except Exception as e:
177
+ st.error(f"Prediction error: {str(e)}")
178
+ return "und", 0.0, "Error"
179
 
180
+ def create_confidence_plot(language: str, confidence: float) -> plt.Figure:
181
+ """Create a confidence visualization"""
182
+ fig, ax = plt.subplots(figsize=(10, 2))
183
 
184
+ # Colors
185
+ primary_color = "#ff6b35"
186
+ bg_color = "#f8f9fa"
187
+ text_color = "#2c3e50"
188
+
189
+ # Create horizontal bar
190
+ ax.barh([0], [confidence], color=primary_color, height=0.6, alpha=0.8)
191
+ ax.barh([0], [1-confidence], left=[confidence], color=bg_color, height=0.6, alpha=0.3)
192
+
193
+ # Styling
194
  ax.set_xlim(0, 1)
195
+ ax.set_ylim(-0.5, 0.5)
196
+ ax.set_xlabel("Confidence Score", fontsize=12, color=text_color)
197
+ ax.set_title(f"Language: {language} (Confidence: {confidence:.3f})",
198
+ fontsize=14, fontweight='bold', color=text_color, pad=20)
199
 
200
+ # Remove y-axis and spines
201
+ ax.set_yticks([])
202
+ ax.spines['top'].set_visible(False)
203
+ ax.spines['right'].set_visible(False)
204
+ ax.spines['left'].set_visible(False)
205
+
206
+ # Add confidence text
207
+ ax.text(confidence/2, 0, f"{confidence:.1%}",
208
+ ha='center', va='center', fontweight='bold', color='white')
209
+
210
+ plt.tight_layout()
211
+ return fig
212
 
213
+ def render_paper_info():
214
+ """Render paper information and citation"""
215
+ st.markdown("### 📄 Research Paper")
216
+
217
+ col1, col2 = st.columns([2, 1])
218
+
219
+ with col1:
220
+ st.markdown("""
221
+ **"From N-grams to Pre-trained Multilingual Models For Language Identification"**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
+ *Authors: Thapelo Andrew Sindane, Vukosi Marivate*
224
+
225
+ Published in: Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities (2024)
226
+
227
+ This research investigates N-gram models and large pre-trained multilingual models for Language Identification
228
+ across 11 South African languages, showing that Serengeti performs best across all model types.
229
+ """)
230
 
231
+ with col2:
232
+ st.markdown("""
233
+ **Links:**
234
+ - [📖 Paper](https://aclanthology.org/2024.nlp4dh-1.22/)
235
+ - [🤗 HuggingFace](https://huggingface.co/dsfsi)
236
+ - [💻 GitHub](https://github.com/dsfsi/za-lid)
237
+ """)
238
+
239
+ def render_citation():
240
+ """Render BibTeX citation"""
241
+ citation = """@inproceedings{sindane-marivate-2024-n,
242
+ title = "From N-grams to Pre-trained Multilingual Models For Language Identification",
243
+ author = "Sindane, Thapelo Andrew and Marivate, Vukosi",
244
+ editor = "Hämäläinen, Mika and Öhman, Emily and Miyagawa, So and Alnajjar, Khalid and Bizzoni, Yuri",
245
+ booktitle = "Proceedings of the 4th International Conference on Natural Language Processing for Digital Humanities",
246
+ month = nov,
247
+ year = "2024",
248
+ address = "Miami, USA",
249
+ publisher = "Association for Computational Linguistics",
250
+ url = "https://aclanthology.org/2024.nlp4dh-1.22/",
251
+ doi = "10.18653/v1/2024.nlp4dh-1.22",
252
+ pages = "229--239"
253
+ }"""
254
+
255
+ st.code(citation, language='bibtex')
256
+
257
+ def main():
258
+ # Header
259
+ st.markdown("""
260
+ <div class="main-header">
261
+ <h1>🇿🇦 South African Language Identification</h1>
262
+ <p>Multilingual Language Detection for South African Languages</p>
263
+ </div>
264
+ """, unsafe_allow_html=True)
265
+
266
+ # Load language names
267
+ lang_names = load_language_names()
268
+
269
+ # Sidebar
270
+ with st.sidebar:
271
+ st.header("⚙️ Model Configuration")
272
+
273
+ # Model selection
274
+ selected_model = st.selectbox(
275
+ "Choose Model:",
276
+ options=list(MODEL_CONFIGS.keys()),
277
+ format_func=lambda x: f"{'⭐ ' if MODEL_CONFIGS[x].get('recommended') else ''}{MODEL_CONFIGS[x]['name']}",
278
+ index=0,
279
+ help="Select the language identification model"
280
+ )
281
+
282
+ # Model info
283
+ model_config = MODEL_CONFIGS[selected_model]
284
+ st.markdown(f"""
285
+ <div class="model-card">
286
+ <h4>{model_config['name']}</h4>
287
+ <p>{model_config['description']}</p>
288
+ </div>
289
+ """, unsafe_allow_html=True)
290
+
291
+ # Supported languages
292
+ st.subheader("📋 Supported Languages")
293
+ supported_langs = [
294
+ "🏴󠁺󠁑󠁺󠁑󠁿 Afrikaans", "πŸ‡¬πŸ‡§ English", "🌍 Northern Sotho",
295
+ "🌍 Sesotho", "🌍 Siswati", "🌍 Setswana",
296
+ "🌍 Xitsonga", "🌍 Tshivenda", "🌍 isiXhosa",
297
+ "🌍 isiZulu", "🌍 isiNdebele"
298
+ ]
299
+ for lang in supported_langs:
300
+ st.write(f"• {lang}")
301
+
302
+ # Main content
303
+ tab1, tab2, tab3 = st.tabs(["🔍 Single Text", "📁 Bulk Analysis", "📄 About"])
304
+
305
+ with tab1:
306
+ st.header("Single Text Analysis")
307
+
308
+ # Text input
309
+ user_text = st.text_area(
310
+ "Enter text to identify language:",
311
+ placeholder="Type or paste your text here...",
312
+ height=100,
313
+ help="Enter text in any South African language"
314
+ )
315
+
316
+ col1, col2, col3 = st.columns([1, 1, 2])
317
+
318
+ with col1:
319
+ analyze_button = st.button("🔍 Analyze", type="primary", use_container_width=True)
320
+
321
+ with col2:
322
+ clear_button = st.button("🗑️ Clear", use_container_width=True)
323
+ if clear_button:
324
+ st.rerun()
325
+
326
+ if analyze_button and user_text.strip():
327
+ with st.spinner("Analyzing language..."):
328
+ # Load model
329
+ model = load_model(selected_model)
330
+
331
+ if model:
332
+ # Predict
333
+ label, confidence, language_name = predict_language(user_text, model, lang_names)
334
 
335
+ # Results
336
+ st.markdown("### 📊 Results")
 
 
 
 
 
 
 
337
 
338
+ # Metrics
339
+ col1, col2, col3 = st.columns(3)
 
 
340
 
341
+ with col1:
342
+ st.markdown(f"""
343
+ <div class="metric-card">
344
+ <h3>{language_name}</h3>
345
+ <p>Detected Language</p>
346
+ </div>
347
+ """, unsafe_allow_html=True)
348
 
349
+ with col2:
350
+ st.markdown(f"""
351
+ <div class="metric-card">
352
+ <h3>{confidence:.1%}</h3>
353
+ <p>Confidence</p>
354
+ </div>
355
+ """, unsafe_allow_html=True)
356
 
357
+ with col3:
358
+ st.markdown(f"""
359
+ <div class="metric-card">
360
+ <h3>{label}</h3>
361
+ <p>Language Code</p>
362
+ </div>
363
+ """, unsafe_allow_html=True)
364
+
365
+ # Confidence visualization
366
+ st.markdown("### 📈 Confidence Visualization")
367
+ fig = create_confidence_plot(language_name, confidence)
368
+ st.pyplot(fig)
369
+
370
+ else:
371
+ st.error("Failed to load the model. Please try again.")
 
 
372
 
373
+ elif analyze_button:
374
+ st.warning("Please enter some text to analyze.")
 
 
 
 
 
 
 
 
 
375
 
376
+ with tab2:
377
+ st.header("Bulk Text Analysis")
378
+
379
+ uploaded_file = st.file_uploader(
380
+ "Upload a text file",
381
+ type=['txt', 'csv'],
382
+ help="Upload a .txt file with one sentence per line, or a CSV file with a 'text' column"
383
+ )
384
+
385
+ if uploaded_file:
386
+ try:
387
+ # Read file
388
+ if uploaded_file.name.endswith('.csv'):
389
+ df = pd.read_csv(uploaded_file)
390
+ if 'text' not in df.columns:
391
+ st.error("CSV file must contain a 'text' column")
392
+ st.stop()
393
+ texts = df['text'].astype(str).tolist()
394
+ else:
395
+ content = uploaded_file.read().decode('utf-8')
396
+ texts = [line.strip() for line in content.split('\n') if line.strip()]
397
+
398
+ st.success(f"Loaded {len(texts)} texts for analysis")
399
+
400
+ if st.button("🚀 Analyze All", type="primary"):
401
+ model = load_model(selected_model)
402
+
403
+ if model:
404
+ results = []
405
+ progress_bar = st.progress(0)
406
+
407
+ for i, text in enumerate(texts):
408
+ label, confidence, language_name = predict_language(text, model, lang_names)
409
+ results.append({
410
+ 'Text': text[:100] + '...' if len(text) > 100 else text,
411
+ 'Language': language_name,
412
+ 'Code': label,
413
+ 'Confidence': confidence
414
+ })
415
+ progress_bar.progress((i + 1) / len(texts))
416
+
417
+ # Results DataFrame
418
+ results_df = pd.DataFrame(results)
419
+
420
+ # Display results
421
+ st.markdown("### 📊 Analysis Results")
422
+ st.dataframe(results_df, use_container_width=True)
423
+
424
+ # Summary statistics
425
+ col1, col2 = st.columns(2)
426
+
427
+ with col1:
428
+ st.markdown("### 📈 Language Distribution")
429
+ lang_counts = results_df['Language'].value_counts()
430
+ st.bar_chart(lang_counts)
431
+
432
+ with col2:
433
+ st.markdown("### 📊 Average Confidence by Language")
434
+ avg_conf = results_df.groupby('Language')['Confidence'].mean().sort_values(ascending=False)
435
+ st.bar_chart(avg_conf)
436
+
437
+ # Download button
438
+ csv_data = results_df.to_csv(index=False)
439
+ st.download_button(
440
+ label="📥 Download Results (CSV)",
441
+ data=csv_data,
442
+ file_name="language_identification_results.csv",
443
+ mime="text/csv"
444
+ )
445
+
446
+ else:
447
+ st.error("Failed to load the model.")
448
+
449
+ except Exception as e:
450
+ st.error(f"Error processing file: {str(e)}")
451
 
452
+ with tab3:
453
+ render_paper_info()
 
 
 
 
 
 
 
454
 
455
+ st.markdown("---")
 
 
 
456
 
457
+ st.markdown("### 📖 Citation")
458
+ render_citation()
 
 
 
 
 
 
459
 
460
+ st.markdown("---")
 
 
 
 
461
 
462
+ st.markdown("""
463
+ ### 🏛️ Acknowledgments
464
+
465
+ This work is part of the Data Science for Social Impact Research Group at the University of Pretoria.
466
+
467
+ **Contact:**
468
+ - 📧 Email: [email protected].za
469
+ - 🐦 Twitter: [@VukosiiM](https://twitter.com/VukosiiM)
470
+ - 🌐 Website: [dsfsi.github.io](https://dsfsi.github.io)
471
+ """)
472
+
473
+ if __name__ == "__main__":
474
+ main()