abdullahmubeen10 committed on
Commit
c8cb717
·
verified ·
1 Parent(s): e11fca5

Update Demo.py

Browse files
Files changed (1) hide show
  1. Demo.py +556 -553
Demo.py CHANGED
@@ -1,553 +1,556 @@
1
- import streamlit as st
2
- import sparknlp
3
- import os
4
- import pandas as pd
5
-
6
- from sparknlp.base import *
7
- from sparknlp.annotator import *
8
- from pyspark.ml import Pipeline
9
- from sparknlp.pretrained import PretrainedPipeline
10
- from annotated_text import annotated_text
11
-
12
- # Page configuration
13
- st.set_page_config(
14
- layout="wide",
15
- page_title="Spark NLP Demos App",
16
- initial_sidebar_state="auto"
17
- )
18
-
19
- # CSS for styling
20
- st.markdown("""
21
- <style>
22
- .main-title {
23
- font-size: 36px;
24
- color: #4A90E2;
25
- font-weight: bold;
26
- text-align: center;
27
- }
28
- .section p, .section ul {
29
- color: #666666;
30
- }
31
- .stTable {
32
- margin-left: auto;
33
- margin-right: auto;
34
- }
35
- </style>
36
- """, unsafe_allow_html=True)
37
-
38
- @st.cache_resource
39
- def init_spark():
40
- return sparknlp.start()
41
-
42
- @st.cache_resource
43
- def create_pipeline(model):
44
- documentAssembler = DocumentAssembler()\
45
- .setInputCol("text")\
46
- .setOutputCol("document")
47
-
48
- sentence_detector = SentenceDetector() \
49
- .setInputCols(["document"]) \
50
- .setOutputCol("sentence")
51
-
52
- languageDetector = LanguageDetectorDL.pretrained(model)\
53
- .setInputCols("sentence")\
54
- .setOutputCol("language")\
55
- .setThreshold(0.5)\
56
- .setCoalesceSentences(True)
57
-
58
- nlpPipeline = Pipeline(
59
- stages=[
60
- documentAssembler,
61
- sentence_detector,
62
- languageDetector])
63
-
64
- return nlpPipeline
65
-
66
- def fit_data(pipeline, data):
67
- empty_df = spark.createDataFrame([['']]).toDF('text')
68
- pipeline_model = pipeline.fit(empty_df)
69
- model = LightPipeline(pipeline_model)
70
- results = model.fullAnnotate(data)[0]
71
-
72
- return results
73
-
74
- # Set up the page layout
75
- st.markdown('<div class="main-title">State-Of-The-Art Language Detection With Spark NLP</div>', unsafe_allow_html=True)
76
- st.subheader('Support for 375 different languages')
77
-
78
- # Sidebar content
79
- model = st.sidebar.selectbox(
80
- "Choose the pretrained model",
81
- ["ld_wiki_tatoeba_cnn_375"],
82
- help="For more info about the models visit: https://sparknlp.org/models"
83
- )
84
-
85
- with st.expander("View Supported Languges"):
86
- st.write("Abkhaz, Iraqi Arabic, Adyghe, Afrikaans, Gulf Arabic, Afrihili, Assyrian Neo-Aramaic, Ainu, Aklanon, Gheg Albanian, Amharic, Aragonese, Old English, Uab Meto, North Levantine Arabic, Arabic, Algerian Arabic, Moroccan Arabic, Egyptian Arabic, Assamese, Asturian, Kotava, Awadhi, Aymara, Azerbaijani, Bashkir, Baluchi, Balinese, Bavarian, Central Bikol, Belarusian, Berber, Bulgarian, Bhojpuri, Bislama, Banjar, Bambara, Bengali, Tibetan, Breton, Bodo, Bosnian, Buryat, Baybayanon, Brithenig, Catalan, Cayuga, Chavacano, Chechen, Cebuano, Chamorro, Chagatai, Chinook Jargon, Choctaw, Cherokee, Jin Chinese, Chukchi, Central Mnong, Corsican, Chinese Pidgin English, Crimean Tatar, Seychellois Creole, Czech, Kashubian, Chuvash, Welsh, CycL, Cuyonon, Danish, German, Dungan, Drents, Lower Sorbian, Central Dusun, Dhivehi, Dutton World Speedwords, Ewe, Emilian, Greek, Erromintxela, English, Middle English, Esperanto, Spanish, Estonian, Basque, Evenki, Extremaduran, Persian, Finnish, Fijian, Kven Finnish, Faroese, French, Middle French, Old French, North Frisian, Pulaar, Friulian, Nigerian Fulfulde, Frisian, Irish, Ga, Gagauz, Gan Chinese, Garhwali, Guadeloupean Creole French, Scottish Gaelic, Gilbertese, Galician, Guarani, Konkani (Goan), Gronings, Gothic, Ancient Greek, Swiss German, Gujarati, Manx, Hausa, Hakka Chinese, Hawaiian, Ancient Hebrew, Hebrew, Hindi, Fiji Hindi, Hiligaynon, Hmong Njua (Green), Ho, Croatian, Hunsrik, Upper Sorbian, Xiang Chinese, Haitian Creole, Hungarian, Armenian, Interlingua, Iban, Indonesian, Interlingue, Igbo, Nuosu, Inuktitut, Ilocano, Ido, Icelandic, Italian, Ingrian, Japanese, Jamaican Patois, Lojban, Juhuri (Judeo-Tat), Jewish Palestinian Aramaic, Javanese, Georgian, Karakalpak, Kabyle, Kamba, Kekchi (Q'eqchi'), Khasi, Khakas, Kazakh, Greenlandic, Khmer, Kannada, Korean, Komi-Permyak, Komi-Zyrian, Karachay-Balkar, Karelian, Kashmiri, Kölsch, Kurdish, Kumyk, Cornish, Keningau Murut, Kyrgyz, Coastal Kadazan, Latin, Southern Subanen, 
Ladino, Luxembourgish, Láadan, Lingua Franca Nova, Luganda, Ligurian, Livonian, Lakota, Ladin, Lombard, Lingala, Lao, Louisiana Creole, Lithuanian, Latgalian, Latvian, Latvian, Literary Chinese, Laz, Madurese, Maithili, North Moluccan Malay, Moksha, Morisyen, Malagasy, Mambae, Marshallese, Meadow Mari, Maori, Mi'kmaq, Minangkabau, Macedonian, Malayalam, Mongolian, Manchu, Mon, Mohawk, Marathi, Hill Mari, Malay, Maltese, Tagal Murut, Mirandese, Hmong Daw (White), Burmese, Erzya, Nauruan, Nahuatl, Norwegian Bokmål, Central Huasteca Nahuatl, Low German (Low Saxon), Nepali, Newari, Ngeq, Guerrero Nahuatl, Niuean, Dutch, Orizaba Nahuatl, Norwegian Nynorsk, Norwegian, Nogai, Old Norse, Novial, Nepali, Naga (Tangshang), Navajo, Chinyanja, Nyungar, Old Aramaic, Occitan, Ojibwe, Odia (Oriya), Old East Slavic, Ossetian, Old Spanish, Old Saxon, Ottoman Turkish, Old Turkish, Punjabi (Eastern), Pangasinan, Kapampangan, Papiamento, Palauan, Picard, Pennsylvania German, Palatine German, Phoenician, Pali, Polish, Piedmontese, Punjabi (Western), Pipil, Old Prussian, Pashto, Portuguese, Quechua, K'iche', Quenya, Rapa Nui, Rendille, Tarifit, Romansh, Kirundi, Romanian, Romani, Russian, Rusyn, Kinyarwanda, Okinawan, Sanskrit, Yakut, Sardinian, Sicilian, Scots, Sindhi, Northern Sami, Sango, Samogitian, Shuswap, Tachawit, Sinhala, Sindarin, Slovak, Slovenian, Samoan, Southern Sami, Shona, Somali, Albanian, Serbian, Swazi, Southern Sotho, Saterland Frisian, Sundanese, Sumerian, Swedish, Swahili, Swabian, Swahili, Syriac, Tamil, Telugu, Tetun, Tajik, Thai, Tahaggart Tamahaq, Tigrinya, Tigre, Turkmen, Tokelauan, Tagalog, Klingon, Talysh, Jewish Babylonian Aramaic, Temuan, Setswana, Tongan, Tonga (Zambezi), Toki Pona, Tok Pisin, Old Tupi, Turkish, Tsonga, Tatar, Isan, Tuvaluan, Tahitian, Tuvinian, Talossan, Udmurt, Uyghur, Ukrainian, Umbundu, Urdu, Urhobo, Uzbek, Venetian, Veps, Vietnamese, Volapük, Võro, Walloon, Waray, Wolof, Shanghainese, Kalmyk, Xhosa, Mingrelian, Yiddish, Yoruba, 
Cantonese, Chinese, Malay (Vernacular), Malay, Zulu, and Zaza.")
87
-
88
- # Reference notebook link in sidebar
89
- link = """
90
- <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Language_Detector.ipynb">
91
- <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
92
- </a>
93
- """
94
- st.sidebar.markdown('Reference notebook:')
95
- st.sidebar.markdown(link, unsafe_allow_html=True)
96
-
97
- # Load examples
98
- folder_path = f"inputs/{model}"
99
- examples = [
100
- lines[1].strip()
101
- for filename in os.listdir(folder_path)
102
- if filename.endswith('.txt')
103
- for lines in [open(os.path.join(folder_path, filename), 'r', encoding='utf-8').readlines()]
104
- if len(lines) >= 2
105
- ]
106
-
107
- selected_text = st.selectbox("Select a sample text", examples)
108
- custom_input = st.text_input("Try it for yourself!")
109
-
110
- if custom_input:
111
- selected_text = custom_input
112
- elif selected_text:
113
- selected_text = selected_text
114
-
115
- st.subheader('Selected Text')
116
- st.markdown("""<div class="section">{selected_text}</div>""", unsafe_allow_html=True)
117
-
118
- # Initialize Spark and create pipeline
119
- spark = init_spark()
120
- pipeline = create_pipeline(model)
121
- output = fit_data(pipeline, selected_text)
122
-
123
- # Display output
124
- language_map = {
125
- 'ab': "Abkhaz",
126
- 'ace': "Achinese",
127
- 'acm': "Iraqi Arabic",
128
- 'ady': "Adyghe",
129
- 'af': "Afrikaans",
130
- 'afb': "Gulf Arabic",
131
- 'afh': "Afrihili",
132
- 'aii': "Assyrian Neo-Aramaic",
133
- 'ain': "Ainu",
134
- 'akl': "Aklanon",
135
- 'aln': "Gheg Albanian",
136
- 'als': "Tosk Albanian",
137
- 'am': "Amharic",
138
- 'an': "Aragonese",
139
- 'ang': "Old English",
140
- 'aoz': "Uab Meto",
141
- 'apc': "North Levantine Arabic",
142
- 'ar': "Arabic",
143
- 'arq': "Algerian Arabic",
144
- 'ary': "Moroccan Arabic",
145
- 'arz': "Egyptian Arabic",
146
- 'as': "Assamese",
147
- 'ast': "Asturian",
148
- 'av': "Avaric",
149
- 'avk': "Kotava",
150
- 'awa': "Awadhi",
151
- 'ay': "Aymara",
152
- 'az': "Azerbaijani",
153
- 'azb': "South Azerbaijani",
154
- 'ba': "Bashkir",
155
- 'bal': "Baluchi",
156
- 'ban': "Balinese",
157
- 'bar': "Bavarian",
158
- 'bat-smg': "bat-smg",
159
- 'bcl': "Central Bikol",
160
- 'be': "Belarusian",
161
- 'ber': "Berber",
162
- 'bg': "Bulgarian",
163
- 'bh': "bh",
164
- 'bho': "Bhojpuri",
165
- 'bi': "Bislama",
166
- 'bjn': "Banjar",
167
- 'bm': "Bambara",
168
- 'bn': "Bengali",
169
- 'bo': "Tibetan",
170
- 'bpy': "Bishnupriya",
171
- 'br': "Breton",
172
- 'brx': "Bodo",
173
- 'bs': "Bosnian",
174
- 'bua': "Buryat",
175
- 'bvy': "Baybayanon",
176
- 'bxr': "Russia Buriat",
177
- 'bzt': "Brithenig",
178
- 'ca': "Catalan",
179
- 'cay': "Cayuga",
180
- 'cbk': "Chavacano",
181
- 'cbk-zam': "cbk-zam",
182
- 'cdo': "Min Dong Chinese",
183
- 'ce': "Chechen",
184
- 'ceb': "Cebuano",
185
- 'ch': "Chamorro",
186
- 'chg': "Chagatai",
187
- 'chn': "Chinook Jargon",
188
- 'cho': "Choctaw",
189
- 'chr': "Cherokee",
190
- 'cjy': "Jin Chinese",
191
- 'ckb': "Central Kurdish (Soranî)",
192
- 'ckt': "Chukchi",
193
- 'cmo': "Central Mnong",
194
- 'co': "Corsican",
195
- 'cpi': "Chinese Pidgin English",
196
- 'crh': "Crimean Tatar",
197
- 'crs': "Seychellois Creole",
198
- 'cs': "Czech",
199
- 'ces': "Czech",
200
- 'csb': "Kashubian",
201
- 'cv': "Chuvash",
202
- 'cy': "Welsh",
203
- 'cycl': "CycL",
204
- 'cyo': "Cuyonon",
205
- 'da': "Danish",
206
- 'de': "German",
207
- 'deu': "German",
208
- 'diq': "Dimli (individual language)",
209
- 'dng': "Dungan",
210
- 'drt': "Drents",
211
- 'dsb': "Lower Sorbian",
212
- 'dtp': "Central Dusun",
213
- 'dty': "dty",
214
- 'dv': "Dhivehi",
215
- 'dws': "Dutton World Speedwords",
216
- 'ee': "Ewe",
217
- 'egl': "Emilian",
218
- 'el': "Greek",
219
- 'ell': "Greek",
220
- 'eml': "eml",
221
- 'emx': "Erromintxela",
222
- 'en': "English",
223
- 'enm': "Middle English",
224
- 'eo': "Esperanto",
225
- 'es': "Spanish",
226
- 'et': "Estonian",
227
- 'eu': "Basque",
228
- 'evn': "Evenki",
229
- 'ext': "Extremaduran",
230
- 'fa': "Persian",
231
- 'fi': "Finnish",
232
- 'fiu-vro': "fiu-vro",
233
- 'fj': "Fijian",
234
- 'fkv': "Kven Finnish",
235
- 'fo': "Faroese",
236
- 'fr': "French",
237
- 'fra': "French",
238
- 'frm': "Middle French",
239
- 'fro': "Old French",
240
- 'frp': "Arpitan",
241
- 'frr': "North Frisian",
242
- 'fuc': "Pulaar",
243
- 'fur': "Friulian",
244
- 'fuv': "Nigerian Fulfulde",
245
- 'fy': "Frisian",
246
- 'ga': "Irish",
247
- 'gaa': "Ga",
248
- 'gag': "Gagauz",
249
- 'gan': "Gan Chinese",
250
- 'gbm': "Garhwali",
251
- 'gcf': "Guadeloupean Creole French",
252
- 'gd': "Scottish Gaelic",
253
- 'gil': "Gilbertese",
254
- 'gl': "Galician",
255
- 'glk': "Gilaki",
256
- 'gn': "Guarani",
257
- 'gom': "Konkani (Goan)",
258
- 'gos': "Gronings",
259
- 'got': "Gothic",
260
- 'grc': "Ancient Greek",
261
- 'gsw': "Swiss German",
262
- 'gu': "Gujarati",
263
- 'gv': "Manx",
264
- 'ha': "Hausa",
265
- 'hak': "Hakka Chinese",
266
- 'haw': "Hawaiian",
267
- 'hbo': "Ancient Hebrew",
268
- 'he': "Hebrew",
269
- 'hi': "Hindi",
270
- 'hif': "Fiji Hindi",
271
- 'hil': "Hiligaynon",
272
- 'hnj': "Hmong Njua (Green)",
273
- 'hoc': "Ho",
274
- 'hr': "Croatian",
275
- 'hrx': "Hunsrik",
276
- 'hsb': "Upper Sorbian",
277
- 'hsn': "Xiang Chinese",
278
- 'ht': "Haitian Creole",
279
- 'hu': "Hungarian",
280
- 'hy': "Armenian",
281
- 'ia': "Interlingua",
282
- 'iba': "Iban",
283
- 'id': "Indonesian",
284
- 'ie': "Interlingue",
285
- 'ig': "Igbo",
286
- 'ii': "Nuosu",
287
- 'ike': "Inuktitut",
288
- 'ilo': "Ilocano",
289
- 'io': "Ido",
290
- 'is': "Icelandic",
291
- 'it': "Italian",
292
- 'izh': "Ingrian",
293
- 'ja': "Japanese",
294
- 'jam': "Jamaican Patois",
295
- 'jbo': "Lojban",
296
- 'jdt': "Juhuri (Judeo-Tat)",
297
- 'jpa': "Jewish Palestinian Aramaic",
298
- 'jv': "Javanese",
299
- 'ka': "Georgian",
300
- 'kaa': "Karakalpak",
301
- 'kab': "Kabyle",
302
- 'kam': "Kamba",
303
- 'kbd': "Kabardian",
304
- 'kek': "Kekchi (Q'eqchi')",
305
- 'kha': "Khasi",
306
- 'kjh': "Khakas",
307
- 'kk': "Kazakh",
308
- 'kl': "Greenlandic",
309
- 'km': "Khmer",
310
- 'kn': "Kannada",
311
- 'ko': "Korean",
312
- 'koi': "Komi-Permyak",
313
- 'kpv': "Komi-Zyrian",
314
- 'krc': "Karachay-Balkar",
315
- 'krl': "Karelian",
316
- 'ks': "Kashmiri",
317
- 'ksh': "Kölsch",
318
- 'ku': "Kurdish",
319
- 'kum': "Kumyk",
320
- 'kv': "Komi",
321
- 'kw': "Cornish",
322
- 'kxi': "Keningau Murut",
323
- 'ky': "Kyrgyz",
324
- 'kzj': "Coastal Kadazan",
325
- 'la': "Latin",
326
- 'laa': "Southern Subanen",
327
- 'lad': "Ladino",
328
- 'lb': "Luxembourgish",
329
- 'ldn': "Láadan",
330
- 'lez': "Lezghian",
331
- 'lfn': "Lingua Franca Nova",
332
- 'lg': "Luganda",
333
- 'li': "Limburgan",
334
- 'lij': "Ligurian",
335
- 'liv': "Livonian",
336
- 'lkt': "Lakota",
337
- 'lld': "Ladin",
338
- 'lmo': "Lombard",
339
- 'ln': "Lingala",
340
- 'lo': "Lao",
341
- 'lou': "Louisiana Creole",
342
- 'lrc': "Northern Luri",
343
- 'lt': "Lithuanian",
344
- 'ltg': "Latgalian",
345
- 'lv': "Latvian",
346
- 'lvs': "Latvian",
347
- 'lzh': "Literary Chinese",
348
- 'lzz': "Laz",
349
- 'mad': "Madurese",
350
- 'mai': "Maithili",
351
- 'map-bms': "map-bms",
352
- 'max': "North Moluccan Malay",
353
- 'mdf': "Moksha",
354
- 'mfe': "Morisyen",
355
- 'mg': "Malagasy",
356
- 'mgm': "Mambae",
357
- 'mh': "Marshallese",
358
- 'mhr': "Meadow Mari",
359
- 'mi': "Maori",
360
- 'mic': "Mi'kmaq",
361
- 'min': "Minangkabau",
362
- 'mk': "Macedonian",
363
- 'ml': "Malayalam",
364
- 'mn': "Mongolian",
365
- 'mnc': "Manchu",
366
- 'mnw': "Mon",
367
- 'moh': "Mohawk",
368
- 'mr': "Marathi",
369
- 'mrj': "Hill Mari",
370
- 'ms': "Malay",
371
- 'mt': "Maltese",
372
- 'mvv': "Tagal Murut",
373
- 'mwl': "Mirandese",
374
- 'mww': "Hmong Daw (White)",
375
- 'my': "Burmese",
376
- 'myv': "Erzya",
377
- 'mzn': "Mazanderani",
378
- 'na': "Nauruan",
379
- 'nah': "Nahuatl",
380
- 'nap': "Neapolitan",
381
- 'nb': "Norwegian Bokmål",
382
- 'nch': "Central Huasteca Nahuatl",
383
- 'nds': "Low German (Low Saxon)",
384
- 'nds-nl': "nds-nl",
385
- 'ne': "Nepali",
386
- 'new': "Newari",
387
- 'ngt': "Ngeq",
388
- 'ngu': "Guerrero Nahuatl",
389
- 'niu': "Niuean",
390
- 'nl': "Dutch",
391
- 'nlv': "Orizaba Nahuatl",
392
- 'nn': "Norwegian Nynorsk",
393
- 'no': "Norwegian",
394
- 'nog': "Nogai",
395
- 'non': "Old Norse",
396
- 'nov': "Novial",
397
- 'npi': "Nepali",
398
- 'nrm': "Narom",
399
- 'nso': "Pedi",
400
- 'nst': "Naga (Tangshang)",
401
- 'nv': "Navajo",
402
- 'ny': "Chinyanja",
403
- 'nys': "Nyungar",
404
- 'oar': "Old Aramaic",
405
- 'oc': "Occitan",
406
- 'oj': "Ojibwe",
407
- 'olo': "Livvi",
408
- 'om': "Oromo",
409
- 'or': "Odia (Oriya)",
410
- 'orv': "Old East Slavic",
411
- 'os': "Ossetian",
412
- 'osp': "Old Spanish",
413
- 'osx': "Old Saxon",
414
- 'ota': "Ottoman Turkish",
415
- 'otk': "Old Turkish",
416
- 'pa': "Punjabi (Eastern)",
417
- 'pag': "Pangasinan",
418
- 'pam': "Kapampangan",
419
- 'pap': "Papiamento",
420
- 'pau': "Palauan",
421
- 'pcd': "Picard",
422
- 'pdc': "Pennsylvania German",
423
- 'pfl': "Palatine German",
424
- 'phn': "Phoenician",
425
- 'pi': "Pali",
426
- 'pl': "Polish",
427
- 'pms': "Piedmontese",
428
- 'pnb': "Punjabi (Western)",
429
- 'ppl': "Pipil",
430
- 'prg': "Old Prussian",
431
- 'ps': "Pashto",
432
- 'pt': "Portuguese",
433
- 'qu': "Quechua",
434
- 'quc': "K'iche'",
435
- 'qya': "Quenya",
436
- 'rap': "Rapa Nui",
437
- 'rel': "Rendille",
438
- 'rif': "Tarifit",
439
- 'rm': "Romansh",
440
- 'rn': "Kirundi",
441
- 'ro': "Romanian",
442
- 'ron': "Romanian",
443
- 'roa-rup': "roa-rup",
444
- 'roa-tara': "roa-tara",
445
- 'rom': "Romani",
446
- 'ru': "Russian",
447
- 'rue': "Rusyn",
448
- 'rw': "Kinyarwanda",
449
- 'ryu': "Okinawan",
450
- 'sa': "Sanskrit",
451
- 'sah': "Yakut",
452
- 'sc': "Sardinian",
453
- 'scn': "Sicilian",
454
- 'sco': "Scots",
455
- 'sd': "Sindhi",
456
- 'se': "Northern Sami",
457
- 'sg': "Sango",
458
- 'sgs': "Samogitian",
459
- 'sh': "Serbo-Croatian",
460
- 'shs': "Shuswap",
461
- 'shy': "Tachawit",
462
- 'si': "Sinhala",
463
- 'sjn': "Sindarin",
464
- 'sk': "Slovak",
465
- 'slk': "Slovak",
466
- 'sl': "Slovenian",
467
- 'sm': "Samoan",
468
- 'sma': "Southern Sami",
469
- 'sn': "Shona",
470
- 'so': "Somali",
471
- 'sq': "Albanian",
472
- 'sr': "Serbian",
473
- 'srn': "Sranan Tongo",
474
- 'ss': "Swazi",
475
- 'st': "Southern Sotho",
476
- 'stq': "Saterland Frisian",
477
- 'su': "Sundanese",
478
- 'sux': "Sumerian",
479
- 'sv': "Swedish",
480
- 'sw': "Swahili",
481
- 'swg': "Swabian",
482
- 'swh': "Swahili",
483
- 'syc': "Syriac",
484
- 'szl': "Silesian",
485
- 'ta': "Tamil",
486
- 'tcy': "Tulu",
487
- 'te': "Telugu",
488
- 'tet': "Tetun",
489
- 'tg': "Tajik",
490
- 'th': "Thai",
491
- 'thv': "Tahaggart Tamahaq",
492
- 'ti': "Tigrinya",
493
- 'tig': "Tigre",
494
- 'tk': "Turkmen",
495
- 'tkl': "Tokelauan",
496
- 'tl': "Tagalog",
497
- 'tlh': "Klingon",
498
- 'tly': "Talysh",
499
- 'tmr': "Jewish Babylonian Aramaic",
500
- 'tmw': "Temuan",
501
- 'tn': "Setswana",
502
- 'to': "Tongan",
503
- 'toi': "Tonga (Zambezi)",
504
- 'toki': "Toki Pona",
505
- 'tpi': "Tok Pisin",
506
- 'tpw': "Old Tupi",
507
- 'tr': "Turkish",
508
- 'ts': "Tsonga",
509
- 'tt': "Tatar",
510
- 'tts': "Isan",
511
- 'tvl': "Tuvaluan",
512
- 'ty': "Tahitian",
513
- 'tyv': "Tuvinian",
514
- 'tzl': "Talossan",
515
- 'udm': "Udmurt",
516
- 'ug': "Uyghur",
517
- 'uk': "Ukrainian",
518
- 'umb': "Umbundu",
519
- 'ur': "Urdu",
520
- 'urh': "Urhobo",
521
- 'uz': "Uzbek",
522
- 'vec': "Venetian",
523
- 'vep': "Veps",
524
- 'vi': "Vietnamese",
525
- 'vls': "Vlaams",
526
- 'vo': "Volapük",
527
- 'vro': "Võro",
528
- 'wa': "Walloon",
529
- 'war': "Waray",
530
- 'wo': "Wolof",
531
- 'wuu': "Shanghainese",
532
- 'xal': "Kalmyk",
533
- 'xh': "Xhosa",
534
- 'xmf': "Mingrelian",
535
- 'yi': "Yiddish",
536
- 'yo': "Yoruba",
537
- 'yue': "Cantonese",
538
- 'zea': "Zeeuws",
539
- 'zh': "Chinese",
540
- 'zh-classical': "zh-classical",
541
- 'zh-min-nan': "zh-min-nan",
542
- 'zh-yue': "zh-yue",
543
- 'zlm': "Malay (Vernacular)",
544
- 'zsm': "Malay",
545
- 'zu': "Zulu",
546
- 'zza': "Zaza"
547
- }
548
-
549
- language = language_map[output['language'][0].result]
550
- confidence = round(float(output['language'][0].metadata[language])*100, 2)
551
-
552
- st.markdown(f"This text is in **{language} ({output['language'][0].result})** language.")
553
- st.markdown(f"Classification Confidence: **{confidence}%**")
 
 
 
 
1
+ import streamlit as st
2
+ import sparknlp
3
+ import os
4
+ import pandas as pd
5
+
6
+ from sparknlp.base import *
7
+ from sparknlp.annotator import *
8
+ from pyspark.ml import Pipeline
9
+ from sparknlp.pretrained import PretrainedPipeline
10
+ from annotated_text import annotated_text
11
+
12
+ # Page configuration
13
+ st.set_page_config(
14
+ layout="wide",
15
+ page_title="Spark NLP Demos App",
16
+ initial_sidebar_state="auto"
17
+ )
18
+
19
+ # CSS for styling
20
+ st.markdown("""
21
+ <style>
22
+ .main-title {
23
+ font-size: 36px;
24
+ color: #4A90E2;
25
+ font-weight: bold;
26
+ text-align: center;
27
+ }
28
+ .section {
29
+ background-color: #f9f9f9;
30
+ padding: 15px;
31
+ border-radius: 10px;
32
+ margin-top: 20px;
33
+ }
34
+ .stTable {
35
+ margin-left: auto;
36
+ margin-right: auto;
37
+ }
38
+ </style>
39
+ """, unsafe_allow_html=True)
40
+
41
+ @st.cache_resource
42
+ def init_spark():
43
+ return sparknlp.start()
44
+
45
+ @st.cache_resource
46
+ def create_pipeline(model):
47
+ documentAssembler = DocumentAssembler()\
48
+ .setInputCol("text")\
49
+ .setOutputCol("document")
50
+
51
+ sentence_detector = SentenceDetector() \
52
+ .setInputCols(["document"]) \
53
+ .setOutputCol("sentence")
54
+
55
+ languageDetector = LanguageDetectorDL.pretrained(model)\
56
+ .setInputCols("sentence")\
57
+ .setOutputCol("language")\
58
+ .setThreshold(0.5)\
59
+ .setCoalesceSentences(True)
60
+
61
+ nlpPipeline = Pipeline(
62
+ stages=[
63
+ documentAssembler,
64
+ sentence_detector,
65
+ languageDetector])
66
+
67
+ return nlpPipeline
68
+
69
+ def fit_data(pipeline, data):
70
+ empty_df = spark.createDataFrame([['']]).toDF('text')
71
+ pipeline_model = pipeline.fit(empty_df)
72
+ model = LightPipeline(pipeline_model)
73
+ results = model.fullAnnotate(data)[0]
74
+
75
+ return results
76
+
77
+ # Set up the page layout
78
+ st.markdown('<div class="main-title">State-Of-The-Art Language Detection With Spark NLP</div>', unsafe_allow_html=True)
79
+ st.subheader('Support for 375 different languages')
80
+
81
+ # Sidebar content
82
+ model = st.sidebar.selectbox(
83
+ "Choose the pretrained model",
84
+ ["ld_wiki_tatoeba_cnn_375"],
85
+ help="For more info about the models visit: https://sparknlp.org/models"
86
+ )
87
+
88
+ with st.expander("View Supported Languges"):
89
+ st.write("Abkhaz, Iraqi Arabic, Adyghe, Afrikaans, Gulf Arabic, Afrihili, Assyrian Neo-Aramaic, Ainu, Aklanon, Gheg Albanian, Amharic, Aragonese, Old English, Uab Meto, North Levantine Arabic, Arabic, Algerian Arabic, Moroccan Arabic, Egyptian Arabic, Assamese, Asturian, Kotava, Awadhi, Aymara, Azerbaijani, Bashkir, Baluchi, Balinese, Bavarian, Central Bikol, Belarusian, Berber, Bulgarian, Bhojpuri, Bislama, Banjar, Bambara, Bengali, Tibetan, Breton, Bodo, Bosnian, Buryat, Baybayanon, Brithenig, Catalan, Cayuga, Chavacano, Chechen, Cebuano, Chamorro, Chagatai, Chinook Jargon, Choctaw, Cherokee, Jin Chinese, Chukchi, Central Mnong, Corsican, Chinese Pidgin English, Crimean Tatar, Seychellois Creole, Czech, Kashubian, Chuvash, Welsh, CycL, Cuyonon, Danish, German, Dungan, Drents, Lower Sorbian, Central Dusun, Dhivehi, Dutton World Speedwords, Ewe, Emilian, Greek, Erromintxela, English, Middle English, Esperanto, Spanish, Estonian, Basque, Evenki, Extremaduran, Persian, Finnish, Fijian, Kven Finnish, Faroese, French, Middle French, Old French, North Frisian, Pulaar, Friulian, Nigerian Fulfulde, Frisian, Irish, Ga, Gagauz, Gan Chinese, Garhwali, Guadeloupean Creole French, Scottish Gaelic, Gilbertese, Galician, Guarani, Konkani (Goan), Gronings, Gothic, Ancient Greek, Swiss German, Gujarati, Manx, Hausa, Hakka Chinese, Hawaiian, Ancient Hebrew, Hebrew, Hindi, Fiji Hindi, Hiligaynon, Hmong Njua (Green), Ho, Croatian, Hunsrik, Upper Sorbian, Xiang Chinese, Haitian Creole, Hungarian, Armenian, Interlingua, Iban, Indonesian, Interlingue, Igbo, Nuosu, Inuktitut, Ilocano, Ido, Icelandic, Italian, Ingrian, Japanese, Jamaican Patois, Lojban, Juhuri (Judeo-Tat), Jewish Palestinian Aramaic, Javanese, Georgian, Karakalpak, Kabyle, Kamba, Kekchi (Q'eqchi'), Khasi, Khakas, Kazakh, Greenlandic, Khmer, Kannada, Korean, Komi-Permyak, Komi-Zyrian, Karachay-Balkar, Karelian, Kashmiri, Kölsch, Kurdish, Kumyk, Cornish, Keningau Murut, Kyrgyz, Coastal Kadazan, Latin, Southern Subanen, 
Ladino, Luxembourgish, Láadan, Lingua Franca Nova, Luganda, Ligurian, Livonian, Lakota, Ladin, Lombard, Lingala, Lao, Louisiana Creole, Lithuanian, Latgalian, Latvian, Latvian, Literary Chinese, Laz, Madurese, Maithili, North Moluccan Malay, Moksha, Morisyen, Malagasy, Mambae, Marshallese, Meadow Mari, Maori, Mi'kmaq, Minangkabau, Macedonian, Malayalam, Mongolian, Manchu, Mon, Mohawk, Marathi, Hill Mari, Malay, Maltese, Tagal Murut, Mirandese, Hmong Daw (White), Burmese, Erzya, Nauruan, Nahuatl, Norwegian Bokmål, Central Huasteca Nahuatl, Low German (Low Saxon), Nepali, Newari, Ngeq, Guerrero Nahuatl, Niuean, Dutch, Orizaba Nahuatl, Norwegian Nynorsk, Norwegian, Nogai, Old Norse, Novial, Nepali, Naga (Tangshang), Navajo, Chinyanja, Nyungar, Old Aramaic, Occitan, Ojibwe, Odia (Oriya), Old East Slavic, Ossetian, Old Spanish, Old Saxon, Ottoman Turkish, Old Turkish, Punjabi (Eastern), Pangasinan, Kapampangan, Papiamento, Palauan, Picard, Pennsylvania German, Palatine German, Phoenician, Pali, Polish, Piedmontese, Punjabi (Western), Pipil, Old Prussian, Pashto, Portuguese, Quechua, K'iche', Quenya, Rapa Nui, Rendille, Tarifit, Romansh, Kirundi, Romanian, Romani, Russian, Rusyn, Kinyarwanda, Okinawan, Sanskrit, Yakut, Sardinian, Sicilian, Scots, Sindhi, Northern Sami, Sango, Samogitian, Shuswap, Tachawit, Sinhala, Sindarin, Slovak, Slovenian, Samoan, Southern Sami, Shona, Somali, Albanian, Serbian, Swazi, Southern Sotho, Saterland Frisian, Sundanese, Sumerian, Swedish, Swahili, Swabian, Swahili, Syriac, Tamil, Telugu, Tetun, Tajik, Thai, Tahaggart Tamahaq, Tigrinya, Tigre, Turkmen, Tokelauan, Tagalog, Klingon, Talysh, Jewish Babylonian Aramaic, Temuan, Setswana, Tongan, Tonga (Zambezi), Toki Pona, Tok Pisin, Old Tupi, Turkish, Tsonga, Tatar, Isan, Tuvaluan, Tahitian, Tuvinian, Talossan, Udmurt, Uyghur, Ukrainian, Umbundu, Urdu, Urhobo, Uzbek, Venetian, Veps, Vietnamese, Volapük, Võro, Walloon, Waray, Wolof, Shanghainese, Kalmyk, Xhosa, Mingrelian, Yiddish, Yoruba, 
Cantonese, Chinese, Malay (Vernacular), Malay, Zulu, and Zaza.")
90
+
91
+ # Reference notebook link in sidebar
92
+ link = """
93
+ <a href="https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/Language_Detector.ipynb">
94
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
95
+ </a>
96
+ """
97
+ st.sidebar.markdown('Reference notebook:')
98
+ st.sidebar.markdown(link, unsafe_allow_html=True)
99
+
100
+ # Load examples
101
+ folder_path = f"inputs/{model}"
102
+ examples = [
103
+ lines[1].strip()
104
+ for filename in os.listdir(folder_path)
105
+ if filename.endswith('.txt')
106
+ for lines in [open(os.path.join(folder_path, filename), 'r', encoding='utf-8').readlines()]
107
+ if len(lines) >= 2
108
+ ]
109
+
110
+ selected_text = st.selectbox("Select a sample text", examples)
111
+ custom_input = st.text_input("Try it for yourself!")
112
+
113
+ if custom_input:
114
+ selected_text = custom_input
115
+ elif selected_text:
116
+ selected_text = selected_text
117
+
118
+ st.subheader('Selected Text')
119
+ st.markdown(f"""<div class="section">{selected_text}</div>""", unsafe_allow_html=True)
120
+
121
+ # Initialize Spark and create pipeline
122
+ spark = init_spark()
123
+ pipeline = create_pipeline(model)
124
+ output = fit_data(pipeline, selected_text)
125
+
126
+ # Display output
127
+ language_map = {
128
+ 'ab': "Abkhaz",
129
+ 'ace': "Achinese",
130
+ 'acm': "Iraqi Arabic",
131
+ 'ady': "Adyghe",
132
+ 'af': "Afrikaans",
133
+ 'afb': "Gulf Arabic",
134
+ 'afh': "Afrihili",
135
+ 'aii': "Assyrian Neo-Aramaic",
136
+ 'ain': "Ainu",
137
+ 'akl': "Aklanon",
138
+ 'aln': "Gheg Albanian",
139
+ 'als': "Tosk Albanian",
140
+ 'am': "Amharic",
141
+ 'an': "Aragonese",
142
+ 'ang': "Old English",
143
+ 'aoz': "Uab Meto",
144
+ 'apc': "North Levantine Arabic",
145
+ 'ar': "Arabic",
146
+ 'arq': "Algerian Arabic",
147
+ 'ary': "Moroccan Arabic",
148
+ 'arz': "Egyptian Arabic",
149
+ 'as': "Assamese",
150
+ 'ast': "Asturian",
151
+ 'av': "Avaric",
152
+ 'avk': "Kotava",
153
+ 'awa': "Awadhi",
154
+ 'ay': "Aymara",
155
+ 'az': "Azerbaijani",
156
+ 'azb': "South Azerbaijani",
157
+ 'ba': "Bashkir",
158
+ 'bal': "Baluchi",
159
+ 'ban': "Balinese",
160
+ 'bar': "Bavarian",
161
+ 'bat-smg': "bat-smg",
162
+ 'bcl': "Central Bikol",
163
+ 'be': "Belarusian",
164
+ 'ber': "Berber",
165
+ 'bg': "Bulgarian",
166
+ 'bh': "bh",
167
+ 'bho': "Bhojpuri",
168
+ 'bi': "Bislama",
169
+ 'bjn': "Banjar",
170
+ 'bm': "Bambara",
171
+ 'bn': "Bengali",
172
+ 'bo': "Tibetan",
173
+ 'bpy': "Bishnupriya",
174
+ 'br': "Breton",
175
+ 'brx': "Bodo",
176
+ 'bs': "Bosnian",
177
+ 'bua': "Buryat",
178
+ 'bvy': "Baybayanon",
179
+ 'bxr': "Russia Buriat",
180
+ 'bzt': "Brithenig",
181
+ 'ca': "Catalan",
182
+ 'cay': "Cayuga",
183
+ 'cbk': "Chavacano",
184
+ 'cbk-zam': "cbk-zam",
185
+ 'cdo': "Min Dong Chinese",
186
+ 'ce': "Chechen",
187
+ 'ceb': "Cebuano",
188
+ 'ch': "Chamorro",
189
+ 'chg': "Chagatai",
190
+ 'chn': "Chinook Jargon",
191
+ 'cho': "Choctaw",
192
+ 'chr': "Cherokee",
193
+ 'cjy': "Jin Chinese",
194
+ 'ckb': "Central Kurdish (Soranî)",
195
+ 'ckt': "Chukchi",
196
+ 'cmo': "Central Mnong",
197
+ 'co': "Corsican",
198
+ 'cpi': "Chinese Pidgin English",
199
+ 'crh': "Crimean Tatar",
200
+ 'crs': "Seychellois Creole",
201
+ 'cs': "Czech",
202
+ 'ces': "Czech",
203
+ 'csb': "Kashubian",
204
+ 'cv': "Chuvash",
205
+ 'cy': "Welsh",
206
+ 'cycl': "CycL",
207
+ 'cyo': "Cuyonon",
208
+ 'da': "Danish",
209
+ 'de': "German",
210
+ 'deu': "German",
211
+ 'diq': "Dimli (individual language)",
212
+ 'dng': "Dungan",
213
+ 'drt': "Drents",
214
+ 'dsb': "Lower Sorbian",
215
+ 'dtp': "Central Dusun",
216
+ 'dty': "dty",
217
+ 'dv': "Dhivehi",
218
+ 'dws': "Dutton World Speedwords",
219
+ 'ee': "Ewe",
220
+ 'egl': "Emilian",
221
+ 'el': "Greek",
222
+ 'ell': "Greek",
223
+ 'eml': "eml",
224
+ 'emx': "Erromintxela",
225
+ 'en': "English",
226
+ 'enm': "Middle English",
227
+ 'eo': "Esperanto",
228
+ 'es': "Spanish",
229
+ 'et': "Estonian",
230
+ 'eu': "Basque",
231
+ 'evn': "Evenki",
232
+ 'ext': "Extremaduran",
233
+ 'fa': "Persian",
234
+ 'fi': "Finnish",
235
+ 'fiu-vro': "fiu-vro",
236
+ 'fj': "Fijian",
237
+ 'fkv': "Kven Finnish",
238
+ 'fo': "Faroese",
239
+ 'fr': "French",
240
+ 'fra': "French",
241
+ 'frm': "Middle French",
242
+ 'fro': "Old French",
243
+ 'frp': "Arpitan",
244
+ 'frr': "North Frisian",
245
+ 'fuc': "Pulaar",
246
+ 'fur': "Friulian",
247
+ 'fuv': "Nigerian Fulfulde",
248
+ 'fy': "Frisian",
249
+ 'ga': "Irish",
250
+ 'gaa': "Ga",
251
+ 'gag': "Gagauz",
252
+ 'gan': "Gan Chinese",
253
+ 'gbm': "Garhwali",
254
+ 'gcf': "Guadeloupean Creole French",
255
+ 'gd': "Scottish Gaelic",
256
+ 'gil': "Gilbertese",
257
+ 'gl': "Galician",
258
+ 'glk': "Gilaki",
259
+ 'gn': "Guarani",
260
+ 'gom': "Konkani (Goan)",
261
+ 'gos': "Gronings",
262
+ 'got': "Gothic",
263
+ 'grc': "Ancient Greek",
264
+ 'gsw': "Swiss German",
265
+ 'gu': "Gujarati",
266
+ 'gv': "Manx",
267
+ 'ha': "Hausa",
268
+ 'hak': "Hakka Chinese",
269
+ 'haw': "Hawaiian",
270
+ 'hbo': "Ancient Hebrew",
271
+ 'he': "Hebrew",
272
+ 'hi': "Hindi",
273
+ 'hif': "Fiji Hindi",
274
+ 'hil': "Hiligaynon",
275
+ 'hnj': "Hmong Njua (Green)",
276
+ 'hoc': "Ho",
277
+ 'hr': "Croatian",
278
+ 'hrx': "Hunsrik",
279
+ 'hsb': "Upper Sorbian",
280
+ 'hsn': "Xiang Chinese",
281
+ 'ht': "Haitian Creole",
282
+ 'hu': "Hungarian",
283
+ 'hy': "Armenian",
284
+ 'ia': "Interlingua",
285
+ 'iba': "Iban",
286
+ 'id': "Indonesian",
287
+ 'ie': "Interlingue",
288
+ 'ig': "Igbo",
289
+ 'ii': "Nuosu",
290
+ 'ike': "Inuktitut",
291
+ 'ilo': "Ilocano",
292
+ 'io': "Ido",
293
+ 'is': "Icelandic",
294
+ 'it': "Italian",
295
+ 'izh': "Ingrian",
296
+ 'ja': "Japanese",
297
+ 'jam': "Jamaican Patois",
298
+ 'jbo': "Lojban",
299
+ 'jdt': "Juhuri (Judeo-Tat)",
300
+ 'jpa': "Jewish Palestinian Aramaic",
301
+ 'jv': "Javanese",
302
+ 'ka': "Georgian",
303
+ 'kaa': "Karakalpak",
304
+ 'kab': "Kabyle",
305
+ 'kam': "Kamba",
306
+ 'kbd': "Kabardian",
307
+ 'kek': "Kekchi (Q'eqchi')",
308
+ 'kha': "Khasi",
309
+ 'kjh': "Khakas",
310
+ 'kk': "Kazakh",
311
+ 'kl': "Greenlandic",
312
+ 'km': "Khmer",
313
+ 'kn': "Kannada",
314
+ 'ko': "Korean",
315
+ 'koi': "Komi-Permyak",
316
+ 'kpv': "Komi-Zyrian",
317
+ 'krc': "Karachay-Balkar",
318
+ 'krl': "Karelian",
319
+ 'ks': "Kashmiri",
320
+ 'ksh': "Kölsch",
321
+ 'ku': "Kurdish",
322
+ 'kum': "Kumyk",
323
+ 'kv': "Komi",
324
+ 'kw': "Cornish",
325
+ 'kxi': "Keningau Murut",
326
+ 'ky': "Kyrgyz",
327
+ 'kzj': "Coastal Kadazan",
328
+ 'la': "Latin",
329
+ 'laa': "Southern Subanen",
330
+ 'lad': "Ladino",
331
+ 'lb': "Luxembourgish",
332
+ 'ldn': "Láadan",
333
+ 'lez': "Lezghian",
334
+ 'lfn': "Lingua Franca Nova",
335
+ 'lg': "Luganda",
336
+ 'li': "Limburgan",
337
+ 'lij': "Ligurian",
338
+ 'liv': "Livonian",
339
+ 'lkt': "Lakota",
340
+ 'lld': "Ladin",
341
+ 'lmo': "Lombard",
342
+ 'ln': "Lingala",
343
+ 'lo': "Lao",
344
+ 'lou': "Louisiana Creole",
345
+ 'lrc': "Northern Luri",
346
+ 'lt': "Lithuanian",
347
+ 'ltg': "Latgalian",
348
+ 'lv': "Latvian",
349
+ 'lvs': "Latvian",
350
+ 'lzh': "Literary Chinese",
351
+ 'lzz': "Laz",
352
+ 'mad': "Madurese",
353
+ 'mai': "Maithili",
354
+ 'map-bms': "map-bms",
355
+ 'max': "North Moluccan Malay",
356
+ 'mdf': "Moksha",
357
+ 'mfe': "Morisyen",
358
+ 'mg': "Malagasy",
359
+ 'mgm': "Mambae",
360
+ 'mh': "Marshallese",
361
+ 'mhr': "Meadow Mari",
362
+ 'mi': "Maori",
363
+ 'mic': "Mi'kmaq",
364
+ 'min': "Minangkabau",
365
+ 'mk': "Macedonian",
366
+ 'ml': "Malayalam",
367
+ 'mn': "Mongolian",
368
+ 'mnc': "Manchu",
369
+ 'mnw': "Mon",
370
+ 'moh': "Mohawk",
371
+ 'mr': "Marathi",
372
+ 'mrj': "Hill Mari",
373
+ 'ms': "Malay",
374
+ 'mt': "Maltese",
375
+ 'mvv': "Tagal Murut",
376
+ 'mwl': "Mirandese",
377
+ 'mww': "Hmong Daw (White)",
378
+ 'my': "Burmese",
379
+ 'myv': "Erzya",
380
+ 'mzn': "Mazanderani",
381
+ 'na': "Nauruan",
382
+ 'nah': "Nahuatl",
383
+ 'nap': "Neapolitan",
384
+ 'nb': "Norwegian Bokmål",
385
+ 'nch': "Central Huasteca Nahuatl",
386
+ 'nds': "Low German (Low Saxon)",
387
+ 'nds-nl': "nds-nl",
388
+ 'ne': "Nepali",
389
+ 'new': "Newari",
390
+ 'ngt': "Ngeq",
391
+ 'ngu': "Guerrero Nahuatl",
392
+ 'niu': "Niuean",
393
+ 'nl': "Dutch",
394
+ 'nlv': "Orizaba Nahuatl",
395
+ 'nn': "Norwegian Nynorsk",
396
+ 'no': "Norwegian",
397
+ 'nog': "Nogai",
398
+ 'non': "Old Norse",
399
+ 'nov': "Novial",
400
+ 'npi': "Nepali",
401
+ 'nrm': "Narom",
402
+ 'nso': "Pedi",
403
+ 'nst': "Naga (Tangshang)",
404
+ 'nv': "Navajo",
405
+ 'ny': "Chinyanja",
406
+ 'nys': "Nyungar",
407
+ 'oar': "Old Aramaic",
408
+ 'oc': "Occitan",
409
+ 'oj': "Ojibwe",
410
+ 'olo': "Livvi",
411
+ 'om': "Oromo",
412
+ 'or': "Odia (Oriya)",
413
+ 'orv': "Old East Slavic",
414
+ 'os': "Ossetian",
415
+ 'osp': "Old Spanish",
416
+ 'osx': "Old Saxon",
417
+ 'ota': "Ottoman Turkish",
418
+ 'otk': "Old Turkish",
419
+ 'pa': "Punjabi (Eastern)",
420
+ 'pag': "Pangasinan",
421
+ 'pam': "Kapampangan",
422
+ 'pap': "Papiamento",
423
+ 'pau': "Palauan",
424
+ 'pcd': "Picard",
425
+ 'pdc': "Pennsylvania German",
426
+ 'pfl': "Palatine German",
427
+ 'phn': "Phoenician",
428
+ 'pi': "Pali",
429
+ 'pl': "Polish",
430
+ 'pms': "Piedmontese",
431
+ 'pnb': "Punjabi (Western)",
432
+ 'ppl': "Pipil",
433
+ 'prg': "Old Prussian",
434
+ 'ps': "Pashto",
435
+ 'pt': "Portuguese",
436
+ 'qu': "Quechua",
437
+ 'quc': "K'iche'",
438
+ 'qya': "Quenya",
439
+ 'rap': "Rapa Nui",
440
+ 'rel': "Rendille",
441
+ 'rif': "Tarifit",
442
+ 'rm': "Romansh",
443
+ 'rn': "Kirundi",
444
+ 'ro': "Romanian",
445
+ 'ron': "Romanian",
446
+ 'roa-rup': "roa-rup",
447
+ 'roa-tara': "roa-tara",
448
+ 'rom': "Romani",
449
+ 'ru': "Russian",
450
+ 'rue': "Rusyn",
451
+ 'rw': "Kinyarwanda",
452
+ 'ryu': "Okinawan",
453
+ 'sa': "Sanskrit",
454
+ 'sah': "Yakut",
455
+ 'sc': "Sardinian",
456
+ 'scn': "Sicilian",
457
+ 'sco': "Scots",
458
+ 'sd': "Sindhi",
459
+ 'se': "Northern Sami",
460
+ 'sg': "Sango",
461
+ 'sgs': "Samogitian",
462
+ 'sh': "Serbo-Croatian",
463
+ 'shs': "Shuswap",
464
+ 'shy': "Tachawit",
465
+ 'si': "Sinhala",
466
+ 'sjn': "Sindarin",
467
+ 'sk': "Slovak",
468
+ 'slk': "Slovak",
469
+ 'sl': "Slovenian",
470
+ 'sm': "Samoan",
471
+ 'sma': "Southern Sami",
472
+ 'sn': "Shona",
473
+ 'so': "Somali",
474
+ 'sq': "Albanian",
475
+ 'sr': "Serbian",
476
+ 'srn': "Sranan Tongo",
477
+ 'ss': "Swazi",
478
+ 'st': "Southern Sotho",
479
+ 'stq': "Saterland Frisian",
480
+ 'su': "Sundanese",
481
+ 'sux': "Sumerian",
482
+ 'sv': "Swedish",
483
+ 'sw': "Swahili",
484
+ 'swg': "Swabian",
485
+ 'swh': "Swahili",
486
+ 'syc': "Syriac",
487
+ 'szl': "Silesian",
488
+ 'ta': "Tamil",
489
+ 'tcy': "Tulu",
490
+ 'te': "Telugu",
491
+ 'tet': "Tetun",
492
+ 'tg': "Tajik",
493
+ 'th': "Thai",
494
+ 'thv': "Tahaggart Tamahaq",
495
+ 'ti': "Tigrinya",
496
+ 'tig': "Tigre",
497
+ 'tk': "Turkmen",
498
+ 'tkl': "Tokelauan",
499
+ 'tl': "Tagalog",
500
+ 'tlh': "Klingon",
501
+ 'tly': "Talysh",
502
+ 'tmr': "Jewish Babylonian Aramaic",
503
+ 'tmw': "Temuan",
504
+ 'tn': "Setswana",
505
+ 'to': "Tongan",
506
+ 'toi': "Tonga (Zambezi)",
507
+ 'toki': "Toki Pona",
508
+ 'tpi': "Tok Pisin",
509
+ 'tpw': "Old Tupi",
510
+ 'tr': "Turkish",
511
+ 'ts': "Tsonga",
512
+ 'tt': "Tatar",
513
+ 'tts': "Isan",
514
+ 'tvl': "Tuvaluan",
515
+ 'ty': "Tahitian",
516
+ 'tyv': "Tuvinian",
517
+ 'tzl': "Talossan",
518
+ 'udm': "Udmurt",
519
+ 'ug': "Uyghur",
520
+ 'uk': "Ukrainian",
521
+ 'umb': "Umbundu",
522
+ 'ur': "Urdu",
523
+ 'urh': "Urhobo",
524
+ 'uz': "Uzbek",
525
+ 'vec': "Venetian",
526
+ 'vep': "Veps",
527
+ 'vi': "Vietnamese",
528
+ 'vls': "Vlaams",
529
+ 'vo': "Volapük",
530
+ 'vro': "Võro",
531
+ 'wa': "Walloon",
532
+ 'war': "Waray",
533
+ 'wo': "Wolof",
534
+ 'wuu': "Shanghainese",
535
+ 'xal': "Kalmyk",
536
+ 'xh': "Xhosa",
537
+ 'xmf': "Mingrelian",
538
+ 'yi': "Yiddish",
539
+ 'yo': "Yoruba",
540
+ 'yue': "Cantonese",
541
+ 'zea': "Zeeuws",
542
+ 'zh': "Chinese",
543
+ 'zh-classical': "zh-classical",
544
+ 'zh-min-nan': "zh-min-nan",
545
+ 'zh-yue': "zh-yue",
546
+ 'zlm': "Malay (Vernacular)",
547
+ 'zsm': "Malay",
548
+ 'zu': "Zulu",
549
+ 'zza': "Zaza"
550
+ }
551
+
552
# Render the detection result for the first (coalesced) language annotation.
detection = output['language'][0]
language_code = detection.result

# Fall back to the raw label when it has no entry in language_map, so an
# unmapped label does not raise KeyError and abort the Streamlit rerun.
language = language_map.get(language_code, language_code)

st.markdown(f"This text is in **{language} ({language_code})** language.")

# NOTE(review): the per-language scores in `metadata` are presumably keyed by
# the detected code rather than the display name -- confirm against the
# LanguageDetectorDL output. The display name is tried second so that the
# previous lookup still works if it was in fact correct.
score = detection.metadata.get(language_code, detection.metadata.get(language))
if score is not None:
    confidence = round(float(score) * 100, 2)
    st.markdown(f"Classification Confidence: **{confidence}%**")