kargaranamir committed
Commit c9a6574 · 0 Parent(s)

add ava, bqi, lki, lrc, bar, bal.
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
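These are the stock Git LFS attribute rules that Hugging Face seeds into new repositories: any file matching one of the patterns is stored through LFS rather than in plain Git, keeping large binaries out of the history.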
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: GlotWeb
+ emoji: 🕸
+ colorFrom: pink
+ colorTo: red
+ sdk: streamlit
+ sdk_version: 1.33.0
+ app_file: app.py
+ pinned: True
+ license: mit
+ ---
+
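This front matter is the Space's configuration card: it declares a Streamlit app pinned to SDK version 1.33.0, with app.py as the entry point and an MIT license.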
app.py ADDED
@@ -0,0 +1,111 @@
+ import pandas as pd
+ import streamlit as st
+ from utils import nav_to, df_to_html, render_svg, combine_json_files
+
+ data = combine_json_files('./languages')
+
+
+ @st.cache_data
+ def render_home_table():
+     """Renders home table."""
+     # Compute number of unique domains/urls
+     for key in data.keys():
+         data[key]['Number of Sites'] = len(data[key].get('Sites', []))
+         data[key]["Number of Links"] = sum(len(url_data["Links"]) for url_data in data[key].get('Sites', []))
+
+     # Convert dict to df
+     df_data = pd.DataFrame(data).transpose()
+     df_data['ISO Code'] = df_data.index
+
+     # Sort by ISO Code
+     df_data = df_data.sort_values(by='ISO Code')
+
+     df_data['Number of Sites'] = df_data['Number of Sites'].astype(str)  # Convert to string
+     df_data['ISO Code'] = df_data['ISO Code'].astype(str)  # Convert to string
+     df_data['Number of Sites'] = df_data.apply(lambda row: '<a href="/?isocode={}&site=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Sites']), axis=1)
+     df_data['Number of Links'] = df_data.apply(lambda row: '<a href="/?isocode={}&links=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Links']), axis=1)
+
+     # Display the table
+     df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping', 'Number of Sites', 'Number of Links', 'Number of Speakers', 'Supported by allenai/MADLAD-400 or facebook/flores']]
+     st.write(df_to_html(df_data), unsafe_allow_html=True)
+
+
+ @st.cache_data
+ def render_site_table(isocode):
+     # back
+     back_text = '<a href="/?home=True" target="_self">Back</a>'
+     st.markdown(back_text, unsafe_allow_html=True)
+
+     st.write({'Language Name': data[isocode]['Language Name'], 'ISO Code': isocode})
+
+     # site
+     urls = data[isocode].get('Sites', [])
+     df_urls = pd.DataFrame(urls)
+     df_urls['Number of Links'] = df_urls['Links'].apply(len)
+     df_urls = df_urls.sort_values(by='Number of Links', ascending=False)
+     df_urls = df_urls.reset_index(drop=True)
+     df_urls['Number of Links'] = df_urls.apply(lambda row: '<a href="/?isocode={}&siteurl={}" target="_self">{}</a>'.format(isocode, row['Site URL'], row['Number of Links']) if row['Number of Links'] else row['Number of Links'], axis=1)
+
+     # Display the table
+     df_urls = df_urls[['Site Name', 'Site URL', 'Category', 'Number of Links', 'Possible Parallel Languages', 'Confidence', 'Info']]
+     st.write(df_to_html(df_urls), unsafe_allow_html=True)
+
+
+ @st.cache_data
+ def render_siteurl_table(isocode, url):
+     # back
+     back_text = '<a href="/?isocode={}&site=True" target="_self">Back</a>'.format(isocode)
+     st.markdown(back_text, unsafe_allow_html=True)
+
+     # Find selected domain
+     urls = data[isocode].get('Sites', [])
+     selected_domain = next((d for d in urls if 'Site URL' in d and d['Site URL'] == url), None)
+
+     if selected_domain:
+         st.write({'Language Name': data[isocode]['Language Name'], 'ISO Code': isocode, 'Site URL': url, 'Links': selected_domain['Links']})
+
+
+ @st.cache_data
+ def render_links_table(isocode):
+     # back
+     back_text = '<a href="/?home=True" target="_self">Back</a>'
+     st.markdown(back_text, unsafe_allow_html=True)
+
+     # output
+     urls = data[isocode].get('Sites', [])
+     lang_name = data[isocode]['Language Name']
+     all_urls = [{'Site URL': du['Site URL'], 'Links': du['Links']} for du in urls]
+
+     st.write({'Language Name': lang_name, 'ISO Code': isocode, 'URLs': all_urls})
+
+
+ # show logo
+ render_svg(open("assets/glotweb_logo.svg").read())
+ st.text("")
+
+
+ def main():
+     params = st.query_params
+
+     if 'isocode' in params:
+         if 'siteurl' in params:
+             render_siteurl_table(params['isocode'], params['siteurl'])
+         if 'site' in params:
+             render_site_table(params['isocode'])
+         if 'links' in params:
+             render_links_table(params['isocode'])
+
+     elif 'home' not in params:
+         # redirect to .space
+         nav_to('https://cis-lmu-glotweb.hf.space/?home=True')
+     else:
+         # show home
+         render_home_table()
+
+ main()
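For orientation, the home view boils each language entry down to two counts, which render_home_table() then turns into links into the per-language views. A standalone sketch of that aggregation, using hypothetical data and no Streamlit:

```python
# Hypothetical entry in the shape produced by combine_json_files().
sample = {
    'xyz_Latn': {
        'Language Name': 'Example',
        'Sites': [
            {'Site URL': 'https://a.example/', 'Links': ['u1', 'u2']},
            {'Site URL': 'https://b.example/', 'Links': []},
        ],
    }
}

# The same reduction render_home_table() applies to every language.
for key in sample:
    sample[key]['Number of Sites'] = len(sample[key].get('Sites', []))
    sample[key]['Number of Links'] = sum(len(s['Links']) for s in sample[key].get('Sites', []))

print(sample['xyz_Latn']['Number of Sites'])  # 2
print(sample['xyz_Latn']['Number of Links'])  # 2
```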
assets/glotweb_logo.svg ADDED
languages/ava_Cyrl.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "Language Name": "Avaric",
+   "Family": "Nakh-Daghestanian",
+   "Subgrouping": "Avar–Andic",
+   "Number of Speakers": "800_000",
+   "Supported by allenai/MADLAD-400 or facebook/flores": "🟨",
+   "Sites": [
+     {
+       "Site Name": "radioerkenli.com",
+       "Site URL": "https://www.radioerkenli.com/",
+       "Category": "news",
+       "Confidence": "🟩",
+       "Info": "confirmed by glotlid; webpage annotation also backs it up.",
+       "Possible Parallel Languages": "",
+       "Links": []
+     }
+   ]
+ }
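Each languages/*.json file follows the shape above. A minimal validation sketch, with the field lists read off what app.py accesses (an inferred schema, not an official one):

```python
import json

# Fields app.py reads from each language entry and from each site record.
ENTRY_FIELDS = ['Language Name', 'Family', 'Subgrouping', 'Number of Speakers',
                'Supported by allenai/MADLAD-400 or facebook/flores', 'Sites']
SITE_FIELDS = ['Site Name', 'Site URL', 'Category', 'Confidence', 'Info',
               'Possible Parallel Languages', 'Links']

with open('languages/ava_Cyrl.json', encoding='utf-8') as fh:
    entry = json.load(fh)

missing = [k for k in ENTRY_FIELDS if k not in entry]
assert not missing, f'missing top-level fields: {missing}'
for site in entry['Sites']:
    assert all(k in site for k in SITE_FIELDS), f'incomplete site record: {site}'
    assert isinstance(site['Links'], list)
```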
languages/bal_Arab.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "Language Name": "Balochi",
+   "Family": "Indo-European",
+   "Subgrouping": "Iranian",
+   "Number of Speakers": "8_000_000",
+   "Supported by allenai/MADLAD-400 or facebook/flores": "🟥",
+   "Sites": [
+     {
+       "Site Name": "sunnionline.us",
+       "Site URL": "https://sunnionline.us/balochi/",
+       "Category": "news",
+       "Confidence": "🟩",
+       "Info": "confirmed by webpage metadata",
+       "Possible Parallel Languages": "eng_Latn, fas_Arab, urd_Arab, arb_Arab, rus_Cyrl",
+       "Links": []
+     },
+     {
+       "Site Name": "kissah.org",
+       "Site URL": "https://kissah.org/",
+       "Category": "literature",
+       "Confidence": "🟩",
+       "Info": "confirmed by native speaker approval",
+       "Possible Parallel Languages": "",
+       "Links": []
+     },
+     {
+       "Site Name": "baask.com",
+       "Site URL": "https://baask.com/archive/category/balochi/",
+       "Category": "literature",
+       "Confidence": "🟩",
+       "Info": "confirmed by webpage metadata",
+       "Possible Parallel Languages": "",
+       "Links": []
+     },
+     {
+       "Site Name": "facebook.com/BaaskDotCom",
+       "Site URL": "https://www.facebook.com/BaaskDotCom",
+       "Category": "literature, social media",
+       "Confidence": "🟩",
+       "Info": "confirmed by webpage metadata",
+       "Possible Parallel Languages": "",
+       "Links": []
+     }
+   ]
+ }
languages/bar_Latn.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "Language Name": "Bavarian",
+   "Family": "Indo-European",
+   "Subgrouping": "Germanic",
+   "Number of Speakers": "14_000_000",
+   "Supported by allenai/MADLAD-400 or facebook/flores": "🟥",
+   "Sites": [
+     {
+       "Site Name": "bar.wikipedia.org",
+       "Site URL": "https://bar.wikipedia.org/wiki/",
+       "Category": "article",
+       "Confidence": "🟩",
+       "Info": "confirmed by webpage metadata",
+       "Possible Parallel Languages": "many",
+       "Links": []
+     },
+     {
+       "Site Name": "twitter.com/bayernuhr",
+       "Site URL": "https://twitter.com/bayernuhr",
+       "Category": "social media",
+       "Confidence": "🟩",
+       "Info": "glotlid search on http://indigenoustweets.com/bar/",
+       "Possible Parallel Languages": "",
+       "Links": []
+     }
+   ]
+ }
languages/bqi_Arab.json ADDED
@@ -0,0 +1,19 @@
+ {
+   "Language Name": "Bakhtiari",
+   "Family": "Indo-European",
+   "Subgrouping": "Iranian",
+   "Number of Speakers": "1_200_000",
+   "Supported by allenai/MADLAD-400 or facebook/flores": "🟥",
+   "Sites": [
+     {
+       "Site Name": "lurishacademy.org",
+       "Site URL": "https://lurishacademy.org/",
+       "Category": "literature",
+       "Confidence": "🟩",
+       "Info": "confirmed by a native speaker, in line with the webpage annotation",
+       "Possible Parallel Languages": "",
+       "Links": ["https://lurishacademy.org/articles/داستان-ل-وری-پؽا-ۉ-پٱری-داستان-لری-مرد-و-پری",
+                 "https://lurishacademy.org/articles/دؽاری-کردن-آمٱئمتقتٱقی-خوݩ-چالٱنگ"]
+     }
+   ]
+ }
languages/lki_Arab.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "Language Name": "Laki",
+   "Family": "Indo-European",
+   "Subgrouping": "Iranian",
+   "Number of Speakers": "600_000",
+   "Supported by allenai/MADLAD-400 or facebook/flores": "🟥",
+   "Sites": [
+     {
+       "Site Name": "lurishacademy.org",
+       "Site URL": "https://lurishacademy.org/",
+       "Category": "literature",
+       "Confidence": "🟩",
+       "Info": "confirmed by a native speaker, in line with the webpage annotation, and by glotlid",
+       "Possible Parallel Languages": "",
+       "Links": ["https://lurishacademy.org/articles/شعر-شیرین-ترازیا-سروده-نجف-آزادبخت"]
+     }
+   ]
+ }
languages/lrc_Arab.json ADDED
@@ -0,0 +1,21 @@
+ {
+   "Language Name": "Northern Luri",
+   "Family": "Indo-European",
+   "Subgrouping": "Iranian",
+   "Number of Speakers": "4_000_000",
+   "Supported by allenai/MADLAD-400 or facebook/flores": "🟨",
+   "Sites": [
+     {
+       "Site Name": "lurishacademy.org",
+       "Site URL": "https://lurishacademy.org/",
+       "Category": "news",
+       "Confidence": "🟩",
+       "Info": "confirmed by a native speaker, in line with the webpage annotation",
+       "Possible Parallel Languages": "",
+       "Links": ["https://lurishacademy.org/articles/خٱلک-ل-ۏر",
+                 "https://lurishacademy.org/articles/ڤیرشناسی",
+                 "https://lurishacademy.org/articles/یٱهۊدیٱت",
+                 "https://lurishacademy.org/articles/کومرٱ-ڤلات-کولومبیا"]
+     }
+   ]
+ }
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ streamlit
+ pandas
+ tabulate
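To preview the Space locally, the usual Streamlit workflow applies: `pip install -r requirements.txt`, then `streamlit run app.py`.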
utils.py ADDED
@@ -0,0 +1,70 @@
+ import os
+ import pandas as pd
+ import streamlit as st
+ import base64
+ import json
+
+ # navigate to url
+ def nav_to(url):
+     nav_script = """
+         <meta http-equiv="refresh" content="0; url='%s'">
+     """ % (url)
+     st.write(nav_script, unsafe_allow_html=True)
+
+
+ @st.cache_data
+ def df_to_html(df):
+
+     df = df.fillna("")
+
+     # Define table styling
+     styles = [
+         {'selector': 'tr', 'props': [('border', 'none')]},  # Hide row borders
+         {'selector': 'td, th', 'props': [('border', 'none'), ("text-align", "center"), ('font-size', 'smaller')]},  # Remove cell borders, reduce font size
+         {'selector': 'tr:hover', 'props': [('background-color', '#f5f5f5')]},
+         {'selector': 'a:hover', 'props': [('color', 'darkblue')]},
+         {'selector': 'table', 'props': [('border-collapse', 'collapse'), ('border', 'none'), ('border-bottom', '1px solid black'), ('width', '50%')]},  # Set table width to 50%
+         {'selector': 'thead', 'props': [('border', 'none')]},  # Hide header border
+         {'selector': 'tbody td', 'props': [('border-left', 'none'), ('border-right', 'none')]},
+         {'selector': 'tr:not(:first-child) td', 'props': [('border-left', 'none'), ('border-right', 'none'), ('border-top', 'none')]},
+         {'selector': 'table', 'props': [('table-layout', 'fixed')]},  # Prevent overflow
+     ]
+
+     # Apply table styles and convert DataFrame to HTML
+     styled_html = df.style.hide(axis="index").set_table_styles(styles).to_html(escape=False, index=False, bold_rows=True, justify='center').replace('<td>', '<td align="center">')
+
+     return styled_html
+
+
+ @st.cache_data
+ def render_svg(svg):
+     """Renders the given svg string."""
+     b64 = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
+     html = rf'<p align="center"> <img src="data:image/svg+xml;base64,{b64}" width="40%"/> </p>'
+     c = st.container()
+     c.write(html, unsafe_allow_html=True)
+
+
+ @st.cache_resource
+ def combine_json_files(folder_path):
+     combined_data = {}
+
+     # Iterate through each file in the folder
+     for filename in os.listdir(folder_path):
+         file_path = os.path.join(folder_path, filename)
+
+         # Check if the file is a JSON file
+         if filename.endswith('.json'):
+             with open(file_path, 'r') as file:
+                 # Load JSON data from the file
+                 data = {filename.replace('.json', ''): json.load(file)}
+
+             # Merge the loaded data into the combined_data dictionary
+             combined_data.update(data)
+
+     return combined_data
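A minimal usage sketch for the loader, assuming it runs from the repository root:

```python
from utils import combine_json_files

# Keys are the JSON file stems, i.e. the <iso>_<Script> identifiers.
data = combine_json_files('./languages')
print(sorted(data))                       # ['ava_Cyrl', 'bal_Arab', 'bar_Latn', ...]
print(data['ava_Cyrl']['Language Name'])  # 'Avaric'
```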