Commit
·
e15c6b0
1
Parent(s):
4cea18a
add cas, lzz, anp and edit misc.
Browse files
- app.py +5 -3
- languages/anp_Deva.json +20 -0
- languages/cas_Latn.json +23 -0
- languages/hac_Arab.json +1 -1
- languages/lzz_Latn.json +21 -0
- languages/snk_Latn.json +1 -1
- languages/tet_Latn.json +3 -3
app.py
CHANGED
@@ -24,10 +24,10 @@ def render_home_table():
|
|
24 |
df_data['ISO Code'] = df_data['ISO Code'].astype(str) # Convert to string
|
25 |
df_data['Number of Sites'] = df_data.apply(lambda row: '<a href="/?isocode={}&site=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Sites']), axis=1)
|
26 |
df_data['Number of Links'] = df_data.apply(lambda row: '<a href="/?isocode={}&links=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Links']), axis=1)
|
27 |
-
df_data["Supported by
|
28 |
|
29 |
# Display the table
|
30 |
-
df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping', 'Number of Sites', 'Number of Links', 'Number of Speakers', 'Supported by
|
31 |
st.write(df_to_html(df_data), unsafe_allow_html=True)
|
32 |
|
33 |
|
@@ -103,6 +103,8 @@ def main():
|
|
103 |
else:
|
104 |
# show home
|
105 |
render_metadata()
|
106 |
-
st.markdown("**GlotWeb** is an indexing service for low-resource languages. It indexes sites or links written in each language. This list can be used to create raw text or parallel corpora and to study low-resource languages on the web
|
107 |
render_home_table()
|
|
|
|
|
108 |
main()
|
|
|
24 |
df_data['ISO Code'] = df_data['ISO Code'].astype(str) # Convert to string
|
25 |
df_data['Number of Sites'] = df_data.apply(lambda row: '<a href="/?isocode={}&site=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Sites']), axis=1)
|
26 |
df_data['Number of Links'] = df_data.apply(lambda row: '<a href="/?isocode={}&links=True" target="_self">{}</a>'.format(row['ISO Code'], row['Number of Links']), axis=1)
|
27 |
+
df_data["Supported by MADLAD400 & FLORES & GLOT500"] = df_data.apply(lambda row: color_mapping([row["Supported by allenai/MADLAD-400"] + row["Supported by facebook/flores"] + row["Supported by cis-lmu/Glot500"]]), axis =1)
|
28 |
|
29 |
# Display the table
|
30 |
+
df_data = df_data[['ISO Code', 'Language Name', 'Family', 'Subgrouping', 'Number of Sites', 'Number of Links', 'Number of Speakers', 'Supported by MADLAD400 & FLORES & GLOT500']]
|
31 |
st.write(df_to_html(df_data), unsafe_allow_html=True)
|
32 |
|
33 |
|
|
|
103 |
else:
|
104 |
# show home
|
105 |
render_metadata()
|
106 |
+
st.markdown("**GlotWeb** is an indexing service for low-resource languages. It indexes **non-religous** sites or links written in each language. This list can be used to create raw text or parallel corpora and to study low-resource languages on the web.\n")
|
107 |
render_home_table()
|
108 |
+
st.markdown("\n\nWe compare the level of support for these languages in the three big datasets ([MADLAD-400](https://huggingface.co/datasets/allenai/MADLAD-400), [FLORES200](https://huggingface.co/datasets/facebook/flores), [GLOT500](https://huggingface.co/datasets/cis-lmu/Glot500)) of low-resource languages (🟥 0/3 < 🟧 1/3 < 🟨 2/3 < 🟩 3/3). Although the support in these datasets for some of these languages could be just the religious texts.", unsafe_allow_html=True)
|
109 |
+
|
110 |
main()
|
languages/anp_Deva.json
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Language Name": "Angika",
|
3 |
+
"Family": "Indo-European",
|
4 |
+
"Subgrouping": "Eastern Indo-Aryan",
|
5 |
+
"Number of Speakers": "15_000_000",
|
6 |
+
"Supported by allenai/MADLAD-400": 1,
|
7 |
+
"Supported by facebook/flores": 0,
|
8 |
+
"Supported by cis-lmu/Glot500": 0,
|
9 |
+
"Sites": [
|
10 |
+
{
|
11 |
+
"Site Name": "angika.com",
|
12 |
+
"Site URL": "https://www.angika.com/#angika",
|
13 |
+
"Category": "blog",
|
14 |
+
"Confidence": "🟩",
|
15 |
+
"Info": "confirmed by glotlid and webpage metadata.",
|
16 |
+
"Possible Parallel Languages": "eng_Latn, hin_Deva",
|
17 |
+
"Links": []
|
18 |
+
}
|
19 |
+
]
|
20 |
+
}
|
languages/cas_Latn.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Language Name": "Tsimané",
|
3 |
+
"Family": "Mosetén-Chimané",
|
4 |
+
"Subgrouping": "",
|
5 |
+
"Number of Speakers": "5_300",
|
6 |
+
"Supported by allenai/MADLAD-400": 0,
|
7 |
+
"Supported by facebook/flores": 0,
|
8 |
+
"Supported by cis-lmu/Glot500": 0,
|
9 |
+
"Sites": [
|
10 |
+
{
|
11 |
+
"Site Name": "tsimanelinguisticouniverso.wordpress.com",
|
12 |
+
"Site URL": "https://tsimanelinguisticouniverso.wordpress.com/",
|
13 |
+
"Category": "blog",
|
14 |
+
"Confidence": "🟩",
|
15 |
+
"Info": "confirmed by glotlid and webpage metadata - some posts have spanish translation.",
|
16 |
+
"Possible Parallel Languages": "spa_Latn",
|
17 |
+
"Links": ["https://tsimanelinguisticouniverso.wordpress.com/2015/07/28/49/",
|
18 |
+
"https://tsimanelinguisticouniverso.wordpress.com/2015/07/28/conozcamos-la-lengua-tsimane-no-1/",
|
19 |
+
"https://tsimanelinguisticouniverso.wordpress.com/2015/05/07/jun-chuc-carijtacdye-yu/",
|
20 |
+
"https://tsimanelinguisticouniverso.wordpress.com/2015/02/17/patuju/"]
|
21 |
+
}
|
22 |
+
]
|
23 |
+
}
|
languages/hac_Arab.json
CHANGED
@@ -8,7 +8,7 @@
|
|
8 |
"Supported by cis-lmu/Glot500": 0,
|
9 |
"Sites": [
|
10 |
{
|
11 |
-
"Site Name": "anfsorani.com
|
12 |
"Site URL": "https://anfsorani.com/هۆرامی",
|
13 |
"Category": "news",
|
14 |
"Confidence": "🟩",
|
|
|
8 |
"Supported by cis-lmu/Glot500": 0,
|
9 |
"Sites": [
|
10 |
{
|
11 |
+
"Site Name": "anfsorani.com",
|
12 |
"Site URL": "https://anfsorani.com/هۆرامی",
|
13 |
"Category": "news",
|
14 |
"Confidence": "🟩",
|
languages/lzz_Latn.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Language Name": "Laz",
|
3 |
+
"Family": "Kartvelian",
|
4 |
+
"Subgrouping": "Zan",
|
5 |
+
"Number of Speakers": "22_000",
|
6 |
+
"Supported by allenai/MADLAD-400": 0,
|
7 |
+
"Supported by facebook/flores": 0,
|
8 |
+
"Supported by cis-lmu/Glot500": 0,
|
9 |
+
"Sites": [
|
10 |
+
{
|
11 |
+
"Site Name": "kolkhoba.org",
|
12 |
+
"Site URL": "https://www.kolkhoba.org/lazuri.htm",
|
13 |
+
"Category": "articles",
|
14 |
+
"Confidence": "🟩",
|
15 |
+
"Info": "confirmed by webpage metadata",
|
16 |
+
"Possible Parallel Languages": "tur_Latn",
|
17 |
+
"Links": []
|
18 |
+
}
|
19 |
+
]
|
20 |
+
}
|
21 |
+
|
languages/snk_Latn.json
CHANGED
@@ -8,7 +8,7 @@
|
|
8 |
"Supported by cis-lmu/Glot500": 0,
|
9 |
"Sites": [
|
10 |
{
|
11 |
-
"Site Name": "soninkara.com
|
12 |
"Site URL": "http://www.soninkara.com/snk/",
|
13 |
"Category": "news,forums",
|
14 |
"Confidence": "🟩",
|
|
|
8 |
"Supported by cis-lmu/Glot500": 0,
|
9 |
"Sites": [
|
10 |
{
|
11 |
+
"Site Name": "soninkara.com",
|
12 |
"Site URL": "http://www.soninkara.com/snk/",
|
13 |
"Category": "news,forums",
|
14 |
"Confidence": "🟩",
|
languages/tet_Latn.json
CHANGED
@@ -17,13 +17,13 @@
|
|
17 |
"Links": []
|
18 |
},
|
19 |
{
|
20 |
-
"Site Name": "timor-leste.gov.tl
|
21 |
"Site URL": "http://timor-leste.gov.tl/?lang=tp",
|
22 |
"Category": "government",
|
23 |
"Confidence": "🟩",
|
24 |
"Info": "confirmed by webpage metadata as tet_Latn",
|
25 |
"Possible Parallel Languages": "eng_Latn, por_Latn",
|
26 |
-
"Links": []
|
27 |
},
|
28 |
{
|
29 |
"Site Name": "belun.tl",
|
@@ -32,7 +32,7 @@
|
|
32 |
"Confidence": "🟩",
|
33 |
"Info": "confirmed by webpage metadata as tet_Latn",
|
34 |
"Possible Parallel Languages": "eng_Latn",
|
35 |
-
"Links": []
|
36 |
},
|
37 |
{
|
38 |
"Site Name": "tempotimor.com",
|
|
|
17 |
"Links": []
|
18 |
},
|
19 |
{
|
20 |
+
"Site Name": "timor-leste.gov.tl",
|
21 |
"Site URL": "http://timor-leste.gov.tl/?lang=tp",
|
22 |
"Category": "government",
|
23 |
"Confidence": "🟩",
|
24 |
"Info": "confirmed by webpage metadata as tet_Latn",
|
25 |
"Possible Parallel Languages": "eng_Latn, por_Latn",
|
26 |
+
"Links": ["http://timor-leste.gov.tl/wp-content/uploads/2021/08/TT-2021-08-24-debate-PN_autorizacao_EE17.pdf"]
|
27 |
},
|
28 |
{
|
29 |
"Site Name": "belun.tl",
|
|
|
32 |
"Confidence": "🟩",
|
33 |
"Info": "confirmed by webpage metadata as tet_Latn",
|
34 |
"Possible Parallel Languages": "eng_Latn",
|
35 |
+
"Links": ["https://belun.tl/wp-content/uploads/2015/12/Relatoriu-Politika-CPD-RDTL.pdf"]
|
36 |
},
|
37 |
{
|
38 |
"Site Name": "tempotimor.com",
|