catiR committed
Commit 1e483fc · 1 Parent(s): 009ee74

data + demo
.gitignore CHANGED
@@ -169,3 +169,5 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+**/.DS_Store
Data/133_Annotated_Vowel_Lengths.pdf ADDED
Binary file (166 kB).
 
Data/Length_in_spoken_icelandic.json ADDED
The diff for this file is too large to render.
 
Data/Length_in_spoken_icelandic.tsv ADDED
The diff for this file is too large to render.
 
README.md CHANGED
@@ -1 +1,20 @@
-# length-contrast-data-isl
+---
+title: Length contrasts in spoken Icelandic
+emoji: 📊
+colorFrom: gray
+colorTo: green
+sdk: gradio
+sdk_version: 5.15.0
+app_file: app.py
+pinned: false
+---
+
+## Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences for L1 and L2 Speakers: A Resource for Pronunciation Training
+
+#### NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia
+Authors: Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi
+Bergsson, Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon
+Gudnason
+
+### Get [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf) and annotations from the Data directory,
+### or [see the demo](https://huggingface.co/spaces/clr/length-contrast-data-isl)
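
As a quick orientation to the added data files (not part of the committed code): the JSON is a list of per-token records with fields such as 'word', 'speaker_lang', and 'gold_annotation'/'mfa_annotation', as read by vowel_length.py below; the TSV is presumably the same annotations in flat tabular form. A minimal loading sketch, assuming it is run from the repository root and that pandas is available (it is imported by vowel_length.py):

import json
import pandas as pd

# Token records: each has 'word', 'speaker_lang', and gold/MFA annotation fields
with open('Data/Length_in_spoken_icelandic.json', 'r') as handle:
    tokens = json.load(handle)

# The same annotations as a flat table
table = pd.read_csv('Data/Length_in_spoken_icelandic.tsv', sep='\t')

print(len(tokens), table.shape)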
app.py ADDED
@@ -0,0 +1,204 @@
+import gradio as gr
+import vowel_length as vln
+
+
+annotation_json = 'Data/Length_in_spoken_icelandic.json'
+
+menus, vdata = vln.setup(annotation_json)
+
+
+grouplist = [g for g,ws in menus]
+worddict = {g:ws for g,ws in menus}
+
+
+# update the word menu when a word group is selected
+def get_group_words(group):
+    if group == '[NONE]':
+        choices = ['[NONE]']
+    else:
+        choices = [ '[ALL]' ] + [n for n,v in worddict[group]]
+    return gr.Dropdown(choices = choices, value = choices[0], interactive=True)
+
+
+# fix the speaker-group radio for words that only occur in one speaker group
+def check_word_langs(word,cur_lang):
+    if ' [L' not in word:
+        return gr.Radio(value=cur_lang,interactive=True)
+    elif ' [L1]' in word:
+        return gr.Radio(value='L1',interactive=False)
+    else:
+        return gr.Radio(value='L2',interactive=False)
+
+
+# filter the token table by word(s), speaker group, and annotation source
+def subset_words_spks(g,w,l,s,wsets,db):
+    if w == '[ALL]':
+        swords = [v for n,v in wsets[g]]
+        labl = g
+    else:
+        labl = w.split(' ')[0]
+        swords = [labl]
+
+    if l == 'All':
+        slang = ['L1', 'L2']
+        labl += f'\n L1+L2, '
+    else:
+        slang = [l]
+        labl += f'\n {l}, '
+
+    labl += f'{s}'
+
+    db1 = db.copy()
+    db1 = db1.loc[ (db1['speaker_lang'].isin(slang)) & (db1['word'].isin(swords)) ]
+    db1.reset_index()
+
+    if s.lower() == 'mfa':
+        src = 'mfa'
+    else:
+        assert s[:3].lower() == 'ann'
+        src = 'gold'
+
+    return db1, src, labl
+
+
+# build the duration scatter plot for one or two selections
+def plott(g1,w1,l1,s1,g2,w2,l2,s2):
+
+    dat1,src1,lab1 = subset_words_spks(g1,w1,l1,s1,worddict,vdata)
+
+    if '[NONE]' in [g2, w2]:
+        dat2, l2, src2, lab2 = None, None, None, None
+    else:
+        dat2,src2,lab2 = subset_words_spks(g2,w2,l2,s2,worddict,vdata)
+
+    fig = vln.vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2)
+
+    return fig
+
+
+bl = gr.Blocks(theme=gr.themes.Glass())
+
+with bl:
+
+    with gr.Tabs():
+
+        with gr.TabItem("Vowel quantity"):
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(
+                        """
+                        #### Select data (1)
+                        """
+                    )
+                    gmenu1 = gr.Dropdown(choices=grouplist,label="Group", value='AL:')
+                    wmenu1 = gr.Dropdown(label="Word", choices=['[ALL]'] + [n for n,v in worddict['AL:']])
+                    lmenu1 = gr.Radio(["L1", "L2","All"],label="Speaker group",value="L1")
+                    smenu1 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")
+
+                    gmenu1.change(get_group_words,inputs=[gmenu1],outputs = [wmenu1])
+                    wmenu1.input(check_word_langs,inputs=[wmenu1,lmenu1],outputs = [lmenu1])
+
+
+                with gr.Column():
+                    gr.Markdown(
+                        """
+                        #### Select data (2)
+                        """
+                    )
+                    gmenu2 = gr.Dropdown(choices=['[NONE]'] + grouplist,label="Group", value='[NONE]')
+                    wmenu2 = gr.Dropdown(label="Word", choices=['[NONE]'])
+                    lmenu2 = gr.Radio(choices=["L1", "L2","All"],label="Speaker group",value="L1")
+                    smenu2 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")
+
+                    gmenu2.change(get_group_words,inputs=[gmenu2],outputs = [wmenu2])
+                    wmenu2.input(check_word_langs,inputs=[wmenu2,lmenu2],outputs = [lmenu2])
+
+
+            btn = gr.Button(value="Update Plot")
+            plo = gr.Plot()
+            btn.click(plott, [gmenu1,wmenu1,lmenu1,smenu1,gmenu2,wmenu2,lmenu2,smenu2], plo)
+
+
+            gr.Markdown(
+                """
+                # Long and short Icelandic vowels
+                Check the About tab for more info about the project.
+                """
+            )
+
+
+        with gr.TabItem("About"):
+            gr.Markdown(
+                """
+                ## Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
+                for L1 and L2 Speakers: A Resource for Pronunciation Training
+                """
+            )
+
+            gr.Markdown(
+                """
+                ## Demo: Viewing the data
+                Use the menus to choose words, speaker group, and data source.
+                Words are split into related groups and either the whole group or a single word can be selected.
+                Available speaker groups are native Icelandic speakers (L1), second-language speakers (L2), or all.
+                Data source options are gold (human) annotations or automated Montreal Forced Aligner (MFA).
+
+                The general expectation is that, all else being equal, syllables with long stressed vowels
+                followed by short consonants have a higher vowel:(vowel+consonant) duration ratio,
+                while syllables with short stressed vowels followed by long consonants have a lower ratio.
+
+                Many other factors also affect relative durations in any particular recorded token,
+                and these factors have considerable - not necessarily balanced - variation throughout this dataset.
+                This demo is provided to begin exploring the data and suggest hypotheses for follow-up.
+                See Pind 1999, 'Speech segment durations and quantity in Icelandic'
+                (J. Acoustical Society of America, 106(2)) for a review of the acoustics of Icelandic vowel duration.
+                """
+            )
+
+
+            gr.Markdown(
+                """
+                ## Accessing the data
+
+                Annotations can be downloaded as
+                [json](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.json)
+                or [tsv](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.tsv) files.
+                See [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf)
+                for complete information.
+
+                Audio is available from [Clarin](https://repository.clarin.is/repository/xmlui/) (Samrómur).
+                The 'collection' field plus recording filename in the annotations metadata
+                specify the original audio file, including which Samrómur collection it is found in.
+                """
+            )
+
+
+            gr.Markdown(
+                """
+                ### About
+
+                This annotated data and its demo application accompany the paper
+                *Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
+                for L1 and L2 Speakers: A Resource for Pronunciation Training*, \
+                Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi Bergsson, \
+                Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon Gudnason - NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia.
+
+
+                ### Contact [email protected] about bugs, feedback, or collaboration!
+
+                """
+            )
+
+
+if __name__ == "__main__":
+    bl.launch()
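
The About text above describes the vowel:(vowel+consonant) duration ratio that the plots summarize. As a small worked example of how that ratio separates the two quantity patterns (the computation mirrors prep_dat() in vowel_length.py below; the durations are invented for illustration):

def vc_ratio(v_dur, post_dur):
    # vowel : (vowel + following consonant) duration ratio
    return v_dur / (v_dur + post_dur)

# Invented example durations, in seconds
print(vc_ratio(0.120, 0.060))  # long vowel, short consonant -> ~0.67 (higher ratio)
print(vc_ratio(0.060, 0.120))  # short vowel, long consonant -> ~0.33 (lower ratio)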
requirements.txt ADDED
@@ -0,0 +1,2 @@
+scipy
+matplotlib
vowel_length.py ADDED
@@ -0,0 +1,207 @@
+import os, json
+import numpy as np
+from collections import defaultdict
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+
+
+# make subsets of words for convenience
+def make_sets(db,shorts,longs):
+
+    def _wspec(wd,l1,l2):
+        if (wd in l1) and (wd in l2):
+            return(wd,wd)
+        elif wd in l1:
+            return(f'{wd} [L1]',wd)
+        elif wd in l2:
+            return(f'{wd} [L2]',wd)
+        else:
+            return ('','')
+
+    def _ksrt(k):
+        if ' ' in k:
+            return((k[0],1/len(k)))
+        else:
+            return (k.replace(':',''),k[-1] )
+
+    words = set([(t['word'],t['speaker_lang']) for t in db])
+    l1 = [w for w,l in words if l == 'L1']
+    l2 = [w for w,l in words if l == 'L2']
+    words = set([w for w,l in words])
+
+    wdict = defaultdict(list)
+    for w in words:
+        if 'agg' in w:
+            wdict['AG:'].append(_wspec(w,l1,l2))
+        elif 'all' in w:
+            wdict['AL:'].append(_wspec(w,l1,l2))
+        elif 'egg' in w:
+            wdict['EG:'].append(_wspec(w,l1,l2))
+        elif 'eki' in w:
+            wdict['E:G'].append(_wspec(w,l1,l2))
+        elif 'aki' in w:
+            wdict['A:G'].append(_wspec(w,l1,l2))
+        elif 'ala' in w:
+            wdict['A:L'].append(_wspec(w,l1,l2))
+        elif w in shorts:
+            wdict['OTHER - SHORT'].append(_wspec(w,l1,l2))
+        elif w in longs:
+            wdict['OTHER - LONG'].append(_wspec(w,l1,l2))
+        else:
+            print(f'something should not have happened: {w}')
+
+
+    sets = [(k, sorted(wdict[k])) for k in sorted(list(wdict.keys()),key = _ksrt)]
+
+    return sets
+
+
+# compile data for a token record
+def get_tk_data(tk,shorts,longs):
+
+    # merge intervals
+    # from list of phones
+    # to word part
+    def _merge_intervals(plist):
+        if not plist:
+            return np.nan
+        tot_start, tot_end = plist[0]['start'],plist[-1]['end']
+        tot_dur = tot_end-tot_start
+        return tot_dur
+
+    tkdat = {}
+    tkdat['word'] = tk['word']
+    tkdat['speaker_lang'] = tk['speaker_lang']
+    tkdat['n_pre_phone'] = len(tk['gold_annotation']['prevowel'])
+    tkdat['n_post_phone'] = len(tk['gold_annotation']['postvowel'])
+
+    if tk['word'] in longs:
+        tkdat['vlen'] = 1
+    else:
+        assert tk['word'] in shorts
+        tkdat['vlen'] = 0
+
+    for s in ['gold','mfa']:
+        tkdat[f'{s}_pre_dur'] = _merge_intervals(tk[f'{s}_annotation']['prevowel'])
+        tkdat[f'{s}_v_dur'] = _merge_intervals(tk[f'{s}_annotation']['vowel'])
+        tkdat[f'{s}_post_dur'] = _merge_intervals(tk[f'{s}_annotation']['postvowel'])
+        tkdat[f'{s}_word_dur'] = tk[f'{s}_annotation']['target_word_end'] -\
+                                 tk[f'{s}_annotation']['target_word_start']
+
+    return tkdat
+
+
+# code short vowels 0, long 1
+def prep_dat(d):
+    df = d.copy()
+    for s in ['gold','mfa']:
+        df[f'{s}_ratio'] = df[f'{s}_v_dur'] / (df[f'{s}_v_dur']+df[f'{s}_post_dur'])
+        df[f'{s}_pre_dur'] = df[f'{s}_pre_dur'].fillna(0) # set absent onsets dur zero
+    df = df.convert_dtypes()
+    return df
+
+
+def setup(annot_json):
+
+    longs = set(['aki', 'ala', 'baki', 'bera', 'betri', 'blaki', 'breki',
+                 'brosir', 'dala', 'dreki', 'dvala', 'fala', 'fara', 'færa',
+                 'færi', 'gala', 'hausinn', 'jónas', 'katrín', 'kisa', 'koma',
+                 'leki', 'leyfa', 'maki', 'muna', 'nema', 'raki', 'sama',
+                 'speki', 'svala', 'sækja', 'sömu', 'taki', 'tala', 'tvisvar',
+                 'vala', 'veki', 'vinur', 'ása', 'þaki'])
+
+    shorts = set(['aggi', 'baggi', 'balla', 'beggi', 'eggi', 'farðu', 'fossinn',
+                  'færði', 'galla', 'hausnum', 'herra', 'jónsson', 'kaggi', 'kalla',
+                  'lalla', 'leggi', 'leyfðu', 'maggi', 'malla', 'mamma', 'missa',
+                  'mömmu', 'nærri', 'palla', 'raggi', 'skeggi', 'snemma', 'sunna',
+                  'tommi', 'veggi', 'vinnur', 'ásta'])
+
+    with open(annot_json, 'r') as handle:
+        db = json.load(handle)
+
+    sets = make_sets(db,shorts,longs)
+
+    db = [get_tk_data(tk,shorts,longs) for tk in db]
+    dat = pd.DataFrame.from_records(db)
+    dat = prep_dat(dat)
+
+    return sets,dat
+
+
+def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):
+
+    def _gprep(df,l,s):
+
+        # color by length + speaker group
+        ccs = { "lAll" : (0.0, 0.749, 1.0),
+                "lL1" : (0.122, 0.467, 0.706),
+                "lL2" : (0.282, 0.82, 0.8),
+                "sAll" : (0.89, 0.467, 0.761),
+                "sL1" : (0.863, 0.078, 0.235),
+                "sL2" : (0.859, 0.439, 0.576),
+                "xAll" : (0.988, 0.69, 0.004),
+                "xL1" : (0.984, 0.49, 0.027),
+                "xL2" : (0.969, 0.835, 0.376)}
+
+        vdurs = np.array(df[f'{s}_v_dur'])*1000
+        cdurs = np.array(df[f'{s}_post_dur'])*1000
+        rto = np.mean(df[f'{s}_ratio'])
+
+        if sum(df['vlen']) == 0:
+            vl = 's'
+        elif sum(df['vlen']) == df.shape[0]:
+            vl = 'l'
+        else:
+            vl = 'x'
+
+        cc = ccs[f'{vl}{l}']
+
+        return vdurs, cdurs, rto, cc
+
+
+    vd1,cd1,ra1,cl1 = _gprep(dat1,l1,src1)
+    lab1 += f'\n Ratio: {ra1:.3f}'
+    if src1 == 'gold':
+        mk1 = '^'
+    else:
+        mk1 = '<'
+
+
+    fig, ax = plt.subplots(figsize=(9,7))
+    ax.set_xlim(0.0,350)
+    ax.set_ylim(0.0,350)
+
+    ax.scatter(vd1,cd1,marker = mk1, label = lab1,
+               c = [cl1 + (.7,)], edgecolors = [cl1] )
+
+    if lab2:
+        vd2,cd2,ra2,cl2 = _gprep(dat2,l2,src2)
+        lab2 += f'\n Ratio: {ra2:.3f}'
+        if src2 == 'gold':
+            mk2 = 'v'
+        else:
+            mk2 = '>'
+        ax.scatter(vd2,cd2, marker = mk2, label = lab2,
+                   c = [cl2 + (.05,)], edgecolors = [cl2] )
+
+
+    ax.set_title("Stressed vowel & following consonant(s) duration" )
+    ax.set_xlabel("Vowel duration (ms)")
+    ax.set_ylabel("Consonant duration (ms)")
+    #fig.legend(loc=8,ncols=2)
+    fig.legend(loc=7)
+
+    ax.axline((0,0),slope=1,color="darkgray")
+
+    fig.tight_layout()
+    #fig.subplots_adjust(bottom=0.15)
+    fig.subplots_adjust(right=0.75)
+
+    #plt.xticks(ticks=[50,100,150,200,250,300],labels=[])
+    #plt.yticks(ticks=[100,200,300],labels=[])
+
+    return fig
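
For exploring the data outside the Gradio demo, setup() and vgraph() can also be called directly. A minimal sketch, assuming it runs next to the Data/ directory as in this Space; the word 'tala', the L1 speaker group, and the output filename are arbitrary examples:

import vowel_length as vln

# Build the word-group menus and the per-token duration DataFrame
menus, vdata = vln.setup('Data/Length_in_spoken_icelandic.json')

# Pick one word and speaker group, e.g. L1 speakers saying 'tala'
sub = vdata.loc[(vdata['word'] == 'tala') & (vdata['speaker_lang'] == 'L1')]

# Scatter gold-annotation durations for that subset (no second selection)
fig = vln.vgraph(sub, 'L1', 'gold', 'tala\n L1, Annotated', None, None, None, None)
fig.savefig('tala_L1_gold.png')

Since vowel_length.py selects the non-interactive Agg backend, saving the figure to a file is the natural way to view it.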