catiR committed
Commit 1e483fc · 1 Parent(s): 009ee74

data + demo
.gitignore CHANGED
@@ -169,3 +169,5 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+**/.DS_Store
Data/133_Annotated_Vowel_Lengths.pdf ADDED
Binary file (166 kB).
 
Data/Length_in_spoken_icelandic.json ADDED
The diff for this file is too large to render.
 
Data/Length_in_spoken_icelandic.tsv ADDED
The diff for this file is too large to render.
 
README.md CHANGED
@@ -1 +1,20 @@
-# length-contrast-data-isl
+---
+title: Length contrasts in spoken Icelandic
+emoji: 📊
+colorFrom: gray
+colorTo: green
+sdk: gradio
+sdk_version: 5.15.0
+app_file: app.py
+pinned: false
+---
+
+## Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences for L1 and L2 Speakers: A Resource for Pronunciation Training
+
+#### NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia
+Authors: Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi
+Bergsson, Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon
+Gudnason
+
+### Get [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf) and annotations from the Data directory,
+### or [see the demo](https://huggingface.co/spaces/clr/length-contrast-data-isl)
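
As a quick orientation to the added data files (not part of the committed code): the JSON is a list of per-token records with fields such as 'word', 'speaker_lang', and 'gold_annotation'/'mfa_annotation', as read by vowel_length.py below; the TSV is presumably the same annotations in flat tabular form. A minimal loading sketch, assuming it is run from the repository root and that pandas is available (it is imported by vowel_length.py):

import json
import pandas as pd

# Token records: each has 'word', 'speaker_lang', and gold/MFA annotation fields
with open('Data/Length_in_spoken_icelandic.json', 'r') as handle:
    tokens = json.load(handle)

# The same annotations as a flat table
table = pd.read_csv('Data/Length_in_spoken_icelandic.tsv', sep='\t')

print(len(tokens), table.shape)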
app.py ADDED
@@ -0,0 +1,204 @@
+import gradio as gr
+import vowel_length as vln
+
+
+annotation_json = 'Data/Length_in_spoken_icelandic.json'
+
+menus, vdata = vln.setup(annotation_json)
+
+
+grouplist = [g for g,ws in menus]
+worddict = {g:ws for g,ws in menus}
+
+
+# update the word menu when a word group is selected
+def get_group_words(group):
+    if group == '[NONE]':
+        choices = ['[NONE]']
+    else:
+        choices = [ '[ALL]' ] + [n for n,v in worddict[group]]
+    return gr.Dropdown(choices = choices, value = choices[0], interactive=True)
+
+
+# fix the speaker-group radio for words that only occur in one speaker group
+def check_word_langs(word,cur_lang):
+    if ' [L' not in word:
+        return gr.Radio(value=cur_lang,interactive=True)
+    elif ' [L1]' in word:
+        return gr.Radio(value='L1',interactive=False)
+    else:
+        return gr.Radio(value='L2',interactive=False)
+
+
+# filter the token table by word(s), speaker group, and annotation source
+def subset_words_spks(g,w,l,s,wsets,db):
+    if w == '[ALL]':
+        swords = [v for n,v in wsets[g]]
+        labl = g
+    else:
+        labl = w.split(' ')[0]
+        swords = [labl]
+
+    if l == 'All':
+        slang = ['L1', 'L2']
+        labl += f'\n L1+L2, '
+    else:
+        slang = [l]
+        labl += f'\n {l}, '
+
+    labl += f'{s}'
+
+    db1 = db.copy()
+    db1 = db1.loc[ (db1['speaker_lang'].isin(slang)) & (db1['word'].isin(swords)) ]
+    db1.reset_index()
+
+    if s.lower() == 'mfa':
+        src = 'mfa'
+    else:
+        assert s[:3].lower() == 'ann'
+        src = 'gold'
+
+    return db1, src, labl
+
+
+# build the duration scatter plot for one or two selections
+def plott(g1,w1,l1,s1,g2,w2,l2,s2):
+
+    dat1,src1,lab1 = subset_words_spks(g1,w1,l1,s1,worddict,vdata)
+
+    if '[NONE]' in [g2, w2]:
+        dat2, l2, src2, lab2 = None, None, None, None
+    else:
+        dat2,src2,lab2 = subset_words_spks(g2,w2,l2,s2,worddict,vdata)
+
+    fig = vln.vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2)
+
+    return fig
+
+
+bl = gr.Blocks(theme=gr.themes.Glass())
+
+with bl:
+
+    with gr.Tabs():
+
+        with gr.TabItem("Vowel quantity"):
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(
+                        """
+                        #### Select data (1)
+                        """
+                    )
+                    gmenu1 = gr.Dropdown(choices=grouplist,label="Group", value='AL:')
+                    wmenu1 = gr.Dropdown(label="Word", choices=['[ALL]'] + [n for n,v in worddict['AL:']])
+                    lmenu1 = gr.Radio(["L1", "L2","All"],label="Speaker group",value="L1")
+                    smenu1 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")
+
+                    gmenu1.change(get_group_words,inputs=[gmenu1],outputs = [wmenu1])
+                    wmenu1.input(check_word_langs,inputs=[wmenu1,lmenu1],outputs = [lmenu1])
+
+
+                with gr.Column():
+                    gr.Markdown(
+                        """
+                        #### Select data (2)
+                        """
+                    )
+                    gmenu2 = gr.Dropdown(choices=['[NONE]'] + grouplist,label="Group", value='[NONE]')
+                    wmenu2 = gr.Dropdown(label="Word", choices=['[NONE]'])
+                    lmenu2 = gr.Radio(choices=["L1", "L2","All"],label="Speaker group",value="L1")
+                    smenu2 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")
+
+                    gmenu2.change(get_group_words,inputs=[gmenu2],outputs = [wmenu2])
+                    wmenu2.input(check_word_langs,inputs=[wmenu2,lmenu2],outputs = [lmenu2])
+
+
+            btn = gr.Button(value="Update Plot")
+            plo = gr.Plot()
+            btn.click(plott, [gmenu1,wmenu1,lmenu1,smenu1,gmenu2,wmenu2,lmenu2,smenu2], plo)
+
+
+            gr.Markdown(
+                """
+                # Long and short Icelandic vowels
+                Check the About tab for more info about the project.
+                """
+            )
+
+
+        with gr.TabItem("About"):
+            gr.Markdown(
+                """
+                ## Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
+                for L1 and L2 Speakers: A Resource for Pronunciation Training
+                """
+            )
+
+            gr.Markdown(
+                """
+                ## Demo: Viewing the data
+                Use the menus to choose words, speaker group, and data source.
+                Words are split into related groups and either the whole group or a single word can be selected.
+                Available speaker groups are native Icelandic speakers (L1), second-language speakers (L2), or all.
+                Data source options are gold (human) annotations or automated Montreal Forced Aligner (MFA).
+
+                The general expectation is that, all else being equal, syllables with long stressed vowels
+                followed by short consonants have a higher vowel:(vowel+consonant) duration ratio,
+                while syllables with short stressed vowels followed by long consonants have a lower ratio.
+
+                Many other factors also affect relative durations in any particular recorded token,
+                and these factors have considerable - not necessarily balanced - variation throughout this dataset.
+                This demo is provided to begin exploring the data and suggest hypotheses for follow-up.
+                See Pind 1999, 'Speech segment durations and quantity in Icelandic'
+                (J. Acoustical Society of America, 106(2)) for a review of the acoustics of Icelandic vowel duration.
+                """
+            )
+
+
+            gr.Markdown(
+                """
+                ## Accessing the data
+
+                Annotations can be downloaded as
+                [json](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.json)
+                or [tsv](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.tsv) files.
+                See [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf)
+                for complete information.
+
+                Audio is available from [Clarin](https://repository.clarin.is/repository/xmlui/) (Samrómur).
+                The 'collection' field plus recording filename in the annotations metadata
+                specify the original audio file, including which Samrómur collection it is found in.
+                """
+            )
+
+
+            gr.Markdown(
+                """
+                ### About
+
+                This annotated data and its demo application accompany the paper
+                *Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
+                for L1 and L2 Speakers: A Resource for Pronunciation Training*, \
+                Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi Bergsson, \
+                Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon Gudnason - NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia.
+
+
+                ### Contact [email protected] about bugs, feedback, or collaboration!
+
+                """
+            )
+
+
+if __name__ == "__main__":
+    bl.launch()
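
The About text above describes the vowel:(vowel+consonant) duration ratio that the plots summarize. As a small worked example of how that ratio separates the two quantity patterns (the computation mirrors prep_dat() in vowel_length.py below; the durations are invented for illustration):

def vc_ratio(v_dur, post_dur):
    # vowel : (vowel + following consonant) duration ratio
    return v_dur / (v_dur + post_dur)

# Invented example durations, in seconds
print(vc_ratio(0.120, 0.060))  # long vowel, short consonant -> ~0.67 (higher ratio)
print(vc_ratio(0.060, 0.120))  # short vowel, long consonant -> ~0.33 (lower ratio)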
requirements.txt ADDED
@@ -0,0 +1,2 @@
+scipy
+matplotlib
vowel_length.py ADDED
@@ -0,0 +1,207 @@
+import os, json
+import numpy as np
+from collections import defaultdict
+import pandas as pd
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+
+
+# make subsets of words for convenience
+def make_sets(db,shorts,longs):
+
+    def _wspec(wd,l1,l2):
+        if (wd in l1) and (wd in l2):
+            return(wd,wd)
+        elif wd in l1:
+            return(f'{wd} [L1]',wd)
+        elif wd in l2:
+            return(f'{wd} [L2]',wd)
+        else:
+            return ('','')
+
+    def _ksrt(k):
+        if ' ' in k:
+            return((k[0],1/len(k)))
+        else:
+            return (k.replace(':',''),k[-1] )
+
+    words = set([(t['word'],t['speaker_lang']) for t in db])
+    l1 = [w for w,l in words if l == 'L1']
+    l2 = [w for w,l in words if l == 'L2']
+    words = set([w for w,l in words])
+
+    wdict = defaultdict(list)
+    for w in words:
+        if 'agg' in w:
+            wdict['AG:'].append(_wspec(w,l1,l2))
+        elif 'all' in w:
+            wdict['AL:'].append(_wspec(w,l1,l2))
+        elif 'egg' in w:
+            wdict['EG:'].append(_wspec(w,l1,l2))
+        elif 'eki' in w:
+            wdict['E:G'].append(_wspec(w,l1,l2))
+        elif 'aki' in w:
+            wdict['A:G'].append(_wspec(w,l1,l2))
+        elif 'ala' in w:
+            wdict['A:L'].append(_wspec(w,l1,l2))
+        elif w in shorts:
+            wdict['OTHER - SHORT'].append(_wspec(w,l1,l2))
+        elif w in longs:
+            wdict['OTHER - LONG'].append(_wspec(w,l1,l2))
+        else:
+            print(f'something should not have happened: {w}')
+
+
+    sets = [(k, sorted(wdict[k])) for k in sorted(list(wdict.keys()),key = _ksrt)]
+
+    return sets
+
+
+# compile data for a token record
+def get_tk_data(tk,shorts,longs):
+
+    # merge intervals
+    # from list of phones
+    # to word part
+    def _merge_intervals(plist):
+        if not plist:
+            return np.nan
+        tot_start, tot_end = plist[0]['start'],plist[-1]['end']
+        tot_dur = tot_end-tot_start
+        return tot_dur
+
+    tkdat = {}
+    tkdat['word'] = tk['word']
+    tkdat['speaker_lang'] = tk['speaker_lang']
+    tkdat['n_pre_phone'] = len(tk['gold_annotation']['prevowel'])
+    tkdat['n_post_phone'] = len(tk['gold_annotation']['postvowel'])
+
+    if tk['word'] in longs:
+        tkdat['vlen'] = 1
+    else:
+        assert tk['word'] in shorts
+        tkdat['vlen'] = 0
+
+    for s in ['gold','mfa']:
+        tkdat[f'{s}_pre_dur'] = _merge_intervals(tk[f'{s}_annotation']['prevowel'])
+        tkdat[f'{s}_v_dur'] = _merge_intervals(tk[f'{s}_annotation']['vowel'])
+        tkdat[f'{s}_post_dur'] = _merge_intervals(tk[f'{s}_annotation']['postvowel'])
+        tkdat[f'{s}_word_dur'] = tk[f'{s}_annotation']['target_word_end'] -\
+                                 tk[f'{s}_annotation']['target_word_start']
+
+    return tkdat
+
+
+# code short vowels 0, long 1
+def prep_dat(d):
+    df = d.copy()
+    for s in ['gold','mfa']:
+        df[f'{s}_ratio'] = df[f'{s}_v_dur'] / (df[f'{s}_v_dur']+df[f'{s}_post_dur'])
+        df[f'{s}_pre_dur'] = df[f'{s}_pre_dur'].fillna(0) # set absent onsets dur zero
+    df = df.convert_dtypes()
+    return df
+
+
+def setup(annot_json):
+
+    longs = set(['aki', 'ala', 'baki', 'bera', 'betri', 'blaki', 'breki',
+                 'brosir', 'dala', 'dreki', 'dvala', 'fala', 'fara', 'færa',
+                 'færi', 'gala', 'hausinn', 'jónas', 'katrín', 'kisa', 'koma',
+                 'leki', 'leyfa', 'maki', 'muna', 'nema', 'raki', 'sama',
+                 'speki', 'svala', 'sækja', 'sömu', 'taki', 'tala', 'tvisvar',
+                 'vala', 'veki', 'vinur', 'ása', 'þaki'])
+
+    shorts = set(['aggi', 'baggi', 'balla', 'beggi', 'eggi', 'farðu', 'fossinn',
+                  'færði', 'galla', 'hausnum', 'herra', 'jónsson', 'kaggi', 'kalla',
+                  'lalla', 'leggi', 'leyfðu', 'maggi', 'malla', 'mamma', 'missa',
+                  'mömmu', 'nærri', 'palla', 'raggi', 'skeggi', 'snemma', 'sunna',
+                  'tommi', 'veggi', 'vinnur', 'ásta'])
+
+    with open(annot_json, 'r') as handle:
+        db = json.load(handle)
+
+    sets = make_sets(db,shorts,longs)
+
+    db = [get_tk_data(tk,shorts,longs) for tk in db]
+    dat = pd.DataFrame.from_records(db)
+    dat = prep_dat(dat)
+
+    return sets,dat
+
+
+def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):
+
+    def _gprep(df,l,s):
+
+        # color by length + speaker group
+        ccs = { "lAll" : (0.0, 0.749, 1.0),
+                "lL1" : (0.122, 0.467, 0.706),
+                "lL2" : (0.282, 0.82, 0.8),
+                "sAll" : (0.89, 0.467, 0.761),
+                "sL1" : (0.863, 0.078, 0.235),
+                "sL2" : (0.859, 0.439, 0.576),
+                "xAll" : (0.988, 0.69, 0.004),
+                "xL1" : (0.984, 0.49, 0.027),
+                "xL2" : (0.969, 0.835, 0.376)}
+
+        vdurs = np.array(df[f'{s}_v_dur'])*1000
+        cdurs = np.array(df[f'{s}_post_dur'])*1000
+        rto = np.mean(df[f'{s}_ratio'])
+
+        if sum(df['vlen']) == 0:
+            vl = 's'
+        elif sum(df['vlen']) == df.shape[0]:
+            vl = 'l'
+        else:
+            vl = 'x'
+
+        cc = ccs[f'{vl}{l}']
+
+        return vdurs, cdurs, rto, cc
+
+
+    vd1,cd1,ra1,cl1 = _gprep(dat1,l1,src1)
+    lab1 += f'\n Ratio: {ra1:.3f}'
+    if src1 == 'gold':
+        mk1 = '^'
+    else:
+        mk1 = '<'
+
+
+    fig, ax = plt.subplots(figsize=(9,7))
+    ax.set_xlim(0.0,350)
+    ax.set_ylim(0.0,350)
+
+    ax.scatter(vd1,cd1,marker = mk1, label = lab1,
+               c = [cl1 + (.7,)], edgecolors = [cl1] )
+
+    if lab2:
+        vd2,cd2,ra2,cl2 = _gprep(dat2,l2,src2)
+        lab2 += f'\n Ratio: {ra2:.3f}'
+        if src2 == 'gold':
+            mk2 = 'v'
+        else:
+            mk2 = '>'
+        ax.scatter(vd2,cd2, marker = mk2, label = lab2,
+                   c = [cl2 + (.05,)], edgecolors = [cl2] )
+
+
+    ax.set_title("Stressed vowel & following consonant(s) duration" )
+    ax.set_xlabel("Vowel duration (ms)")
+    ax.set_ylabel("Consonant duration (ms)")
+    #fig.legend(loc=8,ncols=2)
+    fig.legend(loc=7)
+
+    ax.axline((0,0),slope=1,color="darkgray")
+
+    fig.tight_layout()
+    #fig.subplots_adjust(bottom=0.15)
+    fig.subplots_adjust(right=0.75)
+
+    #plt.xticks(ticks=[50,100,150,200,250,300],labels=[])
+    #plt.yticks(ticks=[100,200,300],labels=[])
+
+    return fig
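
For exploring the data outside the Gradio demo, setup() and vgraph() can also be called directly. A minimal sketch, assuming it runs next to the Data/ directory as in this Space; the word 'tala', the L1 speaker group, and the output filename are arbitrary examples:

import vowel_length as vln

# Build the word-group menus and the per-token duration DataFrame
menus, vdata = vln.setup('Data/Length_in_spoken_icelandic.json')

# Pick one word and speaker group, e.g. L1 speakers saying 'tala'
sub = vdata.loc[(vdata['word'] == 'tala') & (vdata['speaker_lang'] == 'L1')]

# Scatter gold-annotation durations for that subset (no second selection)
fig = vln.vgraph(sub, 'L1', 'gold', 'tala\n L1, Annotated', None, None, None, None)
fig.savefig('tala_L1_gold.png')

Since vowel_length.py selects the non-interactive Agg backend, saving the figure to a file is the natural way to view it.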