Spaces · Sleeping
catiR committed
Commit ecd5f69 · 1 Parent(s): bd7c83f
densities
Browse files:
- app.py +75 -14
- vowel_length.py +83 -14
app.py CHANGED
@@ -78,7 +78,7 @@ def plott(g1,w1,l1,s1,g2,w2,l2,s2):



-bl = gr.Blocks(theme=gr.themes.Glass())
+bl = gr.Blocks()#theme=gr.themes.Glass())

 with bl:

@@ -108,8 +108,8 @@ with bl:
 #### Select data (2)
 """
 )
-gmenu2 = gr.Dropdown(choices=['[NONE]'] + grouplist,label="Group", value='
-wmenu2 = gr.Dropdown(label="Word", choices=['[
+gmenu2 = gr.Dropdown(choices=['[NONE]'] + grouplist,label="Group", value='A:L')
+wmenu2 = gr.Dropdown(label="Word", choices=['[ALL]'] + [n for n,v in worddict['A:L']])
 lmenu2 = gr.Radio(choices=["L1", "L2","All"],label="Speaker group",value="L1")
 smenu2 = gr.Dropdown(["Annotated", "MFA"],label="Source",value="Annotated")

@@ -118,7 +118,7 @@ with bl:


 btn = gr.Button(value="Update Plot")
-plo = gr.Plot()
+plo = gr.Plot(value=plott('AL:','[ALL]',"L1","Annotated",'A:L','[ALL]',"L1","Annotated"))
 btn.click(plott, [gmenu1,wmenu1,lmenu1,smenu1,gmenu2,wmenu2,lmenu2,smenu2], plo)


@@ -140,6 +140,21 @@ with bl:
 """
 )

+
+gr.Markdown(
+"""
+### About
+
+This annotated data and its demo application accompany the paper
+*Assessed and Annotated Vowel Lengths in Spoken Icelandic Sentences\
+for L1 and L2 Speakers: A Resource for Pronunciation Training*, \
+Caitlin Laura Richter, Kolbrún Friðriksdóttir, Kormákur Logi Bergsson, \
+Erik Anders Maher, Ragnheiður María Benediktsdóttir, Jon Gudnason - NoDaLiDa/Baltic-HLT 2025, Tallinn, Estonia.
+
+
+"""
+)
+
 gr.Markdown(
 """
 ## Demo: Viewing the data
@@ -148,6 +163,9 @@ with bl:
 Available speaker groups are native Icelandic speakers (L1), second-language speakers (L2), or all.
 Data source options are gold (human) annotations or automated Montreal Forced Aligner (MFA).

+The display is a scatter plot of vowel and consonant durations,
+supplemented with density plots for each dimension separately.
+
 The general expectation is that, all else being equal, syllables with long stressed vowels
 followed by short consonants have a higher vowel:(vowel+consonant) duration ratio,
 while syllables with short stressed vowels followed by long consonants have a lower ratio.
@@ -161,7 +179,6 @@ with bl:
 )


-
 gr.Markdown(
 """
 ## Accessing the data
@@ -171,25 +188,69 @@ with bl:
 or [tsv](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/Length_in_spoken_icelandic.tsv) files.
 See [the paper](https://github.com/catiR/length-contrast-data-isl/blob/main/Data/133_Annotated_Vowel_Lengths.pdf)
 for complete information.
+"""
+)

+gr.Markdown(
+"""
 Audio is available from [Clarin](https://repository.clarin.is/repository/xmlui/) (Samrómur).
 The 'collection' field plus recording filename in the annotations metadata
 specify the original audio file, including which Samrómur collection it is found in.
 """
 )

-
 gr.Markdown(
-
-
-
-
-
-
-
-
+"""
+Annotation records are in the following scheme:
+
+```
+[ { recording: source-file-id.wav,
+    collection: samromur-collection,
+    speaker_lang: L1/L2,
+    word: target-word,
+    word_context: {
+        normalised: normalised-carrier-sentence-text,
+        before: sentence-context-preceding-token,
+        after: sentence-context-following-token
+        },
+    gold_annotation: {
+        target_word_start: seconds,
+        target_word_end: seconds,
+        prevowel: [ {
+            phone: ipa-character,
+            start: seconds,
+            end: seconds,
+            },
+            { phone2 ... } ,
+        ],
+        vowel: [ {
+            phone: ipa-character,
+            start: seconds,
+            end: seconds,
+            },
+        ],
+        postvowel: [ {
+            phone: ipa-character,
+            start: seconds,
+            end: seconds,
+            },
+        ]
+        },
+    mfa_annotation : {
+        ... as for gold ...
+        }
+    },
+]
+```
+
+"""
+)


+
+
+gr.Markdown(
+"""
 ### Contact [email protected] about bugs, feedback, or collaboration!

 """
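For orientation, here is a minimal sketch (not part of the commit) of reading records in the annotation scheme documented above and computing the vowel:(vowel+consonant) duration ratio that the demo text describes. The file name `length_annotations.json` and the helper `span_dur` are illustrative assumptions; the field names follow the documented scheme, durations are taken from the first interval's start to the last interval's end (mirroring `_merge_intervals` in vowel_length.py), and records with empty interval lists are not handled.

```python
import json

# Assumed local path to the annotation JSON; not a file name taken from the repo.
with open('length_annotations.json', 'r') as handle:
    records = json.load(handle)

def span_dur(plist):
    # Duration from the start of the first interval to the end of the last,
    # the same span that _merge_intervals() computes in vowel_length.py.
    return plist[-1]['end'] - plist[0]['start']

for tk in records[:3]:
    gold = tk['gold_annotation']
    v = span_dur(gold['vowel'])        # stressed vowel duration (seconds)
    c = span_dur(gold['postvowel'])    # following consonant(s) duration (seconds)
    ratio = v / (v + c)                # expected higher for long-vowel syllables, lower for short
    print(tk['word'], tk['speaker_lang'], f'{ratio:.3f}')
```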
vowel_length.py CHANGED
@@ -5,7 +5,9 @@ import pandas as pd
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as plt
-
+from scipy.stats import gaussian_kde
+#from scipy.spatial import KDTree
+#from sklearn.neighbors import NearestNeighbors

 # make subsets of words for convenience
 def make_sets(db,shorts,longs):
@@ -70,7 +72,7 @@ def get_tk_data(tk,shorts,longs):
         tot_start, tot_end = plist[0]['start'],plist[-1]['end']
         tot_dur = tot_end-tot_start
         return tot_dur
-
+
     tkdat = {}
     tkdat['word'] = tk['word']
     tkdat['speaker_lang'] = tk['speaker_lang']
@@ -89,7 +91,7 @@ def get_tk_data(tk,shorts,longs):
         tkdat[f'{s}_post_dur'] = _merge_intervals(tk[f'{s}_annotation']['postvowel'])
         tkdat[f'{s}_word_dur'] = tk[f'{s}_annotation']['target_word_end'] -\
             tk[f'{s}_annotation']['target_word_start']
-
+
     return tkdat


@@ -118,18 +120,44 @@ def setup(annot_json):
         'mömmu', 'nærri', 'palla', 'raggi', 'skeggi', 'snemma', 'sunna',
         'tommi', 'veggi','vinnur', 'ásta'])

+    # very basic remove about 5 outliers > 350ms
+    cut=0.35
+
     with open(annot_json, 'r') as handle:
         db = json.load(handle)

     sets = make_sets(db,shorts,longs)

     db = [get_tk_data(tk,shorts,longs) for tk in db]
+    db = [t for t in db if ((t['gold_v_dur'] <=cut) and (t['gold_post_dur'] <=cut))]
     dat = pd.DataFrame.from_records(db)
     dat = prep_dat(dat)

     return sets,dat


+def kldiv(s1,s2):
+    _log = lambda x: np.log2(x) if x != 0 else 0
+    _log = np.vectorize(_log)
+
+    n, m = len(s1), len(s2)
+    d = s1.shape[1]
+    assert d == 2 == s2.shape[1]
+
+    k = 1
+    while True:
+        knn1 = NearestNeighbors(n_neighbors = k+1).fit(s1)
+        nnDist1 = knn1.kneighbors(s1)[0][:, k]
+        if not nnDist1.all():
+            k += 1
+        else:
+            break
+    knn2 = NearestNeighbors(n_neighbors = k).fit(s2)
+    nnDist2 = knn2.kneighbors(s1)[0][:, k-1]
+    kl = (d/n) * sum(_log(nnDist2/nnDist1)) + _log((m/(n-1)))
+    return kl
+
+

 def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):

@@ -161,6 +189,7 @@ def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):

         return vdurs, cdurs, rto, cc

+    plt.close()

     vd1,cd1,ra1,cl1 = _gprep(dat1,l1,src1)
     lab1 += f'\n Ratio: {ra1:.3f}'
@@ -171,12 +200,16 @@ def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):


     fig, ax = plt.subplots(figsize=(9,7))
-    ax.set_xlim(0.0,350)
-    ax.set_ylim(0.0,350)
+    #ax.set_xlim(0.0, 350)
+    #ax.set_ylim(0.0, 350)

     ax.scatter(vd1,cd1,marker = mk1, label = lab1,
         c = [cl1 + (.7,)], edgecolors = [cl1] )

+    marginals = [(vd1, 'x', l1, cl1),
+                 (cd1, 'y', l1, cl1)]
+
+    #kld = None
     if lab2:
         vd2,cd2,ra2,cl2 = _gprep(dat2,l2,src2)
         lab2 += f'\n Ratio: {ra2:.3f}'
@@ -186,22 +219,58 @@ def vgraph(dat1,l1,src1,lab1,dat2,l2,src2,lab2):
         mk2 = '>'
         ax.scatter(vd2,cd2, marker = mk2, label = lab2,
             c = [cl2 + (.05,)], edgecolors = [cl2] )
+        #s1 = np.transpose(np.array([vd1,cd1]))
+        #s2 = np.transpose(np.array([vd2,cd2]))
+        #klda = kldiv(s1,s2)
+        #if klda:
+        #    kldb = kldiv(s2,s1)
+        #    kldsym = np.mean([klda,kldb])
+        #    if not np.isnan(kldsym):
+        #        ax.scatter([-300],[-300],c = 'white',label = f'\nKLDiv: {kldsym:.2f}')
+
+        marginals += [(vd2, 'x', l2, cl2),
+                      (cd2, 'y', l2, cl2)]
+
+    #fig.legend(loc=8,ncols=2)
+    leg = fig.legend(loc=7,frameon=False)
+    for t in leg.get_texts():
+        t.set_verticalalignment("center_baseline")

+    ax.axline((0,0),slope=1,color="darkgray")

-
+    marginals = [m for m in marginals if len(m[0])>9]
+    lsts = {'L1': 'solid' , 'L2': 'dashed' , 'All': 'dashdot'}
+    for values, axt, lng, lcl in marginals:
+        kde = gaussian_kde(values, bw_method='scott')
+        pts = np.linspace(np.min(values), np.max(values))
+        dens = kde.pdf(pts)
+        scf=2500
+        lst = lsts[lng]
+        #l2dat = ax.plot(pts, [350-(scf*i) for i in dens], linestyle=lst, color = lcl)
+        l2dat = ax.plot(pts, [350+(scf*i) for i in dens], linestyle=lst, color = lcl, clip_on=False)
+        if axt == 'y':
+            for l2d in l2dat:
+                xln = l2d.get_xdata()
+                yln = l2d.get_ydata()
+                l2d.set_xdata(yln)
+                l2d.set_ydata(xln)
+                fig.canvas.draw()
+                #ax.draw_artist(l2d)
+
+
+    ax.set_xlim(0.0, 350)
+    ax.set_ylim(0.0, 350)
+
+    ax.set_title("Stressed vowel & following consonant(s) duration" , fontsize=16, y=-.155)
     ax.set_xlabel("Vowel duration (ms)")
     ax.set_ylabel("Consonant duration (ms)")
-
-    fig.legend(loc=7)
-
-    ax.axline((0,0),slope=1,color="darkgray")
-
+
     fig.tight_layout()
-
-    fig.subplots_adjust(right=0.
+    fig.subplots_adjust(bottom=0.13)
+    fig.subplots_adjust(right=0.72)

     #plt.xticks(ticks=[50,100,150,200,250,300],labels=[])
     #plt.yticks(ticks=[100,200,300],labels=[])
-
+
     return fig

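The commit also adds, but does not yet enable, a k-nearest-neighbour KL-divergence estimator: `kldiv` relies on `NearestNeighbors`, whose import stays commented out, and its call sites in `vgraph` are commented out as well. A rough usage sketch, assuming scikit-learn is installed and that import has been uncommented in vowel_length.py; the synthetic duration samples below are illustrative only.

```python
import numpy as np
from vowel_length import kldiv  # works only after uncommenting the sklearn import in vowel_length.py

# Two synthetic 2-D samples standing in for (vowel, consonant) duration pairs in ms.
rng = np.random.default_rng(0)
s1 = rng.normal(loc=[150.0, 80.0], scale=25.0, size=(200, 2))
s2 = rng.normal(loc=[90.0, 120.0], scale=25.0, size=(200, 2))

# kldiv(s1, s2) estimates KL(P1 || P2) from nearest-neighbour distances;
# the commented-out code in vgraph averages kldiv(s1, s2) and kldiv(s2, s1)
# to get a symmetric value for the plot legend.
print(kldiv(s1, s2))
```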
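Since the commit message is "densities", here is a condensed, self-contained sketch of the marginal-density technique `vgraph` now uses: estimate a 1-D kernel density for each axis with `scipy.stats.gaussian_kde`, scale it, and draw it just outside the 0–350 ms plotting area with `clip_on=False`, swapping the x and y data for the consonant-duration (y-axis) marginal. The synthetic durations and the output file name are assumptions for illustration, not taken from the repo.

```python
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Synthetic stand-ins for vowel and consonant durations in ms.
rng = np.random.default_rng(1)
vd = rng.normal(160, 40, 150)
cd = rng.normal(100, 30, 150)

fig, ax = plt.subplots(figsize=(9, 7))
ax.scatter(vd, cd, alpha=0.6)

scf = 2500  # same scale factor the commit uses to make the density curves visible
for values, along_y in [(vd, False), (cd, True)]:
    kde = gaussian_kde(values, bw_method='scott')
    pts = np.linspace(values.min(), values.max())
    curve = 350 + scf * kde.pdf(pts)                  # sits just outside the 0-350 ms axes
    x, y = (curve, pts) if along_y else (pts, curve)  # swap so the y-marginal runs up the right edge
    ax.plot(x, y, clip_on=False)                      # clip_on=False lets it draw beyond the axes

ax.set_xlim(0, 350)
ax.set_ylim(0, 350)
ax.set_xlabel("Vowel duration (ms)")
ax.set_ylabel("Consonant duration (ms)")
fig.savefig('density_margins.png')
```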