Update graph.py
graph.py CHANGED
@@ -4,17 +4,17 @@ from scipy import signal
 import librosa
 import subprocess
 import matplotlib.pyplot as plt
+from pydub import AudioSegment


-…
-…
-…
-…
-        wav = wav.mean(1)
+def readaud(sound_path):
+    aud, sr = sf.read(sound_path, dtype=np.float32)
+    if len(aud.shape) == 2:
+        aud = aud.mean(1)
     if sr != 16000:
-…
-…
-    return …
+        alen = int(aud.shape[0] / sr * 16000)
+        aud = signal.resample(aud, alen)
+    return aud


 def normalise_transcript(xcp):
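
Note: scipy.signal.resample takes a target sample count, not a target rate,
which is why readaud computes alen first. A self-contained sketch with a
hypothetical 44.1 kHz source:

    import numpy as np
    from scipy import signal

    sr_in, sr_target = 44100, 16000              # hypothetical source rate
    x = np.zeros(sr_in * 2, dtype=np.float32)    # 2 s of silence at 44.1 kHz
    alen = int(x.shape[0] / sr_in * sr_target)   # sample count at 16 kHz
    y = signal.resample(x, alen)                 # FFT-based resampling
    print(y.shape[0] / sr_target)                # -> 2.0, duration preserved
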
@@ -25,7 +25,20 @@ def normalise_transcript(xcp):



-def get_pitch_tracks(wav_path):
+def get_pitch_tracks(sound_path):
+
+    orig_ftype = sound_path.split('.')[-1]   # extension without the dot
+
+    if orig_ftype == 'wav':
+        wav_path = sound_path
+
+    else:
+        aud_data = AudioSegment.from_file(sound_path, orig_ftype)
+        curdir = subprocess.run(["pwd"], capture_output=True, text=True).stdout.strip()
+        tmp_path = f'{curdir}/tmp.wav'
+        aud_data.export(tmp_path, format="wav")
+        wav_path = tmp_path
+
     print('FILE PATH:', wav_path)
     f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
     print('PLAIN:',f0_data)
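
The pwd subprocess above is one way to pick a writable location for the
temporary wav; the tempfile module does the same without shelling out. A
sketch of the conversion step under that alternative (hypothetical helper,
not the committed code):

    import tempfile
    from pydub import AudioSegment

    def to_tmp_wav(sound_path, orig_ftype):
        # decode any pydub-readable format, re-encode as wav for REAPER
        aud_data = AudioSegment.from_file(sound_path, format=orig_ftype)
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        aud_data.export(tmp.name, format="wav")
        return tmp.name                          # caller removes it afterwards
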
@@ -35,19 +48,25 @@ def get_pitch_tracks(wav_path):
     #print(f0_data)
     f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
     f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
+
+    if orig_ftype != 'wav':
+        subprocess.run(["rm", tmp_path])   # remove the temporary wav
+
     return f0_data


+
+
 # transcript could be from a corpus with the wav file,
 # input by the user,
 # or from a previous speech recognition process
-def align_and_graph(wav_path, transcript, aligner_function):
+def align_and_graph(sound_path, transcript, aligner_function):

     plt.close('all')
-
+

     # fetch data
-    speech = …
+    speech = readaud(sound_path)
     w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))


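
For reference, the list comprehensions above assume one "time voiced_flag f0"
triple per line plus a trailing info line, which is the shape of the ASCII
(-a) output this code reads from REAPER. A sketch on a made-up payload:

    raw = "0.010 1 118.2\n0.020 0 -1.0\n0.030 1 121.7\nEOF\n"  # made-up payload
    lines = raw.splitlines()
    rows = [l.split(' ') for l in lines[:-1]]    # drop the trailing info line
    f0_data = [[float(t), float(f)] for t, v, f in rows if v == '1']
    print(f0_data)                               # [[0.01, 118.2], [0.03, 121.7]]
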
@@ -55,7 +74,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
     rec_start = w_align[0][1]
     rec_end = w_align[-1][2]

-    f0_data = get_pitch_tracks(wav_path)
+    f0_data = get_pitch_tracks(sound_path)
     if f0_data:
         f_max = max([f0 for t,f0 in f0_data]) + 50
     else:
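
The +50 above is plot headroom so the highest pitch point clears the axis;
the else branch, not visible in this hunk, presumably supplies a fixed
ceiling. With hypothetical values:

    f0_data = [[0.01, 118.2], [0.03, 121.7]]     # hypothetical voiced frames
    f_max = max(f0 for t, f0 in f0_data) + 50 if f0_data else 400  # 400 assumed
    print(f_max)                                 # -> 171.7
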
@@ -85,7 +104,7 @@ def align_and_graph(wav_path, transcript, aligner_function):



-    w, sr = librosa.load(wav_path)
+    w, sr = librosa.load(sound_path)
     fr_l = 2048 # librosa default
     h_l = 512 # default
     rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)
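
librosa.feature.rms returns one energy value per hop, shape (1, n_frames), so
plotting it against time needs the matching hop length. A synthetic-signal
sketch with the defaults noted above:

    import numpy as np
    import librosa

    sr = 22050                                   # librosa.load's default rate
    w = np.sin(2 * np.pi * 220 * np.arange(sr) / sr).astype(np.float32)  # 1 s tone
    rmse = librosa.feature.rms(y=w, frame_length=2048, hop_length=512)
    times = librosa.frames_to_time(np.arange(rmse.shape[1]), sr=sr, hop_length=512)
    print(rmse.shape, times[-1])                 # (1, 44), ~1.0 s
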