Upload 2 files
Browse files- ctcalign.py +166 -169
- graph.py +3 -5
ctcalign.py
CHANGED
|
@@ -4,108 +4,98 @@ import numpy as np
|
|
| 4 |
from dataclasses import dataclass
|
| 5 |
|
| 6 |
|
|
|
|
|
|
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
#
|
| 11 |
-
|
| 12 |
-
return fr/50
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
#
|
| 17 |
-
#------------------------------------------
|
| 18 |
|
| 19 |
-
# important to know for CTC decoding - potentially language/model dependent
|
| 20 |
-
#model_word_separator = '|'
|
| 21 |
-
#model_blank_token = '[PAD]'
|
| 22 |
-
#is_MODEL_PATH="../models/LVL/wav2vec2-large-xlsr-53-icelandic-ep10-1000h"
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
self.model = Wav2Vec2ForCTC.from_pretrained(model_path)#.to(self.device)
|
| 33 |
-
self.processor = Wav2Vec2Processor.from_pretrained(model_path)
|
| 34 |
-
|
| 35 |
-
# build labels dict from a processor where it is not directly accessible
|
| 36 |
-
max_labels = 100 # any reasonable number higher than vocab + extra + special tokens in any language used
|
| 37 |
-
ixs = sorted(list(range(max_labels)),reverse=True)
|
| 38 |
-
self.labels_dict = {self.processor.tokenizer.decode(n) or model_word_separator:n for n in ixs}
|
| 39 |
-
|
| 40 |
-
self.blank_id = self.labels_dict[model_blank_token]
|
| 41 |
-
self.model_word_separator = model_word_separator
|
| 42 |
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
-
#------------------------------------------
|
| 48 |
-
# forced alignment with ctc decoder
|
| 49 |
-
# based on implementation of
|
| 50 |
-
# https://pytorch.org/audio/main/tutorials/forced_alignment_tutorial.html
|
| 51 |
-
#------------------------------------------
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
# return the label class probability of each audio frame
|
| 55 |
-
# wav is the wav data already read in, NOT the file path.
|
| 56 |
-
def get_frame_probs(wav,aligner):
|
| 57 |
-
with torch.inference_mode(): # similar to with torch.no_grad():
|
| 58 |
-
input_values = aligner.processor(wav,sampling_rate=16000).input_values[0]
|
| 59 |
-
input_values = torch.tensor(input_values).unsqueeze(0)#, device=aligner.device).unsqueeze(0)
|
| 60 |
-
emits = aligner.model(input_values).logits
|
| 61 |
-
emits = torch.log_softmax(emits, dim=-1)
|
| 62 |
-
return emits[0].cpu().detach()
|
| 63 |
|
| 64 |
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
return trellis
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
@dataclass
|
| 86 |
-
class Point:
|
| 87 |
-
token_index: int
|
| 88 |
-
time_index: int
|
| 89 |
-
score: float
|
| 90 |
|
| 91 |
-
@dataclass
|
| 92 |
-
class Segment:
|
| 93 |
-
label: str
|
| 94 |
-
start: int
|
| 95 |
-
end: int
|
| 96 |
-
score: float
|
| 97 |
-
|
| 98 |
-
@property
|
| 99 |
-
def mfaform(self):
|
| 100 |
-
return f"{f2s(self.start)},{f2s(self.end)},{self.label}"
|
| 101 |
-
|
| 102 |
-
@property
|
| 103 |
-
def length(self):
|
| 104 |
-
return self.end - self.start
|
| 105 |
|
| 106 |
|
| 107 |
-
|
| 108 |
-
def backtrack(trellis, emission, tokens, blank_id):
|
| 109 |
# Note:
|
| 110 |
# j and t are indices for trellis, which has extra dimensions
|
| 111 |
# for time and tokens at the beginning.
|
|
@@ -113,112 +103,119 @@ def backtrack(trellis, emission, tokens, blank_id):
|
|
| 113 |
# the corresponding index in emission is `T-1`.
|
| 114 |
# Similarly, when referring to token index `J` in trellis,
|
| 115 |
# the corresponding index in transcript is `J-1`.
|
| 116 |
-
|
| 117 |
-
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
# 1. Figure out if the current position was stay or change
|
| 122 |
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
|
| 123 |
# Score for token staying the same from time frame J-1 to T.
|
| 124 |
-
|
| 125 |
# Score for token changing from C-1 at T-1 to J at T.
|
| 126 |
-
|
| 127 |
|
| 128 |
# 2. Store the path with frame-wise probability.
|
| 129 |
-
|
| 130 |
# Return token index and time index in non-trellis coordinate.
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
else:
|
| 139 |
-
raise ValueError("Failed to align")
|
| 140 |
-
return path[::-1]
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
def merge_repeats(path,transcript):
|
| 144 |
-
i1, i2 = 0, 0
|
| 145 |
-
segments = []
|
| 146 |
-
while i1 < len(path):
|
| 147 |
-
while i2 < len(path) and path[i1].token_index == path[i2].token_index: # while both path steps point to the same token index
|
| 148 |
-
i2 += 1
|
| 149 |
-
score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
|
| 150 |
-
segments.append( # when i2 finally switches to a different token,
|
| 151 |
-
Segment(
|
| 152 |
-
transcript[path[i1].token_index],# to the list of segments, append the token from i1
|
| 153 |
-
path[i1].time_index, # time of the first path-point of that token
|
| 154 |
-
path[i2 - 1].time_index + 1, # time of the final path-point for that token.
|
| 155 |
-
score,
|
| 156 |
-
)
|
| 157 |
-
)
|
| 158 |
-
i1 = i2
|
| 159 |
-
return segments
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
def merge_words(segments, separator):
|
| 164 |
-
words = []
|
| 165 |
-
i1, i2 = 0, 0
|
| 166 |
-
while i1 < len(segments):
|
| 167 |
-
if i2 >= len(segments) or segments[i2].label == separator:
|
| 168 |
-
if i1 != i2:
|
| 169 |
-
segs = segments[i1:i2]
|
| 170 |
-
word = "".join([seg.label for seg in segs])
|
| 171 |
-
score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs)
|
| 172 |
-
words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score))
|
| 173 |
-
i1 = i2 + 1
|
| 174 |
-
i2 = i1
|
| 175 |
else:
|
| 176 |
-
|
| 177 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
|
| 179 |
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
|
| 186 |
-
# generate mfa format for character (phone) and word alignments
|
| 187 |
-
# skip the word separator as it is not a phone
|
| 188 |
-
def mfalike(chars,wds,wsep):
|
| 189 |
-
hed = ['Begin,End,Label,Type,Speaker\n']
|
| 190 |
-
wlines = [f'{w.mfaform},words,000\n' for w in wds]
|
| 191 |
-
slines = [f'{ch.mfaform},phones,000\n' for ch in chars if ch.label != wsep]
|
| 192 |
-
return (''.join(hed+wlines+slines))
|
| 193 |
-
|
| 194 |
-
# generate basic exportable list format for character OR word alignments
|
| 195 |
-
# skip the word separator as it is not a phone
|
| 196 |
-
def basic(segs,wsep="|"):
|
| 197 |
-
return [[s.label,f2s(s.start),f2s(s.end)] for s in segs if s.label != wsep]
|
| 198 |
-
|
| 199 |
|
| 200 |
-
# needs pad labels added to correctly time first segment
|
| 201 |
-
# and therefore add word sep character as placeholder in transcript
|
| 202 |
-
def prep_transcript(xcp, aligner):
|
| 203 |
-
xcp = xcp.replace(' ', aligner.model_word_separator)
|
| 204 |
-
label_ids = [aligner.labels_dict[c] for c in xcp]
|
| 205 |
-
label_ids = [aligner.blank_id] + label_ids + [aligner.blank_id]
|
| 206 |
-
xcp = f'{ aligner.model_word_separator}{xcp}{aligner.model_word_separator}'
|
| 207 |
-
return xcp,label_ids
|
| 208 |
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
|
|
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
|
|
|
|
| 4 |
from dataclasses import dataclass
|
| 5 |
|
| 6 |
|
| 7 |
+
|
| 8 |
+
def aligner(model_path,model_word_separator = '|', model_blank_token = '[PAD]'):
|
| 9 |
|
| 10 |
+
# build labels dict from a processor where it is not directly accessible
|
| 11 |
+
def get_processor_labels(processor,word_sep,max_labels=100):
|
| 12 |
+
ixs = sorted(list(range(max_labels)),reverse=True)
|
| 13 |
+
return {processor.tokenizer.decode(n) or word_sep:n for n in ixs}
|
| 14 |
|
| 15 |
+
#------------------------------------------
|
| 16 |
+
# setup wav2vec2
|
| 17 |
+
#------------------------------------------
|
|
|
|
| 18 |
|
| 19 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 20 |
+
torch.random.manual_seed(0)
|
| 21 |
+
max_labels = 100 # any reasonable number higher than vocab + extra + special tokens in any language used
|
|
|
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
model = Wav2Vec2ForCTC.from_pretrained(model_path).to(device)
|
| 25 |
+
processor = Wav2Vec2Processor.from_pretrained(model_path)
|
| 26 |
+
labels_dict = get_processor_labels(processor,model_word_separator)
|
| 27 |
+
blank_id = labels_dict[model_blank_token]
|
| 28 |
|
| 29 |
|
| 30 |
+
#convert frame-numbers to timestamps in seconds
|
| 31 |
+
# w2v2 step size is about 20ms, or 50 frames per second
|
| 32 |
+
def f2s(fr):
|
| 33 |
+
return fr/50
|
| 34 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
+
#------------------------------------------
|
| 38 |
+
# forced alignment with ctc decoder
|
| 39 |
+
# based on implementation of
|
| 40 |
+
# https://pytorch.org/audio/main/tutorials/forced_alignment_tutorial.html
|
| 41 |
+
#------------------------------------------
|
| 42 |
|
| 43 |
|
| 44 |
+
# return the label class probability of each audio frame
|
| 45 |
+
# wav is the wav data already read in, NOT the file path.
|
| 46 |
+
def get_frame_probs(wav):
|
| 47 |
+
with torch.inference_mode(): # similar to with torch.no_grad():
|
| 48 |
+
input_values = processor(wav,sampling_rate=16000).input_values[0]
|
| 49 |
+
input_values = torch.tensor(input_values, device=device).unsqueeze(0)
|
| 50 |
+
emits = model(input_values).logits
|
| 51 |
+
emits = torch.log_softmax(emits, dim=-1)
|
| 52 |
+
return emits[0].cpu().detach()
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
+
def get_trellis(emission, tokens, blank_id):
|
| 56 |
+
|
| 57 |
+
num_frame = emission.size(0)
|
| 58 |
+
num_tokens = len(tokens)
|
| 59 |
+
trellis = torch.empty((num_frame + 1, num_tokens + 1))
|
| 60 |
+
trellis[0, 0] = 0
|
| 61 |
+
trellis[1:, 0] = torch.cumsum(emission[:, 0], 0) # len of this slice of trellis is len of audio frames)
|
| 62 |
+
trellis[0, -num_tokens:] = -float("inf") # len of this slice of trellis is len of transcript tokens
|
| 63 |
+
trellis[-num_tokens:, 0] = float("inf")
|
| 64 |
+
for t in range(num_frame):
|
| 65 |
+
trellis[t + 1, 1:] = torch.maximum(
|
| 66 |
+
# Score for staying at the same token
|
| 67 |
+
trellis[t, 1:] + emission[t, blank_id],
|
| 68 |
+
# Score for changing to the next token
|
| 69 |
+
trellis[t, :-1] + emission[t, tokens],
|
| 70 |
+
)
|
| 71 |
+
return trellis
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
+
@dataclass
|
| 76 |
+
class Point:
|
| 77 |
+
token_index: int
|
| 78 |
+
time_index: int
|
| 79 |
+
score: float
|
| 80 |
|
| 81 |
+
@dataclass
|
| 82 |
+
class Segment:
|
| 83 |
+
label: str
|
| 84 |
+
start: int
|
| 85 |
+
end: int
|
| 86 |
+
score: float
|
| 87 |
+
|
| 88 |
+
@property
|
| 89 |
+
def mfaform(self):
|
| 90 |
+
return f"{f2s(self.start)},{f2s(self.end)},{self.label}"
|
| 91 |
+
|
| 92 |
+
@property
|
| 93 |
+
def length(self):
|
| 94 |
+
return self.end - self.start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
|
| 98 |
+
def backtrack(trellis, emission, tokens, blank_id):
|
|
|
|
| 99 |
# Note:
|
| 100 |
# j and t are indices for trellis, which has extra dimensions
|
| 101 |
# for time and tokens at the beginning.
|
|
|
|
| 103 |
# the corresponding index in emission is `T-1`.
|
| 104 |
# Similarly, when referring to token index `J` in trellis,
|
| 105 |
# the corresponding index in transcript is `J-1`.
|
| 106 |
+
j = trellis.size(1) - 1
|
| 107 |
+
t_start = torch.argmax(trellis[:, j]).item()
|
| 108 |
|
| 109 |
+
path = []
|
| 110 |
+
for t in range(t_start, 0, -1):
|
| 111 |
# 1. Figure out if the current position was stay or change
|
| 112 |
# `emission[J-1]` is the emission at time frame `J` of trellis dimension.
|
| 113 |
# Score for token staying the same from time frame J-1 to T.
|
| 114 |
+
stayed = trellis[t - 1, j] + emission[t - 1, blank_id]
|
| 115 |
# Score for token changing from C-1 at T-1 to J at T.
|
| 116 |
+
changed = trellis[t - 1, j - 1] + emission[t - 1, tokens[j - 1]]
|
| 117 |
|
| 118 |
# 2. Store the path with frame-wise probability.
|
| 119 |
+
prob = emission[t - 1, tokens[j - 1] if changed > stayed else 0].exp().item()
|
| 120 |
# Return token index and time index in non-trellis coordinate.
|
| 121 |
+
path.append(Point(j - 1, t - 1, prob))
|
| 122 |
+
|
| 123 |
+
# 3. Update the token
|
| 124 |
+
if changed > stayed:
|
| 125 |
+
j -= 1
|
| 126 |
+
if j == 0:
|
| 127 |
+
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
else:
|
| 129 |
+
raise ValueError("Failed to align")
|
| 130 |
+
return path[::-1]
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def merge_repeats(path,transcript):
|
| 134 |
+
i1, i2 = 0, 0
|
| 135 |
+
segments = []
|
| 136 |
+
while i1 < len(path):
|
| 137 |
+
while i2 < len(path) and path[i1].token_index == path[i2].token_index: # while both path steps point to the same token index
|
| 138 |
+
i2 += 1
|
| 139 |
+
score = sum(path[k].score for k in range(i1, i2)) / (i2 - i1)
|
| 140 |
+
segments.append( # when i2 finally switches to a different token,
|
| 141 |
+
Segment(
|
| 142 |
+
transcript[path[i1].token_index],# to the list of segments, append the token from i1
|
| 143 |
+
path[i1].time_index, # time of the first path-point of that token
|
| 144 |
+
path[i2 - 1].time_index + 1, # time of the final path-point for that token.
|
| 145 |
+
score,
|
| 146 |
+
)
|
| 147 |
+
)
|
| 148 |
+
i1 = i2
|
| 149 |
+
return segments
|
| 150 |
|
| 151 |
|
| 152 |
|
| 153 |
+
def merge_words(segments, separator):
|
| 154 |
+
words = []
|
| 155 |
+
i1, i2 = 0, 0
|
| 156 |
+
while i1 < len(segments):
|
| 157 |
+
if i2 >= len(segments) or segments[i2].label == separator:
|
| 158 |
+
if i1 != i2:
|
| 159 |
+
segs = segments[i1:i2]
|
| 160 |
+
word = "".join([seg.label for seg in segs])
|
| 161 |
+
score = sum(seg.score * seg.length for seg in segs) / sum(seg.length for seg in segs)
|
| 162 |
+
words.append(Segment(word, segments[i1].start, segments[i2 - 1].end, score))
|
| 163 |
+
i1 = i2 + 1
|
| 164 |
+
i2 = i1
|
| 165 |
+
else:
|
| 166 |
+
i2 += 1
|
| 167 |
+
return words
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
+
#------------------------------------------
|
| 173 |
+
# handle etc.
|
| 174 |
+
#------------------------------------------
|
| 175 |
|
| 176 |
|
| 177 |
+
# generate mfa format for character (phone) and word alignments
|
| 178 |
+
# skip the word separator as it is not a phone
|
| 179 |
+
def mfalike(chars,wds,wsep):
|
| 180 |
+
hed = ['Begin,End,Label,Type,Speaker\n']
|
| 181 |
+
wlines = [f'{w.mfaform},words,000\n' for w in wds]
|
| 182 |
+
slines = [f'{ch.mfaform},phones,000\n' for ch in chars if ch.label != wsep]
|
| 183 |
+
return (''.join(hed+wlines+slines))
|
| 184 |
|
| 185 |
+
# generate basic exportable list format for character OR word alignments
|
| 186 |
+
# skip the word separator as it is not a phone
|
| 187 |
+
def basic(segs,wsep="|"):
|
| 188 |
+
return [[s.label,f2s(s.start),f2s(s.end)] for s in segs if s.label != wsep]
|
| 189 |
|
| 190 |
+
|
| 191 |
+
# needs pad labels added to correctly time first segment
|
| 192 |
+
# and therefore add word sep character as placeholder in transcript
|
| 193 |
+
def prep_transcript(xcp):
|
| 194 |
+
xcp = xcp.replace(' ',model_word_separator)
|
| 195 |
+
label_ids = [labels_dict[c] for c in xcp]
|
| 196 |
+
label_ids = [blank_id] + label_ids + [blank_id]
|
| 197 |
+
xcp = f'{model_word_separator}{xcp}{model_word_separator}'
|
| 198 |
+
return xcp,label_ids
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _align(wav_data,transcript):
|
| 203 |
+
|
| 204 |
+
norm_transcript,rec_label_ids = prep_transcript(transcript)
|
| 205 |
+
emit = get_frame_probs(wav_data)
|
| 206 |
+
trellis = get_trellis(emit, rec_label_ids, blank_id)
|
| 207 |
+
path = backtrack(trellis, emit, rec_label_ids, blank_id)
|
| 208 |
+
|
| 209 |
+
segments = merge_repeats(path,norm_transcript)
|
| 210 |
+
words = merge_words(segments, model_word_separator)
|
| 211 |
+
|
| 212 |
+
#segments = [s for s in segments if s[0] != model_word_separator]
|
| 213 |
+
#return mfalike(segments,words,model_word_separator)
|
| 214 |
+
return basic(words,model_word_separator), basic(segments,model_word_separator)
|
| 215 |
+
|
| 216 |
+
return _align
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
|
| 220 |
|
| 221 |
|
graph.py
CHANGED
|
@@ -4,7 +4,6 @@ from scipy import signal
|
|
| 4 |
import librosa
|
| 5 |
import subprocess
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
-
import ctcalign
|
| 8 |
|
| 9 |
|
| 10 |
|
|
@@ -44,13 +43,12 @@ def get_pitch_tracks(wav_path):
|
|
| 44 |
# transcript could be from a corpus with the wav file,
|
| 45 |
# input by the user,
|
| 46 |
# or from a previous speech recognition process
|
| 47 |
-
def align_and_graph(wav_path, transcript,
|
| 48 |
|
| 49 |
# fetch data
|
| 50 |
-
#f0_data = get_pitch_tracks(wav_path)
|
| 51 |
speech = readwav(wav_path)
|
| 52 |
-
w_align, seg_align =
|
| 53 |
-
|
| 54 |
|
| 55 |
# set up the graph shape
|
| 56 |
rec_start = w_align[0][1]
|
|
|
|
| 4 |
import librosa
|
| 5 |
import subprocess
|
| 6 |
import matplotlib.pyplot as plt
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
|
|
|
|
| 43 |
# transcript could be from a corpus with the wav file,
|
| 44 |
# input by the user,
|
| 45 |
# or from a previous speech recognition process
|
| 46 |
+
def align_and_graph(wav_path, transcript, aligner_function):
|
| 47 |
|
| 48 |
# fetch data
|
|
|
|
| 49 |
speech = readwav(wav_path)
|
| 50 |
+
w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
|
| 51 |
+
|
| 52 |
|
| 53 |
# set up the graph shape
|
| 54 |
rec_start = w_align[0][1]
|