Miguel Jaramillo committed
Commit 3bb297d · unverified · 1 Parent(s): 09d2732

Add files via upload

Files changed (1)
  1. tp3__1__1.py +501 -0
tp3__1__1.py ADDED
@@ -0,0 +1,501 @@
# -*- coding: utf-8 -*-
"""tp3__1_-1.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1_Sjx5G1BW689ggZJAJ4P7kCZndOobNCp
"""

# Install Gradio
!pip install gradio -q

# Install Timidity (MIDI-to-wav synthesis) and libsndfile
!sudo apt-get install -q -y timidity libsndfile1

# All the packages needed to deal with sound data
!pip install pydub numba==0.48 librosa music21

# Import libraries

import gradio as gr
import time

import tensorflow as tf
import tensorflow_hub as hub

import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa import display as librosadisplay

import logging
import math
import statistics
import sys

from IPython.display import Audio, Javascript
from scipy.io import wavfile

from base64 import b64decode

import music21
from pydub import AudioSegment

logger = logging.getLogger()
logger.setLevel(logging.ERROR)

#print("tensorflow: %s" % tf.__version__)
#print("librosa: %s" % librosa.__version__)

# The audio input file
# Now the hardest part: record your singing! :)

# There are two ways to provide an audio file:

# 1. Record audio directly in Gradio
# 2. Use the example file downloaded below (a C scale recorded with a metronome)

# Download the example file
INPUT_SOURCE = 'https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav'

!wget --no-check-certificate 'https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav' -O c-scale.wav

uploaded_file_name = 'c-scale.wav'

# Function that converts the user-provided audio to the format that the model
# expects: a 16 kHz sample rate and a single channel (mono).

EXPECTED_SAMPLE_RATE = 16000

def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
  audio = AudioSegment.from_file(user_file)
  audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
  audio.export(output_file, format="wav")
  return output_file

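# Optional sanity check (a minimal sketch, left commented out): the converted
# file should come back as 16 kHz mono, i.e. a one-dimensional sample array
# when read with scipy's wavfile.
#_rate, _data = wavfile.read(convert_audio_for_model(uploaded_file_name))
#assert _rate == EXPECTED_SAMPLE_RATE and _data.ndim == 1
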
MAX_ABS_INT16 = 32768.0

def plot_stft(x, sample_rate, show_black_and_white=False):
  x_stft = np.abs(librosa.stft(x, n_fft=2048))
  fig, ax = plt.subplots()
  fig.set_size_inches(20, 10)
  x_stft_db = librosa.amplitude_to_db(x_stft, ref=np.max)

  if show_black_and_white:
    librosadisplay.specshow(data=x_stft_db,
                            y_axis='log',
                            sr=sample_rate,
                            cmap='gray_r')
  else:
    librosadisplay.specshow(data=x_stft_db,
                            y_axis='log',
                            sr=sample_rate)

  plt.colorbar(format='%+2.0f dB')

  return fig

# Loading audio samples from the example wav file. The file is converted first
# so that its sample rate matches what the model expects.
converted_audio_file = convert_audio_for_model(uploaded_file_name)
sample_rate, audio_samples = wavfile.read(converted_audio_file)

fig = plot_stft(audio_samples / MAX_ABS_INT16, sample_rate=EXPECTED_SAMPLE_RATE)

# Executing the model
# Loading the SPICE model is easy:
model = hub.load("https://tfhub.dev/google/spice/2")

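# A minimal sketch of how the model is called below: the serving signature
# takes a 1-D float32 waveform and returns 'pitch' and 'uncertainty' tensors
# of equal length, one prediction per analysis frame (left commented out).
#_demo = model.signatures["serving_default"](
#    tf.constant(np.zeros(EXPECTED_SAMPLE_RATE, dtype=np.float32)))
#print(_demo["pitch"].shape, _demo["uncertainty"].shape)
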
def plot_pitch_conf(pitch_outputs, confidence_outputs):
  fig, ax = plt.subplots()
  fig.set_size_inches(20, 10)
  plt.plot(pitch_outputs, label='pitch')
  plt.plot(confidence_outputs, label='confidence')
  plt.legend(loc="lower right")
  return fig

def plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y):
  fig, ax = plt.subplots()
  fig.set_size_inches(20, 10)
  ax.set_ylim([0, 1])
  plt.scatter(confident_pitch_outputs_x, confident_pitch_outputs_y, c="r")
  return fig

def output2hz(pitch_output):
  # Constants taken from https://tfhub.dev/google/spice/2
  PT_OFFSET = 25.58
  PT_SLOPE = 63.07
  FMIN = 10.0
  BINS_PER_OCTAVE = 12.0
  cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
  return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)

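# A quick feel for the mapping above (approximate values computed from the
# constants): the model's [0, 1] pitch output spans roughly 44 Hz to 1.7 kHz.
#print("SPICE pitch range: %.1f Hz to %.1f Hz" % (output2hz(0.0), output2hz(1.0)))
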
def espectro_notas(audio_samples, sample_rate, confident_pitch_outputs_x, confident_pitch_values_hz):
  # audio_samples is expected to be already normalized to [-1, 1] by the caller.
  fig = plot_stft(audio_samples, sample_rate=sample_rate, show_black_and_white=True)
  # Note: conveniently, since the plot is in log scale, the pitch outputs
  # also get converted to the log scale automatically by matplotlib.
  plt.scatter(confident_pitch_outputs_x, confident_pitch_values_hz, c="r")
  return fig

# Constants for mapping frequencies to note names (A4 = 440 Hz as reference).
A4 = 440
C0 = A4 * pow(2, -4.75)
note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

def hz2offset(freq):
  # This measures the quantization error for a single note.
  if freq == 0:  # Rests always have zero error.
    return None
  # Quantized note.
  h = round(12 * math.log2(freq / C0))
  return 12 * math.log2(freq / C0) - h

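# Worked example (illustration only): 440 Hz lands exactly on A4, so its
# quantization offset is essentially zero.
#_h = round(12 * math.log2(440.0 / C0))                        # 57
#print(note_names[_h % 12] + str(_h // 12), hz2offset(440.0))  # A4, ~0.0
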
def quantize_predictions(group, ideal_offset):
  # Group values are either 0, or a pitch in Hz.
  non_zero_values = [v for v in group if v != 0]
  zero_values_count = len(group) - len(non_zero_values)

  # Create a rest if 80% is silent, otherwise create a note.
  if zero_values_count > 0.8 * len(group):
    # Interpret as a rest. Count each dropped note as an error, weighted a bit
    # worse than a badly sung note (which would 'cost' 0.5).
    return 0.51 * len(non_zero_values), "Rest"
  else:
    # Interpret as note, estimating as mean of non-rest predictions.
    h = round(
        statistics.mean([
            12 * math.log2(freq / C0) - ideal_offset for freq in non_zero_values
        ]))
    octave = h // 12
    n = h % 12
    note = note_names[n] + str(octave)
    # Quantization error is the total difference from the quantized note.
    error = sum([
        abs(12 * math.log2(freq / C0) - ideal_offset - h)
        for freq in non_zero_values
    ])
    return error, note

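# Tiny illustration of the two branches above (made-up inputs): an all-silent
# group becomes a Rest, while a group of confident 440 Hz frames becomes an A4
# with near-zero error.
#print(quantize_predictions([0, 0, 0, 0, 0], ideal_offset=0.0))  # (0.0, 'Rest')
#print(quantize_predictions([440.0] * 5, ideal_offset=0.0))      # (~0.0, 'A4')
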
def get_quantization_and_error(pitch_outputs_and_rests, predictions_per_eighth,
                               prediction_start_offset, ideal_offset):
  # Apply the start offset - we can just add the offset as rests.
  pitch_outputs_and_rests = [0] * prediction_start_offset + \
                            pitch_outputs_and_rests
  # Collect the predictions for each note (or rest).
  groups = [
      pitch_outputs_and_rests[i:i + predictions_per_eighth]
      for i in range(0, len(pitch_outputs_and_rests), predictions_per_eighth)
  ]

  quantization_error = 0

  notes_and_rests = []
  for group in groups:
    error, note_or_rest = quantize_predictions(group, ideal_offset)
    quantization_error += error
    notes_and_rests.append(note_or_rest)

  return quantization_error, notes_and_rests

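# Toy illustration (made-up values, one entry per model prediction): four
# 440 Hz frames followed by four silent frames quantize to one A4 and one Rest.
#print(get_quantization_and_error([440.0] * 4 + [0] * 4, 4, 0, 0.0))
# -> (~0.0, ['A4', 'Rest'])
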
def main(audio):

  # Preparing the audio data
  # Now that we have the audio, let's convert it to the expected format and
  # then listen to it!
  # The SPICE model needs as input an audio file at a sampling rate of 16 kHz
  # and with only one channel (mono).
  # To help with this part, the function `convert_audio_for_model` converts
  # any wav file to the model's expected format:

  # Converting to the expected format for the model.
  # For both input methods above, the uploaded file path arrives in `audio`.
  converted_audio_file = convert_audio_for_model(audio)

  # Loading audio samples from the wav file:
  sample_rate, audio_samples = wavfile.read(converted_audio_file)

  audio_samples = audio_samples / float(MAX_ABS_INT16)

  # We now feed the audio to the SPICE tf.hub model to obtain pitch and
  # uncertainty outputs as tensors.
  model_output = model.signatures["serving_default"](tf.constant(audio_samples, tf.float32))

  pitch_outputs = model_output["pitch"]
  uncertainty_outputs = model_output["uncertainty"]

  # 'Uncertainty' basically means the inverse of confidence.
  confidence_outputs = 1.0 - uncertainty_outputs

  confidence_outputs = list(confidence_outputs)
  pitch_outputs = [float(x) for x in pitch_outputs]

  indices = range(len(pitch_outputs))
  confident_pitch_outputs = [(i, p)
      for i, p, c in zip(indices, pitch_outputs, confidence_outputs) if c >= 0.9]
  confident_pitch_outputs_x, confident_pitch_outputs_y = zip(*confident_pitch_outputs)

  confident_pitch_values_hz = [output2hz(p) for p in confident_pitch_outputs_y]

  # Plot the waveform.
  fig1 = plt.figure()
  plt.plot(audio_samples)

  # Plot the spectrogram (samples are already normalized to [-1, 1] above).
  fig2 = plot_stft(audio_samples, sample_rate=EXPECTED_SAMPLE_RATE)

  # Plot pitch and confidence.
  fig3 = plot_pitch_conf(pitch_outputs, confidence_outputs)

  # Plot the confident pitch predictions.
  fig4 = plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y)

  # Plot spectrogram + notes.
  fig5 = espectro_notas(audio_samples, EXPECTED_SAMPLE_RATE, confident_pitch_outputs_x, confident_pitch_values_hz)

  # ############################################################################
  # Converting to musical notes ################################################

  # Now that we have the pitch values, let's convert them to notes!
  # This part is challenging by itself. We have to take into account two
  # things:
  # 1. the rests (when there's no singing)
  # 2. the size of each note (offsets)

  # ----------------------------------------------------------------------------
  ### 1: Adding zeros to the output to indicate when there's no singing

  pitch_outputs_and_rests = [
      output2hz(p) if c >= 0.9 else 0
      for i, p, c in zip(indices, pitch_outputs, confidence_outputs)
  ]

  # ----------------------------------------------------------------------------
  ### 2: Adding note offsets
  # When a person sings freely, the melody may have an offset to the absolute
  # pitch values that notes can represent.
  # Hence, to convert predictions to notes, one needs to correct for this
  # possible offset. This is what hz2offset (defined above) measures.

  # The ideal offset is the mean quantization error for all the notes
  # (excluding rests):
  offsets = [hz2offset(p) for p in pitch_outputs_and_rests if p != 0]
  #print("offsets: ", offsets)

  ideal_offset = statistics.mean(offsets)
  #print("ideal offset: ", ideal_offset)

  # We can now use some heuristics to try and estimate the most likely sequence
  # of notes that were sung.
  # The ideal offset computed above is one ingredient - but we also need to know
  # the speed (how many predictions make, say, an eighth?), and the time offset
  # to start quantizing. To keep it simple, we'll just try different speeds and
  # time offsets with get_quantization_and_error (defined above) and keep the
  # values that minimize the quantization error.

  best_error = float("inf")
  best_notes_and_rests = None
  best_predictions_per_note = None

  for predictions_per_note in range(20, 65, 1):
    for prediction_start_offset in range(predictions_per_note):

      error, notes_and_rests = get_quantization_and_error(
          pitch_outputs_and_rests, predictions_per_note,
          prediction_start_offset, ideal_offset)

      if error < best_error:
        best_error = error
        best_notes_and_rests = notes_and_rests
        best_predictions_per_note = predictions_per_note

  # At this point, best_notes_and_rests contains the best quantization.
  # Since we don't need to have rests at the beginning, let's remove these:
  while best_notes_and_rests[0] == 'Rest':
    best_notes_and_rests = best_notes_and_rests[1:]
  # Also remove silence at the end.
  while best_notes_and_rests[-1] == 'Rest':
    best_notes_and_rests = best_notes_and_rests[:-1]

  # ____________________________________________________________________________
  # Now let's write the quantized notes as a sheet music score!
  # To do it we will use two libraries: music21 (http://web.mit.edu/music21/)
  # and Open Sheet Music Display
  # (https://github.com/opensheetmusicdisplay/opensheetmusicdisplay).
  # Note: for simplicity, we assume here that all notes have the same duration
  # (a half note).

  # Creating the sheet music score.
  sc = music21.stream.Score()
  # Adjust the speed to match the actual singing.
  bpm = 60 * 60 / best_predictions_per_note
  #print('bpm: ', bpm)
  a = music21.tempo.MetronomeMark(number=bpm)
  sc.insert(0, a)

  for snote in best_notes_and_rests:
    d = 'half'
    if snote == 'Rest':
      sc.append(music21.note.Rest(type=d))
    else:
      sc.append(music21.note.Note(snote, type=d))

  # Helper functions that use Open Sheet Music Display (JS code) to show a
  # music score.
  from IPython.core.display import display, HTML, Javascript
  import json, random

  def showScore(score):
    xml = open(score.write('musicxml')).read()
    showMusicXML(xml)

  def showMusicXML(xml):
    DIV_ID = "OSMD_div"
    a = display(HTML('<div id="' + DIV_ID + '">loading OpenSheetMusicDisplay</div>'))
    script = """
    var div_id = {{DIV_ID}};
    function loadOSMD() {
      return new Promise(function(resolve, reject){
        if (window.opensheetmusicdisplay) {
          return resolve(window.opensheetmusicdisplay)
        }
        // OSMD script has a 'define' call which conflicts with requirejs
        var _define = window.define // save the define object
        window.define = undefined // now the loaded script will ignore requirejs
        var s = document.createElement( 'script' );
        s.setAttribute( 'src', "https://cdn.jsdelivr.net/npm/opensheetmusicdisplay@0.7.6/build/opensheetmusicdisplay.min.js" );
        //s.setAttribute( 'src', "/custom/opensheetmusicdisplay.js" );
        s.onload=function(){
          window.define = _define
          resolve(opensheetmusicdisplay);
        };
        document.body.appendChild( s ); // browser will try to load the new script tag
      })
    }
    loadOSMD().then((OSMD)=>{
      window.openSheetMusicDisplay = new OSMD.OpenSheetMusicDisplay(div_id, {
        drawingParameters: "compacttight"
      });
      openSheetMusicDisplay
        .load({{data}})
        .then(
          function() {
            openSheetMusicDisplay.render();
          }
        );
    })
    """.replace('{{DIV_ID}}', DIV_ID).replace('{{data}}', json.dumps(xml))
    #display(Javascript(script))
    return a

  # Rendering the music score.
  partitura = showScore(sc)
  #print(best_notes_and_rests)

  # ____________________________________________________________________________
  # Let's convert the music notes to a MIDI file and listen to it.
  # To create this file, we can use the stream we created before.

  # Saving the recognized musical notes as a MIDI file.
  converted_audio_file_as_midi = converted_audio_file[:-4] + '.mid'
  fp = sc.write('midi', fp=converted_audio_file_as_midi)

  wav_from_created_midi = converted_audio_file_as_midi.replace(' ', '_') + "_midioutput.wav"
  #print(wav_from_created_midi)

  # To listen to it on Colab, we need to convert it back to wav. An easy way of
  # doing that is using Timidity.
  !timidity $converted_audio_file_as_midi -Ow -o $wav_from_created_midi

  return converted_audio_file, fig1, fig2, fig3, fig4, fig5, bpm, best_notes_and_rests, partitura, wav_from_created_midi

iface = gr.Interface(
    fn=main,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath", label="Input audio")],
    outputs=[gr.outputs.Audio(label="Original audio"),
             gr.outputs.Plot(type="auto", label="Waveform"),
             gr.outputs.Plot(type="auto", label="Spectrogram"),
             gr.outputs.Plot(type="auto", label="Pitch confidence"),
             gr.outputs.Plot(type="auto", label="Notes"),
             gr.outputs.Plot(type="auto", label="Spectrogram + notes"),
             gr.outputs.Textbox(label="bpm"),
             gr.outputs.Textbox(label="Score (notes)"),
             gr.outputs.Textbox(type="html", label="Score (HTML)"),
             gr.outputs.Audio(label="midi")],
    examples=[[uploaded_file_name]],
    interpretation="default",
)

iface.launch(debug=True)
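
# Optional: a temporary public link can also be requested via Gradio's share
# flag, e.g.:
#iface.launch(debug=True, share=True)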