Miguel Jaramillo committed
Commit 3bb297d · unverified · 1 Parent(s): 09d2732

Add files via upload

Files changed (1)
  1. tp3__1__1.py +501 -0
tp3__1__1.py ADDED
@@ -0,0 +1,501 @@
# -*- coding: utf-8 -*-
"""tp3__1_-1.ipynb

Automatically generated by Colaboratory.

Original file is located at
https://colab.research.google.com/drive/1_Sjx5G1BW689ggZJAJ4P7kCZndOobNCp
"""

# Install Gradio
!pip install gradio -q

# Install Timidity (MIDI-to-wav synthesis) and libsndfile
!sudo apt-get install -q -y timidity libsndfile1

# All the packages needed to deal with sound data
!pip install pydub numba==0.48 librosa music21

# Import libraries

import gradio as gr
import time

import tensorflow as tf
import tensorflow_hub as hub

import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa import display as librosadisplay

import logging
import math
import statistics
import sys

from IPython.display import Audio, Javascript
from scipy.io import wavfile

from base64 import b64decode

import music21
from pydub import AudioSegment

logger = logging.getLogger()
logger.setLevel(logging.ERROR)

#print("tensorflow: %s" % tf.__version__)
#print("librosa: %s" % librosa.__version__)

# The audio input file
# Now the hardest part: record your singing! :)

# There are two ways to provide an audio file:

# 1. Record audio directly in Gradio
# 2. Use the example file downloaded below (a C scale recorded with a metronome)

# Download the example file
INPUT_SOURCE = 'https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav'

!wget --no-check-certificate 'https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav' -O c-scale.wav

uploaded_file_name = 'c-scale.wav'

# Function that converts the user-provided audio to the format that the model
# expects: a 16 kHz sample rate and a single channel (mono).

EXPECTED_SAMPLE_RATE = 16000

def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
  audio = AudioSegment.from_file(user_file)
  audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
  audio.export(output_file, format="wav")
  return output_file

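# Optional sanity check (a minimal sketch, left commented out): the converted
# file should come back as 16 kHz mono, i.e. a one-dimensional sample array
# when read with scipy's wavfile.
#_rate, _data = wavfile.read(convert_audio_for_model(uploaded_file_name))
#assert _rate == EXPECTED_SAMPLE_RATE and _data.ndim == 1
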
MAX_ABS_INT16 = 32768.0

def plot_stft(x, sample_rate, show_black_and_white=False):
  x_stft = np.abs(librosa.stft(x, n_fft=2048))
  fig, ax = plt.subplots()
  fig.set_size_inches(20, 10)
  x_stft_db = librosa.amplitude_to_db(x_stft, ref=np.max)

  if show_black_and_white:
    librosadisplay.specshow(data=x_stft_db,
                            y_axis='log',
                            sr=sample_rate,
                            cmap='gray_r')
  else:
    librosadisplay.specshow(data=x_stft_db,
                            y_axis='log',
                            sr=sample_rate)

  plt.colorbar(format='%+2.0f dB')

  return fig

# Loading audio samples from the example wav file. The file is converted first
# so that its sample rate matches what the model expects.
converted_audio_file = convert_audio_for_model(uploaded_file_name)
sample_rate, audio_samples = wavfile.read(converted_audio_file)

fig = plot_stft(audio_samples / MAX_ABS_INT16, sample_rate=EXPECTED_SAMPLE_RATE)

# Executing the model
# Loading the SPICE model is easy:
model = hub.load("https://tfhub.dev/google/spice/2")

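# A minimal sketch of how the model is called below: the serving signature
# takes a 1-D float32 waveform and returns 'pitch' and 'uncertainty' tensors
# of equal length, one prediction per analysis frame (left commented out).
#_demo = model.signatures["serving_default"](
#    tf.constant(np.zeros(EXPECTED_SAMPLE_RATE, dtype=np.float32)))
#print(_demo["pitch"].shape, _demo["uncertainty"].shape)
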
def plot_pitch_conf(pitch_outputs, confidence_outputs):
  fig, ax = plt.subplots()
  fig.set_size_inches(20, 10)
  plt.plot(pitch_outputs, label='pitch')
  plt.plot(confidence_outputs, label='confidence')
  plt.legend(loc="lower right")
  return fig

def plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y):
  fig, ax = plt.subplots()
  fig.set_size_inches(20, 10)
  ax.set_ylim([0, 1])
  plt.scatter(confident_pitch_outputs_x, confident_pitch_outputs_y, c="r")
  return fig

def output2hz(pitch_output):
  # Constants taken from https://tfhub.dev/google/spice/2
  PT_OFFSET = 25.58
  PT_SLOPE = 63.07
  FMIN = 10.0
  BINS_PER_OCTAVE = 12.0
  cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
  return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)

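# A quick feel for the mapping above (approximate values computed from the
# constants): the model's [0, 1] pitch output spans roughly 44 Hz to 1.7 kHz.
#print("SPICE pitch range: %.1f Hz to %.1f Hz" % (output2hz(0.0), output2hz(1.0)))
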
def espectro_notas(audio_samples, sample_rate, confident_pitch_outputs_x, confident_pitch_values_hz):
  # audio_samples is expected to be already normalized to [-1, 1] by the caller.
  fig = plot_stft(audio_samples, sample_rate=sample_rate, show_black_and_white=True)
  # Note: conveniently, since the plot is in log scale, the pitch outputs
  # also get converted to the log scale automatically by matplotlib.
  plt.scatter(confident_pitch_outputs_x, confident_pitch_values_hz, c="r")
  return fig

# Constants for mapping frequencies to note names (A4 = 440 Hz as reference).
A4 = 440
C0 = A4 * pow(2, -4.75)
note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

def hz2offset(freq):
  # This measures the quantization error for a single note.
  if freq == 0:  # Rests always have zero error.
    return None
  # Quantized note.
  h = round(12 * math.log2(freq / C0))
  return 12 * math.log2(freq / C0) - h

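# Worked example (illustration only): 440 Hz lands exactly on A4, so its
# quantization offset is essentially zero.
#_h = round(12 * math.log2(440.0 / C0))                        # 57
#print(note_names[_h % 12] + str(_h // 12), hz2offset(440.0))  # A4, ~0.0
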
def quantize_predictions(group, ideal_offset):
  # Group values are either 0, or a pitch in Hz.
  non_zero_values = [v for v in group if v != 0]
  zero_values_count = len(group) - len(non_zero_values)

  # Create a rest if 80% is silent, otherwise create a note.
  if zero_values_count > 0.8 * len(group):
    # Interpret as a rest. Count each dropped note as an error, weighted a bit
    # worse than a badly sung note (which would 'cost' 0.5).
    return 0.51 * len(non_zero_values), "Rest"
  else:
    # Interpret as note, estimating as mean of non-rest predictions.
    h = round(
        statistics.mean([
            12 * math.log2(freq / C0) - ideal_offset for freq in non_zero_values
        ]))
    octave = h // 12
    n = h % 12
    note = note_names[n] + str(octave)
    # Quantization error is the total difference from the quantized note.
    error = sum([
        abs(12 * math.log2(freq / C0) - ideal_offset - h)
        for freq in non_zero_values
    ])
    return error, note

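# Tiny illustration of the two branches above (made-up inputs): an all-silent
# group becomes a Rest, while a group of confident 440 Hz frames becomes an A4
# with near-zero error.
#print(quantize_predictions([0, 0, 0, 0, 0], ideal_offset=0.0))  # (0.0, 'Rest')
#print(quantize_predictions([440.0] * 5, ideal_offset=0.0))      # (~0.0, 'A4')
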
def get_quantization_and_error(pitch_outputs_and_rests, predictions_per_eighth,
                               prediction_start_offset, ideal_offset):
  # Apply the start offset - we can just add the offset as rests.
  pitch_outputs_and_rests = [0] * prediction_start_offset + \
                            pitch_outputs_and_rests
  # Collect the predictions for each note (or rest).
  groups = [
      pitch_outputs_and_rests[i:i + predictions_per_eighth]
      for i in range(0, len(pitch_outputs_and_rests), predictions_per_eighth)
  ]

  quantization_error = 0

  notes_and_rests = []
  for group in groups:
    error, note_or_rest = quantize_predictions(group, ideal_offset)
    quantization_error += error
    notes_and_rests.append(note_or_rest)

  return quantization_error, notes_and_rests

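# Toy illustration (made-up values, one entry per model prediction): four
# 440 Hz frames followed by four silent frames quantize to one A4 and one Rest.
#print(get_quantization_and_error([440.0] * 4 + [0] * 4, 4, 0, 0.0))
# -> (~0.0, ['A4', 'Rest'])
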
def main(audio):

  # Preparing the audio data
  # Now that we have the audio, let's convert it to the expected format and
  # then listen to it!
  # The SPICE model needs as input an audio file at a sampling rate of 16 kHz
  # and with only one channel (mono).
  # To help with this part, the function `convert_audio_for_model` converts
  # any wav file to the model's expected format:

  # Converting to the expected format for the model.
  # For both input methods above, the uploaded file path arrives in `audio`.
  converted_audio_file = convert_audio_for_model(audio)

  # Loading audio samples from the wav file:
  sample_rate, audio_samples = wavfile.read(converted_audio_file)

  audio_samples = audio_samples / float(MAX_ABS_INT16)

  # We now feed the audio to the SPICE tf.hub model to obtain pitch and
  # uncertainty outputs as tensors.
  model_output = model.signatures["serving_default"](tf.constant(audio_samples, tf.float32))

  pitch_outputs = model_output["pitch"]
  uncertainty_outputs = model_output["uncertainty"]

  # 'Uncertainty' basically means the inverse of confidence.
  confidence_outputs = 1.0 - uncertainty_outputs

  confidence_outputs = list(confidence_outputs)
  pitch_outputs = [float(x) for x in pitch_outputs]

  indices = range(len(pitch_outputs))
  confident_pitch_outputs = [(i, p)
      for i, p, c in zip(indices, pitch_outputs, confidence_outputs) if c >= 0.9]
  confident_pitch_outputs_x, confident_pitch_outputs_y = zip(*confident_pitch_outputs)

  confident_pitch_values_hz = [output2hz(p) for p in confident_pitch_outputs_y]

  # Plot the waveform.
  fig1 = plt.figure()
  plt.plot(audio_samples)

  # Plot the spectrogram (samples are already normalized to [-1, 1] above).
  fig2 = plot_stft(audio_samples, sample_rate=EXPECTED_SAMPLE_RATE)

  # Plot pitch and confidence.
  fig3 = plot_pitch_conf(pitch_outputs, confidence_outputs)

  # Plot the confident pitch predictions.
  fig4 = plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y)

  # Plot spectrogram + notes.
  fig5 = espectro_notas(audio_samples, EXPECTED_SAMPLE_RATE, confident_pitch_outputs_x, confident_pitch_values_hz)

  # ############################################################################
  # Converting to musical notes ################################################

  # Now that we have the pitch values, let's convert them to notes!
  # This part is challenging by itself. We have to take into account two
  # things:
  # 1. the rests (when there's no singing)
  # 2. the size of each note (offsets)

  # ----------------------------------------------------------------------------
  ### 1: Adding zeros to the output to indicate when there's no singing

  pitch_outputs_and_rests = [
      output2hz(p) if c >= 0.9 else 0
      for i, p, c in zip(indices, pitch_outputs, confidence_outputs)
  ]

  # ----------------------------------------------------------------------------
  ### 2: Adding note offsets
  # When a person sings freely, the melody may have an offset to the absolute
  # pitch values that notes can represent.
  # Hence, to convert predictions to notes, one needs to correct for this
  # possible offset. This is what hz2offset (defined above) measures.

  # The ideal offset is the mean quantization error for all the notes
  # (excluding rests):
  offsets = [hz2offset(p) for p in pitch_outputs_and_rests if p != 0]
  #print("offsets: ", offsets)

  ideal_offset = statistics.mean(offsets)
  #print("ideal offset: ", ideal_offset)

  # We can now use some heuristics to try and estimate the most likely sequence
  # of notes that were sung.
  # The ideal offset computed above is one ingredient - but we also need to know
  # the speed (how many predictions make, say, an eighth?), and the time offset
  # to start quantizing. To keep it simple, we'll just try different speeds and
  # time offsets with get_quantization_and_error (defined above) and keep the
  # values that minimize the quantization error.

  best_error = float("inf")
  best_notes_and_rests = None
  best_predictions_per_note = None

  for predictions_per_note in range(20, 65, 1):
    for prediction_start_offset in range(predictions_per_note):

      error, notes_and_rests = get_quantization_and_error(
          pitch_outputs_and_rests, predictions_per_note,
          prediction_start_offset, ideal_offset)

      if error < best_error:
        best_error = error
        best_notes_and_rests = notes_and_rests
        best_predictions_per_note = predictions_per_note

  # At this point, best_notes_and_rests contains the best quantization.
  # Since we don't need to have rests at the beginning, let's remove these:
  while best_notes_and_rests[0] == 'Rest':
    best_notes_and_rests = best_notes_and_rests[1:]
  # Also remove silence at the end.
  while best_notes_and_rests[-1] == 'Rest':
    best_notes_and_rests = best_notes_and_rests[:-1]

  # ____________________________________________________________________________
  # Now let's write the quantized notes as a sheet music score!
  # To do it we will use two libraries: music21 (http://web.mit.edu/music21/)
  # and Open Sheet Music Display
  # (https://github.com/opensheetmusicdisplay/opensheetmusicdisplay).
  # Note: for simplicity, we assume here that all notes have the same duration
  # (a half note).

  # Creating the sheet music score.
  sc = music21.stream.Score()
  # Adjust the speed to match the actual singing.
  bpm = 60 * 60 / best_predictions_per_note
  #print('bpm: ', bpm)
  a = music21.tempo.MetronomeMark(number=bpm)
  sc.insert(0, a)

  for snote in best_notes_and_rests:
    d = 'half'
    if snote == 'Rest':
      sc.append(music21.note.Rest(type=d))
    else:
      sc.append(music21.note.Note(snote, type=d))

  # Helper functions that use Open Sheet Music Display (JS code) to show a
  # music score.
  from IPython.core.display import display, HTML, Javascript
  import json, random

  def showScore(score):
    xml = open(score.write('musicxml')).read()
    showMusicXML(xml)

  def showMusicXML(xml):
    DIV_ID = "OSMD_div"
    a = display(HTML('<div id="' + DIV_ID + '">loading OpenSheetMusicDisplay</div>'))
    script = """
    var div_id = {{DIV_ID}};
    function loadOSMD() {
      return new Promise(function(resolve, reject){
        if (window.opensheetmusicdisplay) {
          return resolve(window.opensheetmusicdisplay)
        }
        // OSMD script has a 'define' call which conflicts with requirejs
        var _define = window.define // save the define object
        window.define = undefined // now the loaded script will ignore requirejs
        var s = document.createElement( 'script' );
        s.setAttribute( 'src', "https://cdn.jsdelivr.net/npm/opensheetmusicdisplay@0.7.6/build/opensheetmusicdisplay.min.js" );
        //s.setAttribute( 'src', "/custom/opensheetmusicdisplay.js" );
        s.onload=function(){
          window.define = _define
          resolve(opensheetmusicdisplay);
        };
        document.body.appendChild( s ); // browser will try to load the new script tag
      })
    }
    loadOSMD().then((OSMD)=>{
      window.openSheetMusicDisplay = new OSMD.OpenSheetMusicDisplay(div_id, {
        drawingParameters: "compacttight"
      });
      openSheetMusicDisplay
        .load({{data}})
        .then(
          function() {
            openSheetMusicDisplay.render();
          }
        );
    })
    """.replace('{{DIV_ID}}', DIV_ID).replace('{{data}}', json.dumps(xml))
    #display(Javascript(script))
    return a

  # Rendering the music score.
  partitura = showScore(sc)
  #print(best_notes_and_rests)

  # ____________________________________________________________________________
  # Let's convert the music notes to a MIDI file and listen to it.
  # To create this file, we can use the stream we created before.

  # Saving the recognized musical notes as a MIDI file.
  converted_audio_file_as_midi = converted_audio_file[:-4] + '.mid'
  fp = sc.write('midi', fp=converted_audio_file_as_midi)

  wav_from_created_midi = converted_audio_file_as_midi.replace(' ', '_') + "_midioutput.wav"
  #print(wav_from_created_midi)

  # To listen to it on Colab, we need to convert it back to wav. An easy way of
  # doing that is using Timidity.
  !timidity $converted_audio_file_as_midi -Ow -o $wav_from_created_midi

  return converted_audio_file, fig1, fig2, fig3, fig4, fig5, bpm, best_notes_and_rests, partitura, wav_from_created_midi

iface = gr.Interface(
    fn=main,
    inputs=[gr.inputs.Audio(source="microphone", type="filepath", label="Input audio")],
    outputs=[gr.outputs.Audio(label="Original audio"),
             gr.outputs.Plot(type="auto", label="Waveform"),
             gr.outputs.Plot(type="auto", label="Spectrogram"),
             gr.outputs.Plot(type="auto", label="Pitch confidence"),
             gr.outputs.Plot(type="auto", label="Notes"),
             gr.outputs.Plot(type="auto", label="Spectrogram + notes"),
             gr.outputs.Textbox(label="bpm"),
             gr.outputs.Textbox(label="Score (notes)"),
             gr.outputs.Textbox(type="html", label="Score (HTML)"),
             gr.outputs.Audio(label="midi")],
    examples=[[uploaded_file_name]],
    interpretation="default",
)

iface.launch(debug=True)
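
# Optional: a temporary public link can also be requested via Gradio's share
# flag, e.g.:
#iface.launch(debug=True, share=True)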