Spaces:
Build error
Build error
Commit
·
d9489e4
1
Parent(s):
7c17274
Add plotting function
Browse files
app.py
CHANGED
@@ -11,6 +11,9 @@ from pydub import AudioSegment
|
|
11 |
from inferencemodel import InferenceModel
|
12 |
from utils import upload_audio
|
13 |
|
|
|
|
|
|
|
14 |
SAMPLE_RATE = 16000
|
15 |
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"
|
16 |
|
@@ -20,13 +23,14 @@ current_model = "mt3"
|
|
20 |
|
21 |
def change_model(model):
|
22 |
global current_model
|
|
|
23 |
checkpoint_path = f"/home/user/app/checkpoints/{model}/"
|
24 |
if model == current_model:
|
25 |
return
|
26 |
-
global inference_model
|
27 |
inference_model = InferenceModel(checkpoint_path, model)
|
28 |
current_model = model
|
29 |
print("Inferece model", inference_model)
|
|
|
30 |
|
31 |
# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
|
32 |
def get_audio(url):
|
@@ -37,7 +41,6 @@ def get_audio(url):
|
|
37 |
new_file = base + ".wav"
|
38 |
os.rename(out_file, new_file)
|
39 |
a = new_file
|
40 |
-
print("file a is:", a)
|
41 |
wav_to_cut = AudioSegment.from_file(a)
|
42 |
# pydub does things in milliseconds
|
43 |
ten_seconds = 10 * 1000
|
@@ -53,17 +56,17 @@ def populate_metadata(link):
|
|
53 |
return yt.thumbnail_url, yt.title, audio
|
54 |
|
55 |
def inference(yt_audio):
|
56 |
-
with open(yt_audio, "rb") as fd:
|
57 |
contents = fd.read()
|
58 |
|
59 |
-
audio = upload_audio(contents,sample_rate=
|
60 |
|
61 |
est_ns = inference_model(audio)
|
62 |
|
63 |
note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")
|
64 |
note_sequence = note_seq.midi_to_note_sequence("./transcribed.mid")
|
65 |
synth = note_seq.midi_synth.fluidsynth
|
66 |
-
array_of_floats = synth(note_sequence, sample_rate=
|
67 |
int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
|
68 |
# piano_roll = create_image_from_note_sequence(note_sequence)
|
69 |
|
|
|
11 |
from inferencemodel import InferenceModel
|
12 |
from utils import upload_audio
|
13 |
|
14 |
+
import nest_asyncio
|
15 |
+
nest_asyncio.apply()
|
16 |
+
|
17 |
SAMPLE_RATE = 16000
|
18 |
SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2"
|
19 |
|
|
|
23 |
|
24 |
def change_model(model):
    """Switch the global transcription model, reloading it if needed.

    Args:
        model: Model identifier (e.g. "mt3"); also names the checkpoint
            subdirectory under /home/user/app/checkpoints/.

    Side effects:
        Rebinds the module globals ``inference_model`` and ``current_model``
        and logs the newly active model.
    """
    global current_model
    global inference_model
    # Guard first: skip the (expensive) model reload when the requested
    # model is already active.  The original also built checkpoint_path
    # before this check, doing string work on the no-op path.
    if model == current_model:
        return
    checkpoint_path = f"/home/user/app/checkpoints/{model}/"
    inference_model = InferenceModel(checkpoint_path, model)
    current_model = model
    # Fixed typo in log message: "Inferece" -> "Inference".
    print("Inference model", inference_model)
    print("Current model", current_model)
|
34 |
|
35 |
# Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper
|
36 |
def get_audio(url):
|
|
|
41 |
new_file = base + ".wav"
|
42 |
os.rename(out_file, new_file)
|
43 |
a = new_file
|
|
|
44 |
wav_to_cut = AudioSegment.from_file(a)
|
45 |
# pydub does things in milliseconds
|
46 |
ten_seconds = 10 * 1000
|
|
|
56 |
return yt.thumbnail_url, yt.title, audio
|
57 |
|
58 |
def inference(yt_audio):
|
59 |
+
with open(yt_audio[1], "rb") as fd:
|
60 |
contents = fd.read()
|
61 |
|
62 |
+
audio = upload_audio(contents,sample_rate=SAMPLE_RATE)
|
63 |
|
64 |
est_ns = inference_model(audio)
|
65 |
|
66 |
note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid")
|
67 |
note_sequence = note_seq.midi_to_note_sequence("./transcribed.mid")
|
68 |
synth = note_seq.midi_synth.fluidsynth
|
69 |
+
array_of_floats = synth(note_sequence, sample_rate=44100)
|
70 |
int16_data = note_seq.audio_io.float_samples_to_int16(array_of_floats)
|
71 |
# piano_roll = create_image_from_note_sequence(note_sequence)
|
72 |
|
utils.py
CHANGED
@@ -1,8 +1,14 @@
|
|
1 |
|
2 |
import tempfile
|
|
|
3 |
|
4 |
import librosa
|
5 |
|
|
|
|
|
|
|
|
|
|
|
6 |
class AudioIOReadError(BaseException): # pylint:disable=g-bad-exception-name
|
7 |
pass
|
8 |
|
@@ -51,4 +57,40 @@ def load_audio(audio_filename, sample_rate, duration=10):
|
|
51 |
y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True, duration=duration)
|
52 |
except Exception as e: # pylint: disable=broad-except
|
53 |
raise AudioIOReadError(e)
|
54 |
-
return y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
import tempfile
|
3 |
+
import collections
|
4 |
|
5 |
import librosa
|
6 |
|
7 |
+
import pandas as pd
|
8 |
+
import matplotlib.pyplot as plt
|
9 |
+
from matplotlib.patches import Rectangle
|
10 |
+
from PIL import Image
|
11 |
+
|
12 |
class AudioIOReadError(BaseException): # pylint:disable=g-bad-exception-name
    """Raised by load_audio when librosa fails to decode an audio file.

    NOTE(review): this derives from BaseException, not Exception, so a plain
    ``except Exception`` will NOT catch it.  The pylint suppression suggests
    this is inherited from upstream code on purpose — confirm before changing.
    """
    pass
|
14 |
|
|
|
57 |
y, unused_sr = librosa.load(audio_filename, sr=sample_rate, mono=True, duration=duration)
|
58 |
except Exception as e: # pylint: disable=broad-except
|
59 |
raise AudioIOReadError(e)
|
60 |
+
return y
|
61 |
+
|
62 |
+
# Generate piano_roll
|
63 |
+
def sequence_to_pandas_dataframe(sequence):
    """Tabulate the notes of a NoteSequence as a pandas DataFrame.

    Args:
        sequence: An object exposing ``.notes``, where each note has
            ``start_time``, ``end_time`` and ``pitch`` attributes.

    Returns:
        A DataFrame with columns "start_time", "end_time", "duration"
        and "pitch", one row per note.  An empty sequence yields an
        empty, column-less DataFrame.
    """
    # Column name -> value extractor; dict order fixes the column order.
    extractors = {
        "start_time": lambda note: note.start_time,
        "end_time": lambda note: note.end_time,
        "duration": lambda note: note.end_time - note.start_time,
        "pitch": lambda note: note.pitch,
    }
    # defaultdict keeps the empty-sequence behavior: no notes -> no columns.
    table = collections.defaultdict(list)
    for note in sequence.notes:
        for column, extract in extractors.items():
            table[column].append(extract(note))
    return pd.DataFrame(table)
|
72 |
+
|
73 |
+
def dataframe_to_pianoroll_img(df):
    """Draw a piano-roll figure from a note DataFrame.

    Args:
        df: DataFrame with "start_time", "duration" and "pitch" columns,
            as produced by sequence_to_pandas_dataframe.

    Returns:
        The Matplotlib Figure containing the piano roll (caller is
        responsible for closing it).
    """
    fig = plt.figure(figsize=(8, 5))
    ax = fig.add_subplot(111)
    # White (invisible) scatter seeds the axis limits so every note
    # rectangle added below falls inside the view.
    ax.scatter(df.start_time, df.pitch, c="white")
    for _, row in df.iterrows():
        # One rectangle per note: x = onset, width = duration, y ~ pitch.
        ax.add_patch(Rectangle((row["start_time"], row["pitch"] - 0.4),
                               row["duration"], 0.4, color="black"))
    # Label through the Axes handle instead of pyplot's implicit
    # "current axes" global state (plt.xlabel/plt.ylabel), so the labels
    # land on this figure even if other pyplot figures are created
    # concurrently.  Rendered output is unchanged.
    ax.set_xlabel('time (sec.)', fontsize=18)
    ax.set_ylabel('pitch (MIDI)', fontsize=16)
    return fig
|
82 |
+
|
83 |
+
def fig2img(fig):
    """Convert a Matplotlib figure to a PIL Image and return it."""
    import io  # local import, mirroring the original
    png_buffer = io.BytesIO()
    # Rasterize the figure as PNG into the in-memory buffer.
    fig.savefig(png_buffer, format="png")
    # Rewind so PIL reads from the start of the PNG data.
    png_buffer.seek(0)
    return Image.open(png_buffer)
|
91 |
+
|
92 |
+
def create_image_from_note_sequence(sequence):
    """Render a NoteSequence's notes as a piano-roll PIL Image.

    Pipeline: notes -> DataFrame -> Matplotlib piano roll -> PIL Image.
    """
    note_frame = sequence_to_pandas_dataframe(sequence)
    piano_roll_fig = dataframe_to_pianoroll_img(note_frame)
    return fig2img(piano_roll_fig)
|