import os os.system("python3 -m pip install -e .") import gradio as gr import note_seq from pytube import YouTube from pydub import AudioSegment from inferencemodel import InferenceModel from utils import upload_audio SAMPLE_RATE = 16000 SF2_PATH = "SGM-v2.01-Sal-Guit-Bass-V1.3.sf2" # Start inference model inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", "mt3") current_model = "mt3" def change_model(model): global current_model if model == current_model: return global inference_model inference_model = InferenceModel("/home/user/app/checkpoints/mt3/", model) current_model = model print("Inferece model", inference_model) # Credits https://huggingface.co/spaces/rajesh1729/youtube-video-transcription-with-whisper def get_audio(url): yt = YouTube(url) video = yt.streams.filter(only_audio=True).first() out_file = video.download(output_path=".") base, ext = os.path.splitext(out_file) new_file = base + ".wav" os.rename(out_file, new_file) a = new_file print("file a is:", a) wav_to_cut = AudioSegment.from_wav(a) # pydub does things in milliseconds ten_seconds = 10 * 1000 first_10_seconds = wav_to_cut[:ten_seconds] os.remove(new_file) return first_10_seconds # Credits https://huggingface.co/spaces/jeffistyping/Youtube-Whisperer def populate_metadata(link): yt = YouTube(link) audio = get_audio(link) return yt.thumbnail_url, yt.title, audio def inference(audio): with open(audio, "rb") as fd: contents = fd.read() audio = upload_audio(contents,sample_rate=16000) est_ns = inference_model(audio) note_seq.sequence_proto_to_midi_file(est_ns, "./transcribed.mid") return "./transcribed.mid" title = "Transcribe music from YouTube videos using Transformers." description = """ Gradio demo for Music Transcription with Transformers. Read more in the links below. """ article = "
MT3: Multi-Task Multitrack Music Transcription | Github Repo
" # Create a block object demo = gr.Blocks() # Use your Block object as a context with demo: gr.Markdown("